├── .gitignore
├── Makefile
├── README.md
├── classifier.cpp
├── convolution.cpp
└── dnn.hpp

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CPP=g++

OPT?=-O3

CFLAGS= --std=c++11 -g -ggdb -gdwarf-3 $(OPT) -fsanitize=address
MODULE := conv1 conv2 class1 class2

.PHONY: all clean

all: $(MODULE)

HEADERS=dnn.hpp

# These tiling parameters are 100% arbitrary, and it may be advantageous to tune/remove/completely change them for GPU.
conv1: convolution.cpp $(HEADERS)
	$(CPP) $^ $(CFLAGS) -o $@ -DNx=224 -DNy=224 -DKx=3 -DKy=3 -DNi=64 -DNn=64 -DTii=32 -DTi=16 -DTnn=32 -DTn=16 -DTx=7 -DTy=7

conv2: convolution.cpp $(HEADERS)
	$(CPP) $^ $(CFLAGS) -o $@ -DNx=14 -DNy=14 -DKx=3 -DKy=3 -DNi=512 -DNn=512 -DTii=32 -DTi=16 -DTnn=32 -DTn=16 -DTx=2 -DTy=2

class1: classifier.cpp $(HEADERS)
	$(CPP) $^ $(CFLAGS) -o $@ -DNi=25088 -DNn=4096 -DTii=512 -DTi=64 -DTnn=32 -DTn=16

class2: classifier.cpp $(HEADERS)
	$(CPP) $^ $(CFLAGS) -o $@ -DNi=4096 -DNn=1024 -DTii=32 -DTi=32 -DTnn=32 -DTn=16

clean:
	@rm -f $(MODULE)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# fp-diannao

* This is an extremely simple C++ version of the classifier/convolution DNN kernels, using the data layout and implementation from the DianNao paper:
http://novel.ict.ac.cn/ychen/pdf/DianNao.pdf

* The data is completely made up, but that shouldn't matter for dense kernels. The datatype is currently set to fp32; see VTYPE in dnn.hpp.

* Currently, the Makefile is configured for example layers from VGG16:
https://arxiv.org/abs/1409.1556
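
* To build and run (a minimal sketch of the workflow, assuming a g++ toolchain with C++11 and AddressSanitizer support, since the Makefile enables `-fsanitize=address`):

```sh
make           # builds all four kernels: conv1, conv2, class1, class2
./class1       # classifier kernel, first example layer (runs simple + blocked versions and compares them)
./conv1        # convolution kernel, first example layer (runs simple + blocked versions and compares them)
make clean     # removes the built binaries
```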

--------------------------------------------------------------------------------
/classifier.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include "dnn.hpp"

using namespace std;

// Define the parameters if not defined externally
#ifndef Nn
#define Nn 128  // Number of output neurons
#define Ni 224  // Number of input neurons
#endif

#ifndef Tii
// Tiling Sizes
#define Tnn 32
#define Tii 32
//#define Tn 5
//#define Ti 25
#define Tn 16
#define Ti 16
#endif

// Arrays:
VTYPE synapse[Nn][Ni] __attribute__((aligned(64)));
VTYPE neuron_i[Ni] __attribute__((aligned(64)));
VTYPE neuron_n[Nn] __attribute__((aligned(64))), neuron_n2[Nn] __attribute__((aligned(64)));

void fill_classifier(VTYPE (&synapse)[Nn][Ni], VTYPE (&neuron_i)[Ni],
                     VTYPE (&neuron_n)[Nn], VTYPE (&neuron_n2)[Nn]) {
  for(int n = 0; n < Nn; ++n) {
    for(int i = 0; i < Ni; ++i) {
      synapse[n][i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
    }
  }
  for(int i = 0; i < Ni; ++i) {
    neuron_i[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
  }
  for(int n = 0; n < Nn; ++n) {
    neuron_n[n] = 0;
    neuron_n2[n] = 0;
  }
}

// Dense (fully connected) layer: neuron_n = transfer(synapse * neuron_i)
void classifier_layer(VTYPE (&synapse)[Nn][Ni], VTYPE (&neuron_i)[Ni], VTYPE (&neuron_n)[Nn]) {
  for (int n = 0; n < Nn; n++) {
    VTYPE temp = 0;
    for (int i = 0; i < Ni; i++) {
      temp += synapse[n][i] * neuron_i[i];
    }
    neuron_n[n] = transfer(temp);
  }
}

// Same computation, blocked over output neurons (Tnn/Tn) and input neurons (Tii/Ti)
void classifier_layer_blocked(VTYPE (&synapse)[Nn][Ni], VTYPE (&neuron_i)[Ni],
                              VTYPE (&neuron_n)[Nn]) {
  VTYPE sum[Nn] = {0};
  for (int nnn = 0; nnn < Nn; nnn += Tnn) {   // tiling for output neurons
    for (int iii = 0; iii < Ni; iii += Tii) { // tiling for input neurons
      for (int nn = nnn; nn < nnn + Tnn; nn += Tn) {
        for (int ii = iii; ii < iii + Tii; ii += Ti) {
          // — Original code —
          for (int n = nn; n < nn + Tn; n++) {
            VTYPE sum_sc = 0;
            for (int i = ii; i < ii + Ti; i++) {
              sum_sc += (synapse[n][i] * neuron_i[i]);
            }
            sum[n] += sum_sc;
          }
        }
      }
    }
    for (int nn = nnn; nn < nnn + Tnn; nn++) {
      neuron_n[nn] = transfer(sum[nn]);
    }
  }
}

int main(int argc, char** argv) {
  cout << "initializing arrays\n";

  fill_classifier(synapse, neuron_i, neuron_n, neuron_n2);

  cout << "starting computation\n";

  begin_roi();
  classifier_layer(synapse, neuron_i, neuron_n);
  end_roi();

  cout << "simple version complete!\n";

  begin_roi();
  classifier_layer_blocked(synapse, neuron_i, neuron_n2);
  end_roi();

  cout << "blocked computation complete!\n";

  compare(neuron_n, neuron_n2, Nn);
}

--------------------------------------------------------------------------------
/convolution.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>
#include "dnn.hpp"

using namespace std;

// Define the parameters if not defined externally
#ifndef Sy
#define Sy 1
#define Sx 1
#endif

#ifndef Tnn
// Tiling Sizes
#define Tnn 32
#define Tn 16
#define Ti 16

#define Ty 8
#define Tx 8
#endif

// Padded input dimensions and strided output dimensions
#define NYPAD (Ny+Ky)
#define NXPAD (Nx+Kx)

#define NYSCL (Ny/Sy)
#define NXSCL (Nx/Sx)

#define SYNAPSE_SIZE (1L*Ky*Kx*Nn*Ni)

VTYPE (*synapse)[Ky][Kx][Nn][Ni];

VTYPE (*neuron_i)[NYPAD][NXPAD][Ni];
VTYPE (*neuron_n)[NYSCL][NXSCL][Nn];
VTYPE (*neuron_n2)[NYSCL][NXSCL][Nn];

void fill_convolution_shared_simple(VTYPE (&synapse)[Ky][Kx][Nn][Ni],
                                    VTYPE (&neuron_i)[NYPAD][NXPAD][Ni]) {
  for(int yy = 0; yy < Ky; ++yy) {
    for(int xx = 0; xx < Kx; ++xx) {
      for(int nn = 0; nn < Nn; ++nn) {
        for(int ni = 0; ni < Ni; ++ni) {
          synapse[yy][xx][nn][ni] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
        } } } }
  for(int yy = 0; yy < NYPAD; ++yy) {
    for(int xx = 0; xx < NXPAD; ++xx) {
      for(int ni = 0; ni < Ni; ++ni) {
        neuron_i[yy][xx][ni] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
      } } }
}

void convolution_layer_blocked(
    VTYPE (&synapse)[Ky][Kx][Nn][Ni],
    VTYPE (&neuron_i)[NYPAD][NXPAD][Ni],
    VTYPE (&neuron_n)[NYSCL][NXSCL][Nn]) {
  VTYPE sum[Nn] = {0};

  for (int yy = 0; yy < Ny; yy += Ty) {
    for (int xx = 0; xx < Nx; xx += Tx) {
      for (int nnn = 0; nnn < Nn; nnn += Tnn) {
        int yout = yy/Sy;
        for (int y = yy; y < yy + Ty; y += Sy) { // tiling for y
          int xout = xx/Sx;

          for (int x = xx; x < xx + Tx; x += Sx) { // tiling for x

            for (int nn = nnn; nn < nnn + Tnn; nn += Tn) {
              for (int n = nn; n < nn + Tn; n++) {
                sum[n] = 0;
              }

              for (int ky = 0; ky < Ky; ky++) { // sliding window
                for (int kx = 0; kx < Kx; kx++) {

                  int ii = 0;
                  VTYPE sum_sc;

                  for (; ii < Ni - Ti + 1; ii += Ti) {
                    for (int n = nn; n < nn + Tn; n++) {
                      sum_sc = 0;
                      for (int i = ii; i < ii + Ti; i++) {
                        VTYPE sv = synapse[ky][kx][n][i];
                        VTYPE nv = neuron_i[ky + y][kx + x][i];
                        sum_sc += sv*nv;
                      }
                      sum[n] += sum_sc;
                    }
                  }
                }
              }

              // transfer
              for (int n = nn; n < nn + Tn; n++) {
                neuron_n[yout][xout][n] = transfer(sum[n]);
              }
            }
            xout++;
          }
          yout++;
        }
      }
    }
  }
}

void convolution_layer(VTYPE (&synapse)[Ky][Kx][Nn][Ni],
                       VTYPE (&neuron_i)[NYPAD][NXPAD][Ni],
                       VTYPE (&neuron_n)[NYSCL][NXSCL][Nn]) {
  VTYPE sum[Nn] = {0};

  // — Original code — (excluding nn, ii loops)
  int yout = 0;
  for (int y = 0; y < Ny; y += Sy) {   // loop over output y
    int xout = 0;
    for (int x = 0; x < Nx; x += Sx) { // loop over output x
      for (int nn = 0; nn < Nn; nn += Tn) {
        for (int n = nn; n < nn + Tn; n++) {
          sum[n] = 0;
        }

        // sliding window
        for (int ky = 0; ky < Ky; ky++)
          for (int kx = 0; kx < Kx; kx++)
            for (int n = nn; n < nn + Tn; n++)
              for (int i = 0; i < Ni; i++) {
                VTYPE sv = synapse[ky][kx][n][i];
                VTYPE nv = neuron_i[ky + y][kx + x][i];
                sum[n] += sv*nv;
              }
        for (int n = nn; n < nn + Tn; n++) {
          neuron_n[yout][xout][n] = transfer(sum[n]);
        }
      }
      xout++;
    }
    yout++;
  }
}

int main(const int argc, const char** argv) {
  synapse  = (VTYPE (*)[Ky][Kx][Nn][Ni]) aligned_malloc(64, SYNAPSE_SIZE*sizeof(VTYPE));
  neuron_i = (VTYPE (*)[NYPAD][NXPAD][Ni]) aligned_malloc(64, NYPAD*NXPAD*Ni*sizeof(VTYPE));
  neuron_n = (VTYPE (*)[NYSCL][NXSCL][Nn]) aligned_malloc(64, NYSCL*NXSCL*Nn*sizeof(VTYPE));
  neuron_n2 = (VTYPE (*)[NYSCL][NXSCL][Nn]) aligned_malloc(64, NYSCL*NXSCL*Nn*sizeof(VTYPE));

  cout << "initializing arrays\n";

  fill_convolution_shared_simple(*synapse, *neuron_i);

  cout << "starting computation\n";

  // Simple Version
  begin_roi();
  convolution_layer(*synapse, *neuron_i, *neuron_n);
  end_roi();

  cout << "simple version complete!\n";

  // Blocked Version
  begin_roi();
  convolution_layer_blocked(*synapse, *neuron_i, *neuron_n2);
  end_roi();

  cout << "blocked computation complete!\n";

  compare((VTYPE*)*neuron_n, (VTYPE*)*neuron_n2, NYSCL*NXSCL*Nn);
}

--------------------------------------------------------------------------------
/dnn.hpp:
--------------------------------------------------------------------------------
#ifndef DNN_H
#define DNN_H

#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <sys/time.h>

#define VTYPE float

static __inline__ uint64_t gettime(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (((uint64_t)tv.tv_sec) * 1000000 + ((uint64_t)tv.tv_usec));
}

static uint64_t usec;

__attribute__ ((noinline)) void begin_roi() {
  usec = gettime();
}

__attribute__ ((noinline)) void end_roi() {
  usec = (gettime() - usec);
  std::cout << "elapsed (sec): " << usec/1000000.0 << "\n";
}

// Leaky-ReLU-style transfer function: negative inputs are scaled by 1/4
VTYPE transfer(VTYPE i) {
  return (i > 0) ? i : i/4;
}

void compare(VTYPE* neuron1, VTYPE* neuron2, int size) {
  bool error = false;
  for(int i = 0; i < size; ++i) {
    VTYPE diff = neuron1[i] - neuron2[i];
    if(diff > 0.001f || diff < -0.001f) {
      error = true;
      break;
    }
  }
  if(error) {
    for(int i = 0; i < size; ++i) {
      std::cout << i << " " << neuron1[i] << ":" << neuron2[i];

      VTYPE diff = neuron1[i] - neuron2[i];
      if(diff > 0.001f || diff < -0.001f) {
        std::cout << " \t\tERROR";
      }
      std::cout << "\n";
    }
  } else {
    std::cout << "results match\n";
  }
}

// Over-allocates by `align` bytes and rounds the pointer up to the requested
// alignment; note the returned pointer cannot be passed to free().
void* aligned_malloc(uint64_t align, uint64_t bytes) {
  size_t mask = (align-1) ^ ((size_t)-1);
  char* ptr = (((char*)malloc(bytes + align)) + align);
  ptr = (char*) (((size_t)ptr) & mask);
  return (void*) ptr;
}

#endif

--------------------------------------------------------------------------------