├── makefile
├── LICENSE
├── wyhash32.h
├── tlfn.hpp
├── wymlp.hpp
├── mnist.cpp
├── train.cpp
├── wymlp1.hpp
├── wymlp256.hpp
├── README.md
├── wyhash.h
├── sgemm256.hpp
├── avx_mathfun.h
└── sgemm.hpp

/makefile:
--------------------------------------------------------------------------------
1 | train: train.cpp wymlp.hpp
2 | 	g++ train.cpp -o train -Ofast -Wall -march=native
3 |
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /wyhash32.h: -------------------------------------------------------------------------------- 1 | // Author: Wang Yi 2 | #include 3 | #include 4 | #ifndef WYHASH32_BIG_ENDIAN 5 | static inline unsigned _wyr32(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return v;} 6 | #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) 7 | static inline unsigned _wyr32(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return __builtin_bswap32(v);} 8 | #elif defined(_MSC_VER) 9 | static inline unsigned _wyr32(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return _byteswap_ulong(v);} 10 | #endif 11 | static inline unsigned _wyr24(const uint8_t *p, unsigned k) { return (((unsigned)p[0])<<16)|(((unsigned)p[k>>1])<<8)|p[k-1];} 12 | static inline void _wymix32(unsigned *A, unsigned *B){ 13 | uint64_t c=*A^0x53c5ca59u; c*=*B^0x74743c1bu; 14 | *A=(unsigned)c; 15 | *B=(unsigned)(c>>32); 16 | } 17 | static inline unsigned wyhash32(const void *key, uint64_t len, unsigned seed) { 18 | const uint8_t *p=(const uint8_t *)key; uint64_t i=len; 19 | unsigned see1=(unsigned)len; seed^=(unsigned)(len>>32); _wymix32(&seed, &see1); 20 | for(;i>8;i-=8,p+=8){ seed^=_wyr32(p); see1^=_wyr32(p+4); _wymix32(&seed, &see1); } 21 | if(i>=4){ seed^=_wyr32(p); see1^=_wyr32(p+i-4); } else if (i) seed^=_wyr24(p,i); 22 | _wymix32(&seed, &see1); _wymix32(&seed, &see1); return seed^see1; 23 | } 24 | static inline uint64_t wyrand(uint64_t *seed){ 25 | *seed+=0xa0761d6478bd642full; 26 | uint64_t see1=*seed^0xe7037ed1a0b428dbull; 27 | see1*=(see1>>32)|(see1<<32); 28 | return (*seed*((*seed>>32)|(*seed<<32)))^((see1>>32)|(see1<<32)); 29 | } 30 | static inline unsigned wy32x32(unsigned a, unsigned b) { _wymix32(&a,&b); _wymix32(&a,&b); return a^b; } 31 | static inline float wy2u01(unsigned r) { const float _wynorm=1.0f/(1ull<<23); return (r>>9)*_wynorm;} 32 | static inline float wy2gau(unsigned r) { const float _wynorm=1.0f/(1ull<<9); return ((r&0x3ff)+((r>>10)&0x3ff)+((r>>20)&0x3ff))*_wynorm-3.0f;} 33 | -------------------------------------------------------------------------------- /tlfn.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | template 5 | struct tlfn{ 6 | float acti(float x) { return x/(1+(x>0?x:-x)); } 7 | float grad(float x) { x=1-(x>0?x:-x); return x*x; } 8 | float *weight; 9 | tlfn(){ 10 | size_t size=(input+1)*hidden+hidden*hidden+output*hidden; 11 | weight=(float*)aligned_alloc(64,size*sizeof(float)); 12 | for(size_t i=0; i 2 | static inline float wymlp_activate(float x) { return x/(1+fabsf(x)); } 3 | static inline float wymlp_gradient(float x) { x=1-fabsf(x); return x*x; } 4 | static inline unsigned wymlp_size(unsigned input, unsigned hidden, unsigned depth, unsigned output){ 5 | return (input+1)*hidden+depth*hidden*hidden+output*hidden; 6 | } 7 | template 8 | static inline void wymlp(float *weight, float *x, float *y, float eta){ 9 | float a[2*depth*hidden+output]={},*o=a+2*depth*hidden,wh=1/sqrtf(hidden),wi=1/sqrtf(input+1),s,*w,*p,*q,*g,*h; 10 | unsigned i, j, l; 11 | for(i=0; i<=input; i++){ 12 | s=iy[i]?1:-1)*wh*eta; w=weight+(input+1)*hidden+(depth-1)*hidden*hidden+i*hidden; 34 | for(j=0; j 3 | #include 4 | #include "wymlp.hpp" 5 | #include 6 | #include 7 | using namespace std; 8 | const unsigned feature=784; 9 | wymlp<128,2,10,2> model; 10 | 11 | bool load_image(const char *F, 
vector &D, unsigned N){ 12 | gzFile in=gzopen(F, "rb"); 13 | if(in==Z_NULL) return false; 14 | unsigned n; gzread(in, &n, 4); gzread(in, &n, 4); gzread(in, &n, 4); gzread(in, &n, 4); 15 | D.resize(N*feature); vector temp(feature); 16 | for(size_t i=0; i &D, unsigned N){ 24 | gzFile in=gzopen(F, "rb"); 25 | if(in==Z_NULL) return false; 26 | unsigned n; gzread(in, &n, 4); gzread(in, &n, 4); D.resize(N); uint8_t temp; 27 | for(size_t i=0; i trainx, trainy, testx, testy; unsigned trainn=60000, testn=10000; 40 | if(!load_image("train-images-idx3-ubyte.gz", trainx, trainn)) return 0; 41 | if(!load_image("t10k-images-idx3-ubyte.gz", testx, testn)) return 0; 42 | if(!load_label("train-labels-idx1-ubyte.gz", trainy, trainn)) return 0; 43 | if(!load_label("t10k-labels-idx1-ubyte.gz", testy, testn)) return 0; 44 | double sx=0, sxx=0, sn=(trainn+testn)*feature; 45 | for(size_t i=0; i0.001; it++,eta*=0.97){ 54 | timeval beg, end; gettimeofday(&beg,NULL); 55 | for(size_t i=0; ip[pre]) pre=j; 65 | err+=pre!=testy[i]; 66 | } 67 | cerr.precision(3); cerr.setf(ios::fixed); t0+=(end.tv_sec-beg.tv_sec)+1e-6*(end.tv_usec-beg.tv_usec); 68 | cerr< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "wyhash.h" 8 | #include "tlfn.hpp" 9 | #include 10 | using namespace std; 11 | const unsigned fullbatch=1<<20; 12 | const uint64_t input=12; 13 | const uint64_t hidden=32; 14 | const uint64_t output=1; 15 | const unsigned size=tlfn_size(input,hidden,output); 16 | 17 | bool load_matrix(const char *F, vector &M, unsigned &R, unsigned &C) { 18 | ifstream fi(F); 19 | if(!fi) { cerr<<"fail to open "<=0) { 53 | switch(opt) { 54 | case 'e': learning_rate=atof(optarg); break; 55 | case 'n': epoches=atoi(optarg); break; 56 | default: document(); 57 | } 58 | } 59 | if(ac xmat, ymat, data; unsigned sample, sample1, xsize, ysize, feature; 62 | if(!load_matrix(av[optind], xmat, sample, xsize)) return 0; 63 | if(!load_matrix(av[optind+1], ymat, sample1, ysize)) return 0; 64 | if(sample!=sample1) return 0; 65 | feature=ysize+xsize; 66 | data.resize(sample*feature); 67 | for(size_t i=0; i().swap(xmat); vector().swap(ymat); 72 | vector mean(feature), prec(feature); 73 | for(size_t j=0; j0?1/sxx:0; 77 | for(size_t i=0; i trte; for(size_t i=0; i(input,model,data.data()+ran*feature+ysize,data.data()+ran*feature,learning_rate); 87 | } 88 | double loss=0, n=0; 89 | for(size_t i=0; i(input,model,data.data()+i*feature+ysize,&h,-1); 92 | loss+=(h-t)*(h-t); n+=1; 93 | } 94 | cerr< 3 | static inline float wymlp_act(float x){ return (x/(1+(((int)(x>0)<<1)-1)*x)); } 4 | static inline float wymlp_gra(float x){ return ((1-(((int)(x>0)<<1)-1)*x)*(1-(((int)(x>0)<<1)-1)*x)); } 5 | template 6 | double wymlp(type *weight, type *x, type *y, type eta, uint64_t seed, double dropout) { 7 | #ifdef WYMLP_RNN 8 | if(weight==NULL) return (input+1)*hidden+hidden*hidden+output*hidden; 9 | #define woff(i,l) (l?(l=drop); 17 | for(unsigned j=0; jm) m=o[i]; 33 | for(unsigned i=0; i=drop); 48 | for(unsigned j=0; j 54 | 55 | Example: 56 | int main(void){ 57 | float x[4]={1,2,3,5}, y[1]={2}; 58 | vector weight(wymlp(NULL,NULL,NULL,0,0,-1)); //set dropout<0 to return size 59 | for(size_t i=0; i(weight.data(), x, y, 0.1, wygrand(), 0.5); // training. set eta>0 to train 63 | wymlp(weight.data(), x, y, -1, wygrand(), 0.5); // training. set eta<0 to predict 64 | } 65 | return 0; 66 | } 67 | 68 | Comments: 69 | 0: task=0: regression; task=1: logistic; task=2: softmax 70 | 1: dropout<0 lead to size() function 71 | 2: eta<0 lead to prediction only. 
72 | 3: The expected |X[i]|, |Y[i]| should be around 1. Normalize yor input and output first. 73 | 4: In practice, it is OK to call model function parallelly with multi-threads, however, they may be slower for small net. 74 | 5: The code is portable, however, if Ofast is used on X86, SSE or AVX or even AVX512 will enable very fast code! 75 | 6: The default and suggested model is shared hidden-hidden weights. If you want vanilla MLP, use the following code 76 | if(weight==NULL) return (input+1)*hidden+(depth-1)*hidden*hidden+output*hidden; 77 | #define woff(i,l) (l?(input+1)*hidden+(l-1)*hidden*hidden+i*hidden:i*hidden) 78 | 79 | */ 80 | -------------------------------------------------------------------------------- /wymlp256.hpp: -------------------------------------------------------------------------------- 1 | #include "sgemm256.hpp" 2 | #include 3 | #include 4 | #include "wyhash.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | template 13 | class wymlp{ 14 | private: 15 | int fd; 16 | struct stat sb; 17 | float act(float x){ return x>1?1:(x<-1?-1:x); } 18 | float gra(float x){ return x>=1||x<=-1?0:1; } 19 | unsigned woff(unsigned i, unsigned l){ return (l?input*hidden+(l==depth)*hidden*hidden+i*hidden:i*input); } 20 | public: 21 | float *weight; 22 | wymlp(){ weight=NULL; } 23 | const uint64_t size(void){ return input*hidden+hidden*hidden+output*hidden; } 24 | void alloc_weight(void){ free(weight); weight=(float*)aligned_alloc(64,size()*sizeof(float)); } 25 | void free_weight(void){ free(weight); weight=NULL; } 26 | void init_weight(uint64_t seed){ for(size_t i=0; i(wi,weight,inp,a); 63 | for(unsigned b=0; b(wh,weight+woff(0,l),aoff(0,l-1),aoff(0,l)); 70 | for(unsigned b=0; b(wh,weight+woff(0,depth),aoff(0,depth-1),o); 77 | for(unsigned b=0; b(1,weight+woff(0,depth),o,doff(0,depth-1)); 90 | sgemm<0,1,hidden,output,batch,hidden,output,hidden,1>(-1,aoff(0,depth-1),o,weight+woff(0,depth)); 91 | for(unsigned l=depth-1; l; l--) { 92 | for(unsigned b=0; b(1,weight+woff(0,l),doff(0,l),doff(0,l-1)); 97 | sgemm<0,1,hidden,hidden,batch,hidden,hidden,hidden,1>(-1,aoff(0,l-1),doff(0,l),weight+woff(0,l)); 98 | } 99 | for(unsigned b=0; b(-1,inp,d,weight); 104 | free(a); 105 | return ret; 106 | } 107 | }; 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Intelligence for Every Machine 2 | Tiny fast portable real-time deep neural network for regression and classification within 50 LOC. 3 | 4 | ## Benchmark 5 | Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz Single Thread @ VirtualBox 6.0 6 | 7 | -Ofast -mavx2 -mfma 8 | 9 | Speed Measure: Sample Per Second 10 | 11 | |Hidden&Depth|float_training|float_inference|double_training|double_inference| 12 | |----|----|----|----|----| 13 | |16H16L(scalar)|141,958|397,894|140,921|368,760| 14 | |32H16L|155,054|297,814|96,204 |208,723| 15 | |64H16L|47,253|95,647|24,794|52,018| 16 | |128H16L|11,469|25,124|4,950|11,696| 17 | |256H16L|2,356|5,502|1,260|2,725| 18 | |512H16L|610|1,246|312|648| 19 | 20 | ## Code Example: 21 | 22 | ```C++ 23 | int main(void){ 24 | float x[4]={1,2,3,5}, y[1]={2}; 25 | vector weight(wymlp(NULL,NULL,NULL,0,0,0)); //set weight==NULL to return size 26 | for(size_t i=0; i(weight.data(), x, y, 0.1, wygrand(), 0.5); // training. set eta>0 to train 30 | wymlp(weight.data(), x, y, -1, wygrand(), 0.5); // prediction. 
set eta<0 to predict 31 | } 32 | return 0; 33 | } 34 | ``` 35 | Comments: 36 | 37 | 0: loss=0: regression; loss=1: logistic; loss=2: softmax 38 | 39 | 1: dropout<0 lead to size() function 40 | 41 | 2: eta<0 lead to prediction only. 42 | 43 | 3: The expected |X[i]|, |Y[i]| should be around 1. Normalize yor input and output first. 44 | 45 | 4: In practice, it is OK to call model function parallelly with multi-threads, however, they may be slower for small net. 46 | 47 | 5: The code is portable, however, if Ofast is used on X86, SSE or AVX or even AVX512 will enable very fast code! 48 | 49 | 6: The default and suggested model is shared hidden-hidden weights. If you want vanilla MLP, define VanillaMLP 50 | 51 | 52 | ## MNIST test error of 128H2L with single CPU thread: 53 | 54 | ``` 55 | 0 error=8.240% eta=0.300 time=2.058s 56 | 1 error=6.720% eta=0.297 time=4.081s 57 | 2 error=5.660% eta=0.294 time=6.112s 58 | 3 error=5.200% eta=0.291 time=8.137s 59 | 4 error=4.330% eta=0.288 time=10.154s 60 | 5 error=4.050% eta=0.285 time=12.184s 61 | 6 error=3.740% eta=0.282 time=14.206s 62 | 7 error=3.360% eta=0.280 time=16.220s 63 | 8 error=3.550% eta=0.277 time=18.274s 64 | 9 error=3.170% eta=0.274 time=20.350s 65 | 10 error=2.980% eta=0.271 time=22.373s 66 | 11 error=3.080% eta=0.269 time=24.452s 67 | 12 error=2.670% eta=0.266 time=26.561s 68 | 13 error=2.880% eta=0.263 time=28.622s 69 | 14 error=2.430% eta=0.261 time=30.712s 70 | 15 error=2.660% eta=0.258 time=32.788s 71 | 16 error=2.330% eta=0.255 time=34.841s 72 | 17 error=2.420% eta=0.253 time=36.871s 73 | 18 error=2.470% eta=0.250 time=38.931s 74 | 19 error=2.160% eta=0.248 time=40.973s 75 | 20 error=2.180% eta=0.245 time=43.015s 76 | 21 error=2.220% eta=0.243 time=45.082s 77 | 22 error=2.240% eta=0.240 time=47.163s 78 | 23 error=2.310% eta=0.238 time=49.222s 79 | 24 error=2.100% eta=0.236 time=51.292s 80 | 25 error=2.110% eta=0.233 time=53.365s 81 | 26 error=2.050% eta=0.231 time=55.427s 82 | 27 error=2.060% eta=0.229 time=57.483s 83 | 28 error=2.160% eta=0.226 time=59.564s 84 | 29 error=2.110% eta=0.224 time=61.624s 85 | 30 error=2.060% eta=0.222 time=63.675s 86 | 31 error=2.010% eta=0.220 time=65.721s 87 | 32 error=2.000% eta=0.217 time=67.772s 88 | 33 error=1.960% eta=0.215 time=69.810s 89 | 34 error=1.900% eta=0.213 time=71.860s 90 | 35 error=2.040% eta=0.211 time=73.899s 91 | 36 error=1.960% eta=0.209 time=75.955s 92 | 37 error=1.880% eta=0.207 time=78.036s 93 | 38 error=1.980% eta=0.205 time=80.074s 94 | 39 error=1.840% eta=0.203 time=82.146s 95 | 40 error=1.910% eta=0.201 time=84.217s 96 | 41 error=1.900% eta=0.199 time=86.276s 97 | 42 error=1.870% eta=0.197 time=88.361s 98 | 43 error=1.810% eta=0.195 time=90.478s 99 | 44 error=1.720% eta=0.193 time=92.610s 100 | 45 error=1.860% eta=0.191 time=94.665s 101 | 46 error=1.820% eta=0.189 time=96.767s 102 | 47 error=1.760% eta=0.187 time=98.826s 103 | 48 error=1.870% eta=0.185 time=100.898s 104 | 49 error=2.000% eta=0.183 time=102.945s 105 | 50 error=1.680% eta=0.182 time=104.979s 106 | 51 error=1.750% eta=0.180 time=107.059s 107 | 52 error=1.710% eta=0.178 time=109.127s 108 | 53 error=1.790% eta=0.176 time=111.144s 109 | 54 error=1.870% eta=0.174 time=113.179s 110 | 55 error=1.720% eta=0.173 time=115.232s 111 | 56 error=1.770% eta=0.171 time=117.281s 112 | 57 error=1.690% eta=0.169 time=119.326s 113 | 58 error=1.770% eta=0.167 time=121.387s 114 | 59 error=1.740% eta=0.166 time=123.436s 115 | 60 error=1.740% eta=0.164 time=125.487s 116 | 61 error=1.870% eta=0.163 time=127.522s 117 | 62 
error=1.790% eta=0.161 time=129.578s 118 | 63 error=1.730% eta=0.159 time=131.622s 119 | 64 error=1.760% eta=0.158 time=133.669s 120 | 65 error=1.790% eta=0.156 time=135.743s 121 | 66 error=1.640% eta=0.155 time=137.811s 122 | 67 error=1.640% eta=0.153 time=139.856s 123 | 68 error=1.610% eta=0.151 time=141.894s 124 | 69 error=1.600% eta=0.150 time=143.962s 125 | 70 error=1.680% eta=0.148 time=145.992s 126 | 71 error=1.590% eta=0.147 time=148.026s 127 | 128 | ``` 129 | 130 | 131 | -------------------------------------------------------------------------------- /wyhash.h: -------------------------------------------------------------------------------- 1 | // This is free and unencumbered software released into the public domain under The Unlicense (http://unlicense.org/) 2 | // main repo: https://github.com/wangyi-fudan/wyhash 3 | // author: 王一 Wang Yi 4 | // contributors: Reini Urban, Dietrich Epp, Joshua Haberman, Tommy Ettinger, Daniel Lemire, Otmar Ertl, cocowalla, leo-yuriev, Diego Barrios Romero, paulie-g, dumblob, Yann Collet, ivte-ms, hyb, James Z.M. Gao, easyaspi314 (Devin), TheOneric 5 | 6 | /* quick example: 7 | string s="fjsakfdsjkf"; 8 | uint64_t hash=wyhash(s.c_str(), s.size(), 0, _wyp); 9 | */ 10 | 11 | #ifndef wyhash_final_version_3 12 | #define wyhash_final_version_3 13 | 14 | #ifndef WYHASH_CONDOM 15 | //protections that produce different results: 16 | //1: normal valid behavior 17 | //2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication" 18 | #define WYHASH_CONDOM 1 19 | #endif 20 | 21 | #ifndef WYHASH_32BIT_MUM 22 | //0: normal version, slow on 32 bit systems 23 | //1: faster on 32 bit systems but produces different results, incompatible with wy2u0k function 24 | #define WYHASH_32BIT_MUM 0 25 | #endif 26 | 27 | //includes 28 | #include 29 | #include 30 | #if defined(_MSC_VER) && defined(_M_X64) 31 | #include 32 | #pragma intrinsic(_umul128) 33 | #endif 34 | 35 | //likely and unlikely macros 36 | #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) 37 | #define _likely_(x) __builtin_expect(x,1) 38 | #define _unlikely_(x) __builtin_expect(x,0) 39 | #else 40 | #define _likely_(x) (x) 41 | #define _unlikely_(x) (x) 42 | #endif 43 | 44 | //128bit multiply function 45 | static inline uint64_t _wyrot(uint64_t x) { return (x>>32)|(x<<32); } 46 | static inline void _wymum(uint64_t *A, uint64_t *B){ 47 | #if(WYHASH_32BIT_MUM) 48 | uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(uint32_t)*B, lh=(uint32_t)*A*(*B>>32), ll=(uint64_t)(uint32_t)*A*(uint32_t)*B; 49 | #if(WYHASH_CONDOM>1) 50 | *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll; 51 | #else 52 | *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll; 53 | #endif 54 | #elif defined(__SIZEOF_INT128__) 55 | __uint128_t r=*A; r*=*B; 56 | #if(WYHASH_CONDOM>1) 57 | *A^=(uint64_t)r; *B^=(uint64_t)(r>>64); 58 | #else 59 | *A=(uint64_t)r; *B=(uint64_t)(r>>64); 60 | #endif 61 | #elif defined(_MSC_VER) && defined(_M_X64) 62 | #if(WYHASH_CONDOM>1) 63 | uint64_t a, b; 64 | a=_umul128(*A,*B,&b); 65 | *A^=a; *B^=b; 66 | #else 67 | *A=_umul128(*A,*B,B); 68 | #endif 69 | #else 70 | uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo; 71 | uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t>32)+(rm1>>32)+c; 73 | #if(WYHASH_CONDOM>1) 74 | *A^=lo; *B^=hi; 75 | #else 76 | *A=lo; *B=hi; 77 | #endif 78 | #endif 79 | } 80 | 81 | //multiply and xor mix function, aka MUM 82 | static inline uint64_t _wymix(uint64_t A, uint64_t B){ _wymum(&A,&B); return A^B; } 83 | 84 | //endian macros 85 | 
#ifndef WYHASH_LITTLE_ENDIAN 86 | #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 87 | #define WYHASH_LITTLE_ENDIAN 1 88 | #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 89 | #define WYHASH_LITTLE_ENDIAN 0 90 | #else 91 | #warning could not determine endianness! Falling back to little endian. 92 | #define WYHASH_LITTLE_ENDIAN 1 93 | #endif 94 | #endif 95 | 96 | //read functions 97 | #if (WYHASH_LITTLE_ENDIAN) 98 | static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v;} 99 | static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return v;} 100 | #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) 101 | static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return __builtin_bswap64(v);} 102 | static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return __builtin_bswap32(v);} 103 | #elif defined(_MSC_VER) 104 | static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return _byteswap_uint64(v);} 105 | static inline uint64_t _wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return _byteswap_ulong(v);} 106 | #else 107 | static inline uint64_t _wyr8(const uint8_t *p) { 108 | uint64_t v; memcpy(&v, p, 8); 109 | return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >> 8) & 0xff000000)| ((v << 8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000)); 110 | } 111 | static inline uint64_t _wyr4(const uint8_t *p) { 112 | uint32_t v; memcpy(&v, p, 4); 113 | return (((v >> 24) & 0xff)| ((v >> 8) & 0xff00)| ((v << 8) & 0xff0000)| ((v << 24) & 0xff000000)); 114 | } 115 | #endif 116 | static inline uint64_t _wyr3(const uint8_t *p, size_t k) { return (((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1];} 117 | 118 | //wyhash main function 119 | static inline uint64_t wyhash(const void *key, size_t len, uint64_t seed, const uint64_t *secret){ 120 | const uint8_t *p=(const uint8_t *)key; seed^=*secret; uint64_t a, b; 121 | if(_likely_(len<=16)){ 122 | if(_likely_(len>=4)){ a=(_wyr4(p)<<32)|_wyr4(p+((len>>3)<<2)); b=(_wyr4(p+len-4)<<32)|_wyr4(p+len-4-((len>>3)<<2)); } 123 | else if(_likely_(len>0)){ a=_wyr3(p,len); b=0;} 124 | else a=b=0; 125 | } 126 | else{ 127 | size_t i=len; 128 | if(_unlikely_(i>48)){ 129 | uint64_t see1=seed, see2=seed; 130 | do{ 131 | seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed); 132 | see1=_wymix(_wyr8(p+16)^secret[2],_wyr8(p+24)^see1); 133 | see2=_wymix(_wyr8(p+32)^secret[3],_wyr8(p+40)^see2); 134 | p+=48; i-=48; 135 | }while(_likely_(i>48)); 136 | seed^=see1^see2; 137 | } 138 | while(_unlikely_(i>16)){ seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed); i-=16; p+=16; } 139 | a=_wyr8(p+i-16); b=_wyr8(p+i-8); 140 | } 141 | return _wymix(secret[1]^len,_wymix(a^secret[1],b^seed)); 142 | } 143 | //the default secret parameters 144 | static const uint64_t _wyp[4] = {0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, 0x589965cc75374cc3ull}; 145 | 146 | //a useful 64bit-64bit mix function to produce deterministic pseudo random numbers that can pass BigCrush and PractRand 147 | static inline uint64_t wyhash64(uint64_t A, uint64_t B){ A^=_wyp[0]; B^=_wyp[1]; _wymum(&A,&B); return _wymix(A^_wyp[0],B^_wyp[1]);} 148 | 149 | //The wyrand PRNG that pass BigCrush and PractRand 150 | static inline uint64_t 
wyrand(uint64_t *seed){ *seed+=_wyp[0]; return _wymix(*seed,*seed^_wyp[1]);} 151 | 152 | //convert any 64 bit pseudo random numbers to uniform distribution [0,1). It can be combined with wyrand, wyhash64 or wyhash. 153 | static inline double wy2u01(uint64_t r){ const double _wynorm=1.0/(1ull<<52); return (r>>12)*_wynorm;} 154 | 155 | //convert any 64 bit pseudo random numbers to APPROXIMATE Gaussian distribution. It can be combined with wyrand, wyhash64 or wyhash. 156 | static inline double wy2gau(uint64_t r){ const double _wynorm=1.0/(1ull<<20); return ((r&0x1fffff)+((r>>21)&0x1fffff)+((r>>42)&0x1fffff))*_wynorm-3.0;} 157 | 158 | #if(!WYHASH_32BIT_MUM) 159 | //fast range integer random number generation on [0,k) credit to Daniel Lemire. May not work when WYHASH_32BIT_MUM=1. It can be combined with wyrand, wyhash64 or wyhash. 160 | static inline uint64_t wy2u0k(uint64_t r, uint64_t k){ _wymum(&r,&k); return k; } 161 | #endif 162 | 163 | //make your own secret 164 | static inline void make_secret(uint64_t seed, uint64_t *secret){ 165 | uint8_t c[] = {15, 23, 27, 29, 30, 39, 43, 45, 46, 51, 53, 54, 57, 58, 60, 71, 75, 77, 78, 83, 85, 86, 89, 90, 92, 99, 101, 102, 105, 106, 108, 113, 114, 116, 120, 135, 139, 141, 142, 147, 149, 150, 153, 154, 156, 163, 165, 166, 169, 170, 172, 177, 178, 180, 184, 195, 197, 198, 201, 202, 204, 209, 210, 212, 216, 225, 226, 228, 232, 240 }; 166 | for(size_t i=0;i<4;i++){ 167 | uint8_t ok; 168 | do{ 169 | ok=1; secret[i]=0; 170 | for(size_t j=0;j<64;j+=8) secret[i]|=((uint64_t)c[wyrand(&seed)%sizeof(c)])<> 1) & 0x5555555555555555; 181 | x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); 182 | x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; 183 | x = (x * 0x0101010101010101) >> 56; 184 | if(x!=32){ ok=0; break; } 185 | #endif 186 | } 187 | if(!ok)continue; 188 | for(uint64_t j=3;j<0x100000000ull;j+=2) if(secret[i]%j==0){ ok=0; break; } 189 | }while(!ok); 190 | } 191 | } 192 | #endif 193 | 194 | /* test vectors for portability test 195 | wyhash("",0)=42bc986dc5eec4d3 196 | wyhash("a",1)=84508dc903c31551 197 | wyhash("abc",2)=bc54887cfc9ecb1 198 | wyhash("message digest",3)=6e2ff3298208a67c 199 | wyhash("abcdefghijklmnopqrstuvwxyz",4)=9a64e42e897195b9 200 | wyhash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",5)=9199383239c32554 201 | wyhash("12345678901234567890123456789012345678901234567890123456789012345678901234567890",6)=7c1ccf6bba30f5a5 202 | */ 203 | 204 | /* The Unlicense 205 | This is free and unencumbered software released into the public domain. 206 | 207 | Anyone is free to copy, modify, publish, use, compile, sell, or 208 | distribute this software, either in source code form or as a compiled 209 | binary, for any purpose, commercial or non-commercial, and by any 210 | means. 211 | 212 | In jurisdictions that recognize copyright laws, the author or authors 213 | of this software dedicate any and all copyright interest in the 214 | software to the public domain. We make this dedication for the benefit 215 | of the public at large and to the detriment of our heirs and 216 | successors. We intend this dedication to be an overt act of 217 | relinquishment in perpetuity of all present and future rights to this 218 | software under copyright law. 219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 221 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 222 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
223 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 224 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 225 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 226 | OTHER DEALINGS IN THE SOFTWARE. 227 | 228 | For more information, please refer to 229 | */ 230 | -------------------------------------------------------------------------------- /sgemm256.hpp: -------------------------------------------------------------------------------- 1 | /* %0=a_ptr, %1=b_ptr, %2=c_ptr, %3=c_tmp, %4=ldc(bytes), %5=&alpha */ 2 | 3 | #define KERNEL_k1m1n8 \ 4 | "vbroadcastss (%0),%%ymm1; addq $4,%0; vfmadd231ps (%1),%%ymm1,%%ymm4; addq $32,%1;" 5 | #define KERNEL_k1m1n16 \ 6 | "vbroadcastss (%0),%%ymm1; addq $4,%0; vfmadd231ps (%1),%%ymm1,%%ymm4; vfmadd231ps 32(%1),%%ymm1,%%ymm5; addq $64,%1;" 7 | #define KERNEL_k1m2n8 \ 8 | "vbroadcastss (%0),%%ymm1; vbroadcastss 4(%0),%%ymm2; addq $8,%0;"\ 9 | "vmovups (%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm4; vfmadd231ps %%ymm2,%%ymm0,%%ymm5; addq $32,%1;" 10 | #define KERNEL_k1m2n16 \ 11 | "vbroadcastss (%0),%%ymm1; vbroadcastss 4(%0),%%ymm2; addq $8,%0;"\ 12 | "vmovups (%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm4; vfmadd231ps %%ymm2,%%ymm0,%%ymm5;"\ 13 | "vmovups 32(%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm6; vfmadd231ps %%ymm2,%%ymm0,%%ymm7; addq $64,%1;" 14 | #define unit_acc_m4n8(c1_no,c2_no,c3_no,c4_no,boff) \ 15 | "vmovsldup "#boff"(%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm"#c1_no"; vfmadd231ps %%ymm2,%%ymm0,%%ymm"#c3_no";"\ 16 | "vmovshdup "#boff"(%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm"#c2_no"; vfmadd231ps %%ymm2,%%ymm0,%%ymm"#c4_no";" 17 | #define KERNEL_k1m4n8 \ 18 | "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; addq $16,%0;"\ 19 | unit_acc_m4n8(4,5,6,7,0) "addq $32,%1;" 20 | #define KERNEL_k1m4n16 \ 21 | "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; addq $16,%0;"\ 22 | unit_acc_m4n8(4,5,6,7,0) unit_acc_m4n8(8,9,10,11,32) "addq $64,%1;" 23 | #define unit_acc_m6n8(c1_no,c2_no,c3_no,c4_no,c5_no,c6_no,boff) \ 24 | "vmovsldup "#boff"(%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm"#c1_no"; vfmadd231ps %%ymm2,%%ymm0,%%ymm"#c3_no"; vfmadd231ps %%ymm3,%%ymm0,%%ymm"#c5_no";"\ 25 | "vmovshdup "#boff"(%1),%%ymm0; vfmadd231ps %%ymm1,%%ymm0,%%ymm"#c2_no"; vfmadd231ps %%ymm2,%%ymm0,%%ymm"#c4_no"; vfmadd231ps %%ymm3,%%ymm0,%%ymm"#c6_no";" 26 | #define KERNEL_k1m6n8 \ 27 | "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; vbroadcastsd 16(%0),%%ymm3; addq $24,%0;"\ 28 | unit_acc_m6n8(4,5,6,7,8,9,0) "addq $32,%1;" 29 | #define KERNEL_k1m6n16 \ 30 | "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; vbroadcastsd 16(%0),%%ymm3; addq $24,%0;"\ 31 | unit_acc_m6n8(4,5,6,7,8,9,0) unit_acc_m6n8(10,11,12,13,14,15,32) "addq $64,%1;" 32 | #define KERNEL_k2m6n16 \ 33 | "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; vbroadcastsd 16(%0),%%ymm3; prefetcht0 256(%1);"\ 34 | unit_acc_m6n8(4,5,6,7,8,9,0) unit_acc_m6n8(10,11,12,13,14,15,32)\ 35 | "vbroadcastsd 24(%0),%%ymm1; vbroadcastsd 32(%0),%%ymm2; vbroadcastsd 40(%0),%%ymm3; prefetcht0 320(%1); addq $48,%0; prefetcht0 384(%0);"\ 36 | unit_acc_m6n8(4,5,6,7,8,9,64) unit_acc_m6n8(10,11,12,13,14,15,96) "addq $128,%1;" 37 | #define save_init_m1 "vbroadcastss (%5),%%ymm0; movq %2,%3; addq $4,%2;" 38 | #define save_init_m2 "vbroadcastss (%5),%%ymm0; movq %2,%3; addq $8,%2;" 39 | #define save_init_m4 "vbroadcastss (%5),%%ymm0; movq %2,%3; addq $16,%2;" 40 | #define save_init_m6 "vbroadcastss (%5),%%ymm0; movq %2,%3; addq $24,%2;" 41 | #define 
unit_save_m1_dn1_m1(c0_no) \ 42 | "vmovss (%3),%%xmm2; vinsertps $16,(%3,%4,1),%%xmm2,%%xmm2; vfmadd231ps %%xmm"#c0_no",%%xmm0,%%xmm2;"\ 43 | "vmovss %%xmm2,(%3); vextractps $1,%%xmm2,(%3,%4,1);" 44 | #define unit_save_m1n8(c1_no) \ 45 | "vextractf128 $1,%%ymm"#c1_no",%%xmm3;" unit_save_m1_dn1_m1(c1_no) "leaq (%3,%4,2),%3;"\ 46 | "vunpckhpd %%xmm"#c1_no",%%xmm"#c1_no",%%xmm"#c1_no";" unit_save_m1_dn1_m1(c1_no) "leaq (%3,%4,2),%3;"\ 47 | unit_save_m1_dn1_m1(3) "leaq (%3,%4,2),%3;"\ 48 | "vunpckhpd %%xmm3,%%xmm3,%%xmm3;" unit_save_m1_dn1_m1(3) "leaq (%3,%4,2),%3;" 49 | #define SAVE_m1n8 save_init_m1 unit_save_m1n8(4) 50 | #define SAVE_m1n16 SAVE_m1n8 unit_save_m1n8(5) 51 | #define unit_save_m4_dn4_m4(c0_no) \ 52 | "vmovups (%3),%%xmm2; vinsertf128 $1,(%3,%4,4),%%ymm2,%%ymm2;"\ 53 | "vfmadd213ps %%ymm2,%%ymm0,%%ymm"#c0_no"; vmovups %%xmm"#c0_no",(%3); vextractf128 $1,%%ymm"#c0_no",(%3,%4,4);" 54 | #define unit_save_m2_dn1_m2(c0_no,off) \ 55 | "vmovsd "#off"(%3),%%xmm2; vmovhpd "#off"(%3,%4,1),%%xmm2,%%xmm2; vfmadd231ps %%xmm"#c0_no",%%xmm0,%%xmm2;"\ 56 | "vmovsd %%xmm2,"#off"(%3); vmovhpd %%xmm2,"#off"(%3,%4,1);" 57 | #define unit_save_m2_dn2_m2(c0_no,off) \ 58 | "vmovsd "#off"(%3),%%xmm2; vmovhpd "#off"(%3,%4,2),%%xmm2,%%xmm2; vfmadd231ps %%xmm"#c0_no",%%xmm0,%%xmm2;"\ 59 | "vmovsd %%xmm2,"#off"(%3); vmovhpd %%xmm2,"#off"(%3,%4,2);" 60 | #define unit_save_dn2_m2_dn2_m2(c0_no) \ 61 | "vmovsd 16(%3,%4,2),%%xmm2; vmovhpd 16(%3,%4,4),%%xmm2,%%xmm2; vfmadd231ps %%xmm"#c0_no",%%xmm0,%%xmm2;"\ 62 | "vmovsd %%xmm2,16(%3,%4,2); vmovhpd %%xmm2,16(%3,%4,4);" 63 | #define unit_save_m2n8(c1_no,c2_no) \ 64 | "vunpcklps %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vunpckhps %%ymm"#c2_no",%%ymm"#c1_no",%%ymm3;"\ 65 | unit_save_m2_dn1_m2(1,0) "leaq (%3,%4,2),%3; vextractf128 $1,%%ymm1,%%xmm1;"\ 66 | unit_save_m2_dn1_m2(3,0) "leaq (%3,%4,2),%3; vextractf128 $1,%%ymm3,%%xmm3;"\ 67 | unit_save_m2_dn1_m2(1,0) "leaq (%3,%4,2),%3;"\ 68 | unit_save_m2_dn1_m2(3,0) "leaq (%3,%4,2),%3;" 69 | #define SAVE_m2n8 save_init_m2 unit_save_m2n8(4,5) 70 | #define SAVE_m2n16 SAVE_m2n8 unit_save_m2n8(6,7) 71 | #define unit_save_m4n8(c1_no,c2_no,c3_no,c4_no) \ 72 | "vunpcklpd %%ymm"#c3_no",%%ymm"#c1_no",%%ymm1;" unit_save_m4_dn4_m4(1) "addq %4,%3;"\ 73 | "vunpcklpd %%ymm"#c4_no",%%ymm"#c2_no",%%ymm1;" unit_save_m4_dn4_m4(1) "addq %4,%3;"\ 74 | "vunpckhpd %%ymm"#c3_no",%%ymm"#c1_no",%%ymm1;" unit_save_m4_dn4_m4(1) "addq %4,%3;"\ 75 | "vunpckhpd %%ymm"#c4_no",%%ymm"#c2_no",%%ymm1;" unit_save_m4_dn4_m4(1) "addq %4,%3; leaq (%3,%4,4),%3;" 76 | #define SAVE_m4n8 save_init_m4 unit_save_m4n8(4,5,6,7) 77 | #define SAVE_m4n16 SAVE_m4n8 unit_save_m4n8(8,9,10,11) 78 | #define unit_save_m6n8(c1_no,c2_no,c3_no,c4_no,c5_no,c6_no) \ 79 | "vunpcklpd %%ymm"#c3_no",%%ymm"#c1_no",%%ymm1;" unit_save_m4_dn4_m4(1) unit_save_m2_dn2_m2(c5_no,16) "addq %4,%3;"\ 80 | "vunpcklpd %%ymm"#c4_no",%%ymm"#c2_no",%%ymm1;" unit_save_m4_dn4_m4(1) unit_save_m2_dn2_m2(c6_no,16) "addq %4,%3;"\ 81 | "vextractf128 $1,%%ymm"#c5_no",%%xmm"#c5_no"; vextractf128 $1,%%ymm"#c6_no",%%xmm"#c6_no";"\ 82 | "vunpckhpd %%ymm"#c3_no",%%ymm"#c1_no",%%ymm1;" unit_save_m4_dn4_m4(1) unit_save_dn2_m2_dn2_m2(c5_no) "addq %4,%3;"\ 83 | "vunpckhpd %%ymm"#c4_no",%%ymm"#c2_no",%%ymm1;" unit_save_m4_dn4_m4(1) unit_save_dn2_m2_dn2_m2(c6_no) "addq %4,%3; leaq (%3,%4,4),%3;" 84 | #define SAVE_m6n8 save_init_m6 unit_save_m6n8(4,5,6,7,8,9) 85 | #define SAVE_m6n16 SAVE_m6n8 unit_save_m6n8(10,11,12,13,14,15) 86 | #define INIT_m1n8 "vpxor %%ymm4,%%ymm4,%%ymm4;" 87 | #define INIT_m1n16 INIT_m1n8 "vpxor 
%%ymm5,%%ymm5,%%ymm5;" 88 | #define INIT_m2n8 INIT_m1n16 89 | #define INIT_m2n16 INIT_m2n8 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;" 90 | #define INIT_m4n8 INIT_m2n16 91 | #define INIT_m4n16 INIT_m4n8\ 92 | "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9; vpxor %%ymm10,%%ymm10,%%ymm10; vpxor %%ymm11,%%ymm11,%%ymm11;" 93 | #define INIT_m6n8 INIT_m4n8 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;" 94 | #define INIT_m6n16 INIT_m6n8\ 95 | "vpxor %%ymm10,%%ymm10,%%ymm10; vpxor %%ymm11,%%ymm11,%%ymm11; vpxor %%ymm12,%%ymm12,%%ymm12;"\ 96 | "vpxor %%ymm13,%%ymm13,%%ymm13; vpxor %%ymm14,%%ymm14,%%ymm14; vpxor %%ymm15,%%ymm15,%%ymm15;" 97 | 98 | #define KERNEL_k1m4n1 \ 99 | "vbroadcastss (%1),%%xmm1; addq $4,%1; vfmadd231ps (%0),%%xmm1,%%xmm4; addq $16,%0;" 100 | #define KERNEL_k1m4n2 \ 101 | "vmovups (%0),%%xmm0; addq $16,%0;"\ 102 | "vbroadcastss (%1),%%xmm1; vfmadd231ps %%xmm0,%%xmm1,%%xmm4;"\ 103 | "vbroadcastss 4(%1),%%xmm2; vfmadd231ps %%xmm0,%%xmm2,%%xmm5; addq $8,%1;" 104 | #define KERNEL_k1m4n4 \ 105 | "vmovsldup (%1),%%xmm1; vmovshdup (%1),%%xmm2; addq $16,%1;"\ 106 | "vmovddup (%0),%%xmm0; vfmadd231ps %%xmm1,%%xmm0,%%xmm4; vfmadd231ps %%xmm2,%%xmm0,%%xmm5;"\ 107 | "vmovddup 8(%0),%%xmm0; vfmadd231ps %%xmm1,%%xmm0,%%xmm6; vfmadd231ps %%xmm2,%%xmm0,%%xmm7; addq $16,%0;" 108 | #define SAVE_m4n1 save_init_m4 "vfmadd213ps (%3),%%xmm0,%%xmm4; vmovups %%xmm4,(%3);" 109 | #define SAVE_m4n2 SAVE_m4n1 "vfmadd213ps (%3,%4,1),%%xmm0,%%xmm5; vmovups %%xmm5,(%3,%4,1);" 110 | #define SAVE_m4n4 save_init_m4\ 111 | "vunpcklpd %%xmm6,%%xmm4,%%xmm1; vfmadd213ps (%3),%%xmm0,%%xmm1; vmovups %%xmm1,(%3);"\ 112 | "vunpcklpd %%xmm7,%%xmm5,%%xmm2; vfmadd213ps (%3,%4,1),%%xmm0,%%xmm2; vmovups %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"\ 113 | "vunpckhpd %%xmm6,%%xmm4,%%xmm1; vfmadd213ps (%3),%%xmm0,%%xmm1; vmovups %%xmm1,(%3);"\ 114 | "vunpckhpd %%xmm7,%%xmm5,%%xmm2; vfmadd213ps (%3,%4,1),%%xmm0,%%xmm2; vmovups %%xmm2,(%3,%4,1);" 115 | #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 116 | #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" 117 | #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" 118 | #define KERNEL_k1m6n1 \ 119 | "vbroadcastss (%1),%%xmm1; addq $4,%1;"\ 120 | "vfmadd231ps (%0),%%xmm1,%%xmm4; vfmadd231ps 8(%0),%%xmm1,%%xmm5; addq $24,%0;" 121 | #define KERNEL_k1m6n2 \ 122 | "vbroadcastss (%1),%%xmm1; vbroadcastss 4(%1),%%xmm2; addq $8,%1;"\ 123 | "vmovups (%0),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm6;"\ 124 | "vmovsd 16(%0),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5; vfmadd231ps %%xmm2,%%xmm3,%%xmm7; addq $24,%0;" 125 | #define KERNEL_k1m6n4 \ 126 | "vmovsldup (%1),%%xmm1; vmovshdup (%1),%%xmm2; addq $16,%1;"\ 127 | "vmovddup (%0),%%xmm0; vfmadd231ps %%xmm1,%%xmm0,%%xmm4; vfmadd231ps %%xmm2,%%xmm0,%%xmm5;"\ 128 | "vmovddup 8(%0),%%xmm0; vfmadd231ps %%xmm1,%%xmm0,%%xmm6; vfmadd231ps %%xmm2,%%xmm0,%%xmm7;"\ 129 | "vmovddup 16(%0),%%xmm0; vfmadd231ps %%xmm1,%%xmm0,%%xmm8; vfmadd231ps %%xmm2,%%xmm0,%%xmm9; addq $24,%0;" 130 | #define SAVE_m6n1 save_init_m6 "vfmadd213ps (%3),%%xmm0,%%xmm4; vmovsd %%xmm4,(%3); vfmadd213ps 8(%3),%%xmm0,%%xmm5; vmovups %%xmm5,8(%3);" 131 | #define SAVE_m6n2 save_init_m6\ 132 | "vfmadd213ps (%3),%%xmm0,%%xmm4; vmovups %%xmm4,(%3); vmovsd 16(%3),%%xmm2; vfmadd213ps %%xmm2,%%xmm0,%%xmm5; vmovsd %%xmm5,16(%3);"\ 133 | "vfmadd213ps (%3,%4,1),%%xmm0,%%xmm6; vmovups %%xmm6,(%3,%4,1); vmovsd 16(%3,%4,1),%%xmm2; vfmadd213ps %%xmm2,%%xmm0,%%xmm7; vmovsd %%xmm7,16(%3,%4,1);" 134 | 
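// Descriptive comment on the SAVE macros above and below (my reading of the assembly, not an original author comment):
// each SAVE_m{M}n{N} writes back an M-row by N-column tile of C. save_init_m* broadcasts alpha (%5) into ymm0/xmm0
// and points c_tmp (%3) at the current C tile, and each unit_save_* step loads the existing C values, computes
// C = alpha*accumulator + C with an FMA, stores the result, and steps to the next column of C via ldc (%4),
// i.e. the kernels implement the beta=1 update C += alpha*A*B on the packed tile.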
#define SAVE_m6n4 save_init_m6\ 135 | "vunpcklpd %%xmm6,%%xmm4,%%xmm1; vfmadd213ps (%3),%%xmm0,%%xmm1; vmovups %%xmm1,(%3);"\ 136 | "vunpckhpd %%xmm6,%%xmm4,%%xmm3; vfmadd213ps (%3,%4,2),%%xmm0,%%xmm3; vmovups %%xmm3,(%3,%4,2);" unit_save_m2_dn2_m2(8,16) "addq %4,%3;"\ 137 | "vunpcklpd %%xmm7,%%xmm5,%%xmm1; vfmadd213ps (%3),%%xmm0,%%xmm1; vmovups %%xmm1,(%3);"\ 138 | "vunpckhpd %%xmm7,%%xmm5,%%xmm3; vfmadd213ps (%3,%4,2),%%xmm0,%%xmm3; vmovups %%xmm3,(%3,%4,2);" unit_save_m2_dn2_m2(9,16) 139 | #define INIT_m6n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;" 140 | #define INIT_m6n2 INIT_m6n1 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" 141 | #define INIT_m6n4 INIT_m6n2 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" 142 | #define KERNEL_k1m2n4 \ 143 | "vmovups (%1),%%xmm0; addq $16,%1;"\ 144 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm0,%%xmm1,%%xmm4;"\ 145 | "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm0,%%xmm2,%%xmm5; addq $8,%0;" 146 | #define KERNEL_k1m2n2 \ 147 | "vmovsd (%0),%%xmm0; addq $8,%0;"\ 148 | "vbroadcastss (%1),%%xmm1; vfmadd231ps %%xmm0,%%xmm1,%%xmm4;"\ 149 | "vbroadcastss 4(%1),%%xmm2; vfmadd231ps %%xmm0,%%xmm2,%%xmm5; addq $8,%1;" 150 | #define KERNEL_k1m2n1 \ 151 | "vbroadcastss (%1),%%xmm1; addq $4,%1;"\ 152 | "vmovsd (%0),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4; addq $8,%0;" 153 | #define SAVE_m2n1 save_init_m2 "vmovsd (%3),%%xmm2; vfmadd213ps %%xmm2,%%xmm0,%%xmm4; vmovsd %%xmm4,(%3);" 154 | #define SAVE_m2n2 SAVE_m2n1 "vmovsd (%3,%4,1),%%xmm2; vfmadd213ps %%xmm2,%%xmm0,%%xmm5; vmovsd %%xmm5,(%3,%4,1);" 155 | #define SAVE_m2n4 save_init_m2\ 156 | "vunpcklps %%xmm5,%%xmm4,%%xmm1;" unit_save_m2_dn1_m2(1,0) "leaq (%3,%4,2),%3;"\ 157 | "vunpckhps %%xmm5,%%xmm4,%%xmm1;" unit_save_m2_dn1_m2(1,0) 158 | #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 159 | #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" 160 | #define INIT_m2n4 INIT_m2n2 161 | #define KERNEL_k1m1n1 "vmovss (%1),%%xmm1; addq $4,%1; vfmadd231ss (%0),%%xmm1,%%xmm4; addq $4,%0;" 162 | #define KERNEL_k1m1n2 "vmovsd (%1),%%xmm1; addq $8,%1; vbroadcastss (%0),%%xmm2; vfmadd231ps %%xmm2,%%xmm1,%%xmm4; addq $4,%0;" 163 | #define KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm2; addq $4,%0; vfmadd231ps (%1),%%xmm2,%%xmm4; addq $16,%1;" 164 | #define SAVE_m1n1 save_init_m1 "vfmadd213ss (%3),%%xmm0,%%xmm4; vmovss %%xmm4,(%3);" 165 | #define SAVE_m1n2 save_init_m1 unit_save_m1_dn1_m1(4) 166 | #define SAVE_m1n4 SAVE_m1n2 "leaq (%3,%4,2),%3; vunpckhpd %%xmm4,%%xmm4,%%xmm4;" unit_save_m1_dn1_m1(4) 167 | #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 168 | #define INIT_m1n2 INIT_m1n1 169 | #define INIT_m1n4 INIT_m1n1 170 | 171 | /* %6=k_counter, %7=b_pref */ 172 | /* r12=tmp, r13=k, r14=b_head */ 173 | #define COMPUTE_SIMPLE(mdim,ndim) \ 174 | "testq %%r13,%%r13; jz 5"#mdim"55"#ndim"5f;"\ 175 | "movq %%r13,%6; movq %%r14,%1;" INIT_m##mdim##n##ndim\ 176 | "5"#mdim"55"#ndim"7:\n\t"\ 177 | KERNEL_k1m##mdim##n##ndim "decq %6; jnz 5"#mdim"55"#ndim"7b;"\ 178 | "5"#mdim"55"#ndim"5:\n\t"\ 179 | SAVE_m##mdim##n##ndim 180 | #define COMPUTE_m6n1 COMPUTE_SIMPLE(6,1) 181 | #define COMPUTE_m6n2 COMPUTE_SIMPLE(6,2) 182 | #define COMPUTE_m6n4 COMPUTE_SIMPLE(6,4) 183 | #define COMPUTE_m6n8 COMPUTE_SIMPLE(6,8) 184 | #define COMPUTE_m6n16 \ 185 | "movq %%r13,%6; movq %%r14,%1;" INIT_m6n16\ 186 | "cmpq $16,%6; jb 5655165f; movq %2,%3; testq %%r12,%%r12;"\ 187 | "5655167:\n\t"\ 188 | KERNEL_k2m6n16 "cmpq $46,%%r12; movq $46,%%r12; cmoveq %4,%%r12;"\ 189 | KERNEL_k2m6n16 "prefetcht1 (%3); subq $23,%3; addq 
%%r12,%3;"\ 190 | KERNEL_k2m6n16 "prefetcht1 (%7); addq $16,%7;"\ 191 | KERNEL_k2m6n16 "subq $8,%6; cmpq $16,%6; jnb 5655167b;"\ 192 | "5655165:\n\t"\ 193 | "movq %2,%3; prefetcht0 (%5); testq %6,%6; jz 5655169f;"\ 194 | "5655163:\n\t"\ 195 | "prefetcht0 (%3); prefetcht0 23(%3); prefetcht0 (%3,%4,1); prefetcht0 23(%3,%4,1);"\ 196 | KERNEL_k1m6n16 "leaq (%3,%4,2),%3; decq %6; jnz 5655163b;"\ 197 | "5655169:\n\t"\ 198 | "prefetcht0 (%%r14); prefetcht0 64(%%r14); prefetcht0 128(%%r14); prefetcht0 192(%%r14);" SAVE_m6n16 199 | 200 | /* r11=m_counter */ 201 | #define COMPUTE(ndim) {\ 202 | b_pref=b_ptr+ndim*ldc;\ 203 | __asm__ __volatile__(\ 204 | "movq %1,%%r14; movq %6,%%r13; movq %8,%%r11;"\ 205 | "cmpq $6,%%r11; jb 99301f;"\ 206 | "99300:\n\t"\ 207 | COMPUTE_m6n##ndim "subq $6,%%r11; cmpq $6,%%r11; jnb 99300b;"\ 208 | "99301:\n\t"\ 209 | "cmpq $4,%%r11; jb 99302f;"\ 210 | COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ 211 | "99302:\n\t"\ 212 | "cmpq $2,%%r11; jb 99303f;"\ 213 | COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ 214 | "99303:\n\t"\ 215 | "testq %%r11,%%r11; jz 99304f;"\ 216 | COMPUTE_SIMPLE(1,ndim)\ 217 | "99304:\n\t"\ 218 | "movq %%r13,%6; movq %%r14,%1;"\ 219 | :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(alp),"+r"(K),"+r"(b_pref)\ 220 | :"m"(M):"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",\ 221 | "xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15",\ 222 | "r11","r12","r13","r14","cc","memory");\ 223 | a_ptr-=M*K; b_ptr+=K*ndim; c_ptr+=ldc*ndim-M;\ 224 | } 225 | 226 | //#include "common.h" 227 | #include 228 | #include //debug 229 | #include //debug 230 | #include 231 | #define BLASLONG int//debug 232 | int __attribute__ ((noinline)) 233 | CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) 234 | { 235 | if(m==0||n==0||k==0||alpha==(float)0.0) return 0; 236 | int64_t ldc_in_bytes = (int64_t)ldc * sizeof(float); float ALPHA = alpha; 237 | int64_t M = (int64_t)m, K = (int64_t)k; 238 | BLASLONG n_count = n; 239 | float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = &ALPHA,*b_pref = B; 240 | for(;n_count>15;n_count-=16) COMPUTE(16) 241 | for(;n_count>7;n_count-=8) COMPUTE(8) 242 | for(;n_count>3;n_count-=4) COMPUTE(4) 243 | for(;n_count>1;n_count-=2) COMPUTE(2) 244 | if(n_count>0) COMPUTE(1) 245 | return 0; 246 | } 247 | /* test zone */ 248 | static void sgemm_tcopy_6(float *src, float *dst, BLASLONG lead_dim, BLASLONG dim_first, BLASLONG dim_second){ 249 | //src_leading_dim parallel with dst_tile_leading_dim 250 | if(dim_first==0 || dim_second==0) return; 251 | BLASLONG count_first,count_second; 252 | float *tosrc,*todst; 253 | for(count_second=0;count_second5;count_first-=6){ 257 | todst[0]=tosrc[0];todst[1]=tosrc[1];todst[2]=tosrc[2];todst[3]=tosrc[3]; 258 | todst[4]=tosrc[4];todst[5]=tosrc[5]; 259 | tosrc+=6;todst+=6*dim_second; 260 | } 261 | todst -= count_second * 2; 262 | for(;count_first>3;count_first-=4){ 263 | todst[0]=tosrc[0];todst[1]=tosrc[1];todst[2]=tosrc[2];todst[3]=tosrc[3]; 264 | tosrc+=4;todst+=4*dim_second; 265 | } 266 | todst -= count_second * 2; 267 | for(;count_first>1;count_first-=2){ 268 | todst[0]=tosrc[0];todst[1]=tosrc[1]; 269 | tosrc+=2;todst+=2*dim_second; 270 | } 271 | todst -= count_second; 272 | if(count_first>0) *todst=*tosrc; 273 | } 274 | } 275 | static void sgemm_ncopy_6(float *src, float *dst, BLASLONG lead_dim, BLASLONG dim_first, BLASLONG dim_second){ 276 | //src_leading_dim perpendicular to dst_tile_leading_dim 
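// Descriptive comment (my reading of the code, not an original author comment): this routine walks six source
// pointers spaced lead_dim apart (tosrc1..tosrc6) and interleaves them element by element into the destination
// buffer, falling back to 4/2/1-wide panels for the remainder, so the packed operand can be streamed with unit
// stride by the m6/m4/m2/m1 micro-kernels; the sgemm() driver below uses it to pack the A block when transa is set.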
277 | if(dim_first==0 || dim_second==0) return; 278 | BLASLONG count_first,count_second,tosrc_inc; 279 | float *tosrc1,*tosrc2,*tosrc3,*tosrc4,*tosrc5,*tosrc6; 280 | float *todst=dst; 281 | tosrc1=src;tosrc2=tosrc1+lead_dim;tosrc3=tosrc2+lead_dim;tosrc4=tosrc3+lead_dim; 282 | tosrc5=tosrc4+lead_dim;tosrc6=tosrc5+lead_dim; 283 | tosrc_inc=6*lead_dim-dim_first; 284 | for(count_second=dim_second;count_second>5;count_second-=6){ 285 | for(count_first=0;count_first3;count_second-=4){ 296 | for(count_first=0;count_first1;count_second-=2){ 305 | for(count_first=0;count_first0){ 312 | for(count_first=0;count_first15;count_first-=16){ 327 | todst[0]=tosrc[0];todst[1]=tosrc[1];todst[2]=tosrc[2];todst[3]=tosrc[3]; 328 | todst[4]=tosrc[4];todst[5]=tosrc[5];todst[6]=tosrc[6];todst[7]=tosrc[7]; 329 | todst[8]=tosrc[8];todst[9]=tosrc[9];todst[10]=tosrc[10];todst[11]=tosrc[11]; 330 | todst[12]=tosrc[12];todst[13]=tosrc[13];todst[14]=tosrc[14];todst[15]=tosrc[15]; 331 | tosrc+=16;todst+=16*dim_second; 332 | } 333 | todst -= count_second * 8; 334 | for(;count_first>7;count_first-=8){ 335 | todst[0]=tosrc[0];todst[1]=tosrc[1];todst[2]=tosrc[2];todst[3]=tosrc[3]; 336 | todst[4]=tosrc[4];todst[5]=tosrc[5];todst[6]=tosrc[6];todst[7]=tosrc[7]; 337 | tosrc+=8;todst+=8*dim_second; 338 | } 339 | todst -= count_second * 4; 340 | for(;count_first>3;count_first-=4){ 341 | todst[0]=tosrc[0];todst[1]=tosrc[1];todst[2]=tosrc[2];todst[3]=tosrc[3]; 342 | tosrc+=4;todst+=4*dim_second; 343 | } 344 | todst -= count_second * 2; 345 | for(;count_first>1;count_first-=2){ 346 | todst[0]=tosrc[0];todst[1]=tosrc[1]; 347 | tosrc+=2;todst+=2*dim_second; 348 | } 349 | todst -= count_second; 350 | if(count_first>0) *todst=*tosrc; 351 | } 352 | } 353 | static void sgemm_ncopy_16(float *src, float *dst, BLASLONG lead_dim, BLASLONG dim_first, BLASLONG dim_second){ 354 | //src_leading_dim perpendicular to dst_tile_leading_dim 355 | if(dim_first==0 || dim_second==0) return; 356 | BLASLONG count_first,count_second,tosrc_inc; 357 | float *tosrc1,*tosrc2,*tosrc3,*tosrc4,*tosrc5,*tosrc6,*tosrc7,*tosrc8; 358 | float *tosrc9,*tosrc10,*tosrc11,*tosrc12,*tosrc13,*tosrc14,*tosrc15,*tosrc16; 359 | float *todst=dst; 360 | tosrc1=src;tosrc2=tosrc1+lead_dim;tosrc3=tosrc2+lead_dim;tosrc4=tosrc3+lead_dim; 361 | tosrc5=tosrc4+lead_dim;tosrc6=tosrc5+lead_dim;tosrc7=tosrc6+lead_dim;tosrc8=tosrc7+lead_dim; 362 | tosrc9=tosrc8+lead_dim;tosrc10=tosrc9+lead_dim;tosrc11=tosrc10+lead_dim;tosrc12=tosrc11+lead_dim; 363 | tosrc13=tosrc12+lead_dim;tosrc14=tosrc13+lead_dim;tosrc15=tosrc14+lead_dim;tosrc16=tosrc15+lead_dim; 364 | tosrc_inc=16*lead_dim-dim_first; 365 | for(count_second=dim_second;count_second>15;count_second-=16){ 366 | for(count_first=0;count_first7;count_second-=8){ 384 | for(count_first=0;count_first3;count_second-=4){ 396 | for(count_first=0;count_first1;count_second-=2){ 405 | for(count_first=0;count_first0){ 412 | for(count_first=0;count_first 433 | void inline sgemm(float alpha,float *a,float *b,float *c){ 434 | const unsigned BLOCKDIM=256; 435 | float *b_buffer = (float *)aligned_alloc(64,BLOCKDIM*n*sizeof(float)); 436 | float *a_buffer = (float *)aligned_alloc(4096,BLOCKDIM*BLOCKDIM*sizeof(float)); 437 | float *a_current_pos,*b_current_pos=b; 438 | unsigned m_count,k_count,k_subdim,m_subdim; 439 | if(beta==0) memset(c,0,m*n*sizeof(float)); 440 | for(k_count=0;k_count BLOCKDIM) k_subdim = BLOCKDIM; 443 | if(!transb) { sgemm_ncopy_16(b_current_pos,b_buffer,ldb,k_subdim,n); b_current_pos += BLOCKDIM; } 444 | else { 
sgemm_tcopy_16(b_current_pos,b_buffer,ldb,n,k_subdim); b_current_pos += (int64_t)(ldb) * BLOCKDIM; } 445 | if(!transa) a_current_pos = a + (int64_t)k_count * (int64_t)(lda); 446 | else a_current_pos = a + k_count; 447 | for(m_count=0;m_count BLOCKDIM) m_subdim = BLOCKDIM; 450 | if(!transa) { sgemm_tcopy_6(a_current_pos,a_buffer,lda,m_subdim,k_subdim); a_current_pos += BLOCKDIM; } 451 | else { sgemm_ncopy_6(a_current_pos,a_buffer,lda,k_subdim,m_subdim); a_current_pos += (int64_t)(lda) * BLOCKDIM; } 452 | CNAME(m_subdim,n,k_subdim,alpha,a_buffer,b_buffer,c+m_count,ldc); 453 | } 454 | } 455 | free(a_buffer); free(b_buffer); 456 | } 457 | 458 | -------------------------------------------------------------------------------- /avx_mathfun.h: -------------------------------------------------------------------------------- 1 | /* 2 | AVX implementation of sin, cos, sincos, exp and log 3 | 4 | Based on "sse_mathfun.h", by Julien Pommier 5 | http://gruntthepeon.free.fr/ssemath/ 6 | 7 | Copyright (C) 2012 Giovanni Garberoglio 8 | Interdisciplinary Laboratory for Computational Science (LISC) 9 | Fondazione Bruno Kessler and University of Trento 10 | via Sommarive, 18 11 | I-38123 Trento (Italy) 12 | 13 | This software is provided 'as-is', without any express or implied 14 | warranty. In no event will the authors be held liable for any damages 15 | arising from the use of this software. 16 | 17 | Permission is granted to anyone to use this software for any purpose, 18 | including commercial applications, and to alter it and redistribute it 19 | freely, subject to the following restrictions: 20 | 21 | 1. The origin of this software must not be misrepresented; you must not 22 | claim that you wrote the original software. If you use this software 23 | in a product, an acknowledgment in the product documentation would be 24 | appreciated but is not required. 25 | 2. Altered source versions must be plainly marked as such, and must not be 26 | misrepresented as being the original software. 27 | 3. This notice may not be removed or altered from any source distribution. 28 | 29 | (this is the zlib license) 30 | */ 31 | 32 | #include 33 | 34 | /* yes I know, the top of this file is quite ugly */ 35 | # define ALIGN32_BEG 36 | # define ALIGN32_END __attribute__((aligned(32))) 37 | 38 | /* __m128 is ugly to write */ 39 | typedef __m256 v8sf; // vector of 8 float (avx) 40 | typedef __m256i v8si; // vector of 8 int (avx) 41 | typedef __m128i v4si; // vector of 8 int (avx) 42 | 43 | #define _PI32AVX_CONST(Name, Val) \ 44 | static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val } 45 | 46 | _PI32AVX_CONST(1, 1); 47 | _PI32AVX_CONST(inv1, ~1); 48 | _PI32AVX_CONST(2, 2); 49 | _PI32AVX_CONST(4, 4); 50 | 51 | 52 | /* declare some AVX constants -- why can't I figure a better way to do that? 
*/ 53 | #define _PS256_CONST(Name, Val) \ 54 | static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } 55 | #define _PI32_CONST256(Name, Val) \ 56 | static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } 57 | #define _PS256_CONST_TYPE(Name, Type, Val) \ 58 | static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } 59 | 60 | _PS256_CONST(1 , 1.0f); 61 | _PS256_CONST(0p5, 0.5f); 62 | /* the smallest non denormalized float number */ 63 | _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); 64 | _PS256_CONST_TYPE(mant_mask, int, 0x7f800000); 65 | _PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); 66 | 67 | _PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); 68 | _PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); 69 | 70 | _PI32_CONST256(0, 0); 71 | _PI32_CONST256(1, 1); 72 | _PI32_CONST256(inv1, ~1); 73 | _PI32_CONST256(2, 2); 74 | _PI32_CONST256(4, 4); 75 | _PI32_CONST256(0x7f, 0x7f); 76 | 77 | _PS256_CONST(cephes_SQRTHF, 0.707106781186547524); 78 | _PS256_CONST(cephes_log_p0, 7.0376836292E-2); 79 | _PS256_CONST(cephes_log_p1, - 1.1514610310E-1); 80 | _PS256_CONST(cephes_log_p2, 1.1676998740E-1); 81 | _PS256_CONST(cephes_log_p3, - 1.2420140846E-1); 82 | _PS256_CONST(cephes_log_p4, + 1.4249322787E-1); 83 | _PS256_CONST(cephes_log_p5, - 1.6668057665E-1); 84 | _PS256_CONST(cephes_log_p6, + 2.0000714765E-1); 85 | _PS256_CONST(cephes_log_p7, - 2.4999993993E-1); 86 | _PS256_CONST(cephes_log_p8, + 3.3333331174E-1); 87 | _PS256_CONST(cephes_log_q1, -2.12194440e-4); 88 | _PS256_CONST(cephes_log_q2, 0.693359375); 89 | 90 | #ifndef __AVX2__ 91 | 92 | typedef union imm_xmm_union { 93 | v8si imm; 94 | v4si xmm[2]; 95 | } imm_xmm_union; 96 | 97 | #define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \ 98 | imm_xmm_union u __attribute__((aligned(32))); \ 99 | u.imm = imm_; \ 100 | xmm0_ = u.xmm[0]; \ 101 | xmm1_ = u.xmm[1]; \ 102 | } 103 | 104 | #define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ 105 | imm_xmm_union u __attribute__((aligned(32))); \ 106 | u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ 107 | } 108 | 109 | 110 | #define AVX2_BITOP_USING_SSE2(fn) \ 111 | static inline v8si avx2_mm256_##fn(v8si x, int a) \ 112 | { \ 113 | /* use SSE2 instruction to perform the bitop AVX2 */ \ 114 | v4si x1, x2; \ 115 | v8si ret; \ 116 | COPY_IMM_TO_XMM(x, x1, x2); \ 117 | x1 = _mm_##fn(x1,a); \ 118 | x2 = _mm_##fn(x2,a); \ 119 | COPY_XMM_TO_IMM(x1, x2, ret); \ 120 | return(ret); \ 121 | } 122 | 123 | //#warning "Using SSE2 to perform AVX2 bitshift ops" 124 | AVX2_BITOP_USING_SSE2(slli_epi32) 125 | AVX2_BITOP_USING_SSE2(srli_epi32) 126 | 127 | #define AVX2_INTOP_USING_SSE2(fn) \ 128 | static inline v8si avx2_mm256_##fn(v8si x, v8si y) \ 129 | { \ 130 | /* use SSE2 instructions to perform the AVX2 integer operation */ \ 131 | v4si x1, x2; \ 132 | v4si y1, y2; \ 133 | v8si ret; \ 134 | COPY_IMM_TO_XMM(x, x1, x2); \ 135 | COPY_IMM_TO_XMM(y, y1, y2); \ 136 | x1 = _mm_##fn(x1,y1); \ 137 | x2 = _mm_##fn(x2,y2); \ 138 | COPY_XMM_TO_IMM(x1, x2, ret); \ 139 | return(ret); \ 140 | } 141 | 142 | //#warning "Using SSE2 to perform AVX2 integer ops" 143 | AVX2_INTOP_USING_SSE2(and_si128) 144 | AVX2_INTOP_USING_SSE2(andnot_si128) 145 | AVX2_INTOP_USING_SSE2(cmpeq_epi32) 146 | AVX2_INTOP_USING_SSE2(sub_epi32) 147 | AVX2_INTOP_USING_SSE2(add_epi32) 148 | #define avx2_mm256_and_si256 avx2_mm256_and_si128 149 | #define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 150 | #else 151 | #define 
avx2_mm256_slli_epi32 _mm256_slli_epi32 152 | #define avx2_mm256_srli_epi32 _mm256_srli_epi32 153 | #define avx2_mm256_and_si256 _mm256_and_si256 154 | #define avx2_mm256_andnot_si256 _mm256_andnot_si256 155 | #define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 156 | #define avx2_mm256_sub_epi32 _mm256_sub_epi32 157 | #define avx2_mm256_add_epi32 _mm256_add_epi32 158 | #endif /* __AVX2__ */ 159 | 160 | 161 | /* natural logarithm computed for 8 simultaneous float 162 | return NaN for x <= 0 163 | */ 164 | v8sf log256_ps(v8sf x) { 165 | v8si imm0; 166 | v8sf one = *(v8sf*)_ps256_1; 167 | 168 | //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); 169 | v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); 170 | 171 | x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ 172 | 173 | // can be done with AVX2 174 | imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); 175 | 176 | /* keep only the fractional part */ 177 | x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); 178 | x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); 179 | 180 | // this is again another AVX2 instruction 181 | imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); 182 | v8sf e = _mm256_cvtepi32_ps(imm0); 183 | 184 | e = _mm256_add_ps(e, one); 185 | 186 | /* part2: 187 | if( x < SQRTHF ) { 188 | e -= 1; 189 | x = x + x - 1.0; 190 | } else { x = x - 1.0; } 191 | */ 192 | //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); 193 | v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); 194 | v8sf tmp = _mm256_and_ps(x, mask); 195 | x = _mm256_sub_ps(x, one); 196 | e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); 197 | x = _mm256_add_ps(x, tmp); 198 | 199 | v8sf z = _mm256_mul_ps(x,x); 200 | 201 | v8sf y = *(v8sf*)_ps256_cephes_log_p0; 202 | y = _mm256_mul_ps(y, x); 203 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); 204 | y = _mm256_mul_ps(y, x); 205 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); 206 | y = _mm256_mul_ps(y, x); 207 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); 208 | y = _mm256_mul_ps(y, x); 209 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); 210 | y = _mm256_mul_ps(y, x); 211 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); 212 | y = _mm256_mul_ps(y, x); 213 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); 214 | y = _mm256_mul_ps(y, x); 215 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); 216 | y = _mm256_mul_ps(y, x); 217 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); 218 | y = _mm256_mul_ps(y, x); 219 | 220 | y = _mm256_mul_ps(y, z); 221 | 222 | tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); 223 | y = _mm256_add_ps(y, tmp); 224 | 225 | 226 | tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); 227 | y = _mm256_sub_ps(y, tmp); 228 | 229 | tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); 230 | x = _mm256_add_ps(x, y); 231 | x = _mm256_add_ps(x, tmp); 232 | x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN 233 | return x; 234 | } 235 | 236 | _PS256_CONST(exp_hi, 88.3762626647949f); 237 | _PS256_CONST(exp_lo, -88.3762626647949f); 238 | 239 | _PS256_CONST(cephes_LOG2EF, 1.44269504088896341); 240 | _PS256_CONST(cephes_exp_C1, 0.693359375); 241 | _PS256_CONST(cephes_exp_C2, -2.12194440e-4); 242 | 243 | _PS256_CONST(cephes_exp_p0, 1.9875691500E-4); 244 | _PS256_CONST(cephes_exp_p1, 1.3981999507E-3); 245 | _PS256_CONST(cephes_exp_p2, 8.3334519073E-3); 246 | _PS256_CONST(cephes_exp_p3, 4.1665795894E-2); 247 | _PS256_CONST(cephes_exp_p4, 1.6666665459E-1); 248 | 
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); 249 | 250 | v8sf exp256_ps(v8sf x) { 251 | v8sf tmp = _mm256_setzero_ps(), fx; 252 | v8si imm0; 253 | v8sf one = *(v8sf*)_ps256_1; 254 | 255 | x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); 256 | x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); 257 | 258 | /* express exp(x) as exp(g + n*log(2)) */ 259 | fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); 260 | fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); 261 | 262 | /* how to perform a floorf with SSE: just below */ 263 | //imm0 = _mm256_cvttps_epi32(fx); 264 | //tmp = _mm256_cvtepi32_ps(imm0); 265 | 266 | tmp = _mm256_floor_ps(fx); 267 | 268 | /* if greater, subtract 1 */ 269 | //v8sf mask = _mm256_cmpgt_ps(tmp, fx); 270 | v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); 271 | mask = _mm256_and_ps(mask, one); 272 | fx = _mm256_sub_ps(tmp, mask); 273 | 274 | tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); 275 | v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); 276 | x = _mm256_sub_ps(x, tmp); 277 | x = _mm256_sub_ps(x, z); 278 | 279 | z = _mm256_mul_ps(x,x); 280 | 281 | v8sf y = *(v8sf*)_ps256_cephes_exp_p0; 282 | y = _mm256_mul_ps(y, x); 283 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); 284 | y = _mm256_mul_ps(y, x); 285 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); 286 | y = _mm256_mul_ps(y, x); 287 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); 288 | y = _mm256_mul_ps(y, x); 289 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); 290 | y = _mm256_mul_ps(y, x); 291 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); 292 | y = _mm256_mul_ps(y, z); 293 | y = _mm256_add_ps(y, x); 294 | y = _mm256_add_ps(y, one); 295 | 296 | /* build 2^n */ 297 | imm0 = _mm256_cvttps_epi32(fx); 298 | // another two AVX2 instructions 299 | imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); 300 | imm0 = avx2_mm256_slli_epi32(imm0, 23); 301 | v8sf pow2n = _mm256_castsi256_ps(imm0); 302 | y = _mm256_mul_ps(y, pow2n); 303 | return y; 304 | } 305 | 306 | v8sf tanh256_ps(v8sf x) { 307 | v8sf tmp = _mm256_setzero_ps(), fx; 308 | v8si imm0; 309 | v8sf one = *(v8sf*)_ps256_1; 310 | v8sf two={2,2,2,2,2,2,2,2}; 311 | x = _mm256_mul_ps(x, two); 312 | x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); 313 | x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); 314 | 315 | /* express exp(x) as exp(g + n*log(2)) */ 316 | fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); 317 | fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); 318 | 319 | /* how to perform a floorf with SSE: just below */ 320 | //imm0 = _mm256_cvttps_epi32(fx); 321 | //tmp = _mm256_cvtepi32_ps(imm0); 322 | 323 | tmp = _mm256_floor_ps(fx); 324 | 325 | /* if greater, subtract 1 */ 326 | //v8sf mask = _mm256_cmpgt_ps(tmp, fx); 327 | v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); 328 | mask = _mm256_and_ps(mask, one); 329 | fx = _mm256_sub_ps(tmp, mask); 330 | 331 | tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); 332 | v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); 333 | x = _mm256_sub_ps(x, tmp); 334 | x = _mm256_sub_ps(x, z); 335 | 336 | z = _mm256_mul_ps(x,x); 337 | 338 | v8sf y = *(v8sf*)_ps256_cephes_exp_p0; 339 | y = _mm256_mul_ps(y, x); 340 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); 341 | y = _mm256_mul_ps(y, x); 342 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); 343 | y = _mm256_mul_ps(y, x); 344 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); 345 | y = _mm256_mul_ps(y, x); 346 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); 347 | y = _mm256_mul_ps(y, x); 348 | y = _mm256_add_ps(y, 
*(v8sf*)_ps256_cephes_exp_p5); 349 | y = _mm256_mul_ps(y, z); 350 | y = _mm256_add_ps(y, x); 351 | y = _mm256_add_ps(y, one); 352 | 353 | /* build 2^n */ 354 | imm0 = _mm256_cvttps_epi32(fx); 355 | // another two AVX2 instructions 356 | imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); 357 | imm0 = avx2_mm256_slli_epi32(imm0, 23); 358 | v8sf pow2n = _mm256_castsi256_ps(imm0); 359 | y = _mm256_mul_ps(y, pow2n); 360 | return _mm256_div_ps(_mm256_sub_ps(y,one),_mm256_add_ps(y,one)); 361 | } 362 | 363 | _PS256_CONST(minus_cephes_DP1, -0.78515625); 364 | _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 365 | _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 366 | _PS256_CONST(sincof_p0, -1.9515295891E-4); 367 | _PS256_CONST(sincof_p1, 8.3321608736E-3); 368 | _PS256_CONST(sincof_p2, -1.6666654611E-1); 369 | _PS256_CONST(coscof_p0, 2.443315711809948E-005); 370 | _PS256_CONST(coscof_p1, -1.388731625493765E-003); 371 | _PS256_CONST(coscof_p2, 4.166664568298827E-002); 372 | _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI 373 | 374 | 375 | /* evaluation of 8 sines at once using AVX intrinsics 376 | 377 | The code is the exact rewriting of the cephes sinf function. 378 | Precision is excellent as long as x < 8192 (I did not bother to 379 | take into account the special handling they have for greater values 380 | -- it does not return garbage for arguments over 8192, though; 381 | the extra precision is missing). 382 | 383 | Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the 384 | surprising but correct result. 385 | 386 | */ 387 | v8sf sin256_ps(v8sf x) { // any x 388 | v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; 389 | v8si imm0, imm2; 390 | 391 | #ifndef __AVX2__ 392 | v4si imm0_1, imm0_2; 393 | v4si imm2_1, imm2_2; 394 | #endif 395 | 396 | sign_bit = x; 397 | /* take the absolute value */ 398 | x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); 399 | /* extract the sign bit (upper one) */ 400 | sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); 401 | 402 | /* scale by 4/Pi */ 403 | y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); 404 | 405 | /* 406 | Here we start a series of integer operations, which are in the 407 | realm of AVX2. 
408 | If we don't have AVX2, let's perform them using SSE2 directives 409 | */ 410 | 411 | #ifdef __AVX2__ 412 | /* store the integer part of y in mm0 */ 413 | imm2 = _mm256_cvttps_epi32(y); 414 | /* j=(j+1) & (~1) (see the cephes sources) */ 415 | // another two AVX2 instructions 416 | imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); 417 | imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); 418 | y = _mm256_cvtepi32_ps(imm2); 419 | 420 | /* get the swap sign flag */ 421 | imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4); 422 | imm0 = avx2_mm256_slli_epi32(imm0, 29); 423 | /* get the polynomial selection mask 424 | there is one polynomial for 0 <= x <= Pi/4 425 | and another one for Pi/4 < x <= Pi/2 -------------------------------------------------------------------------------- /sgemm256.hpp: -------------------------------------------------------------------------------- 1 | #include <immintrin.h> 2 | #include <stdint.h> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | #define KERNEL_k1m16n1 \ 6 | "vmovups (%0),%%zmm4; addq $64,%0;"\ 7 | "vbroadcastss (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8;"\ 8 | "addq $4,%1;" 9 | #define KERNEL_h_k1m16n2 \ 10 | "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\ 11 | "vbroadcastsd (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8; vfmadd231ps %%zmm5,%%zmm6,%%zmm9;" 12 | #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;" 13 | #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;" 14 | #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" 15 | #define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \ 16 | "vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ 17 | "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" 18 | #define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) 19 | #define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;" 20 | #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) 21 | #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%1;" 22 | #define KERNEL_h_k1m16n16 KERNEL_k1m16n12 unit_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15) 23 | #define KERNEL_k1m16n16 KERNEL_h_k1m16n16 "addq $16,%%r15;" 24 | #define KERNEL_h_k1m16n20 KERNEL_h_k1m16n16 unit_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1) 25 | #define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;" 26 | #define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) 27 | #define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;" 28 | #define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" 29 | #define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;" 30 | #define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" 31 | #define unit_init_m16n4(c1,c2,c3,c4) \ 32 | "vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";vpxorq "#c3","#c3","#c3";vpxorq "#c4","#c4","#c4";" 33 | #define INIT_m16n8 INIT_m16n4 unit_init_m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15) 34 | #define INIT_m16n12 INIT_m16n8 unit_init_m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19) 35 | #define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23) 36 | #define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27) 37 | #define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31) 38 | #define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" 39 | #define unit_save_m16n2(c1,c2) \ 40 | "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd 
%%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ 41 | "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ 42 | "prefetcht1 127(%5); prefetcht1 127(%5,%3,1);"\ 43 | "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" 44 | #define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) 45 | #define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) 46 | #define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15) 47 | #define SAVE_h_m16n12 SAVE_h_m16n8 unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19) 48 | #define SAVE_h_m16n16 SAVE_h_m16n12 unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23) 49 | #define SAVE_h_m16n20 SAVE_h_m16n16 unit_save_m16n2(%%zmm24,%%zmm25) unit_save_m16n2(%%zmm26,%%zmm27) 50 | #define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) 51 | #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" 52 | #define COMPUTE_m16(ndim) \ 53 | INIT_m16n##ndim\ 54 | "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ 55 | "cmpq $4,%4; jb "#ndim"016162f;"\ 56 | #ndim"016161:\n\t"\ 57 | KERNEL_k1m16n##ndim\ 58 | KERNEL_k1m16n##ndim\ 59 | KERNEL_k1m16n##ndim\ 60 | KERNEL_k1m16n##ndim\ 61 | "subq $4,%4; cmpq $4,%4; jnb "#ndim"016161b;"\ 62 | #ndim"016162:\n\t"\ 63 | "testq %4,%4; jz "#ndim"016163f;"\ 64 | KERNEL_k1m16n##ndim\ 65 | "decq %4; jmp "#ndim"016162b;"\ 66 | #ndim"016163:\n\t"\ 67 | SAVE_m16(ndim) 68 | #define KERNEL_k1m8n1(b_addr) \ 69 | "vmovups (%0),%%ymm1; addq $32,%0;"\ 70 | "vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ 71 | "addq $4,"#b_addr";" 72 | #define KERNEL_h_k1m8n2(b_addr) \ 73 | "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ 74 | "vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" 75 | #define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";" 76 | #define KERNEL_h_k1m8n4(b_addr) \ 77 | KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" 78 | #define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";" 79 | #define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
\ 80 | "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ 81 | "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" 82 | #define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) 83 | #define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";" 84 | #define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) 85 | #define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";" 86 | #define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" 87 | #define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" 88 | #define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" 89 | #define unit_init_m8n4(c1,c2,c3,c4) \ 90 | "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" 91 | #define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) 92 | #define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) 93 | #define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" 94 | #define unit_save_m8n2(c1,c2) \ 95 | "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ 96 | "vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\ 97 | "vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\ 98 | "leaq (%5,%3,2),%5;" 99 | #define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) 100 | #define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) 101 | #define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) 102 | #define SAVE_L_m8n12 SAVE_L_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) 103 | #define SAVE_R_m8n4 unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) 104 | #define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) 105 | #define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) 106 | #define COMPUTE_L_m8(ndim,sim) \ 107 | INIT_m8n##ndim\ 108 | "movq %%r13,%4; movq %%r14,%1;"\ 109 | #ndim""#sim"882:\n\t"\ 110 | "testq %4,%4; jz "#ndim""#sim"883f;"\ 111 | KERNEL_k1m8n##ndim(%1)\ 112 | "decq %4; jmp "#ndim""#sim"882b;"\ 113 | #ndim""#sim"883:\n\t"\ 114 | SAVE_L_m8n##ndim "addq $32,%2;" 115 | #define COMPUTE_R_m8(ndim,sim) \ 116 | "subq %%r12,%0; subq %%r12,%0;"\ 117 | INIT_m8n##ndim\ 118 | "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ 119 | #ndim""#sim"882:\n\t"\ 120 | "testq %4,%4; jz "#ndim""#sim"883f;"\ 121 | KERNEL_k1m8n##ndim(%%r15)\ 122 | "decq %4; jmp "#ndim""#sim"882b;"\ 123 | #ndim""#sim"883:\n\t"\ 124 | SAVE_R_m8n##ndim 125 | #define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) 126 | #define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) 127 | #define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) 128 | #define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) 129 | #define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) 130 | #define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933) 131 | #define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933) 132 | #define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933) 133 | #define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim 134 | #define KERNEL_k1m4n1(b_addr) \ 135 | "vmovups (%0),%%xmm1; addq $16,%0;"\ 136 | "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ 
137 | "addq $4,"#b_addr";" 138 | #define KERNEL_h_k1m4n2(b_addr) \ 139 | "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ 140 | "vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" 141 | #define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";" 142 | #define KERNEL_h_k1m4n4(b_addr) \ 143 | KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" 144 | #define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";" 145 | #define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ 146 | "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ 147 | "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" 148 | #define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) 149 | #define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";" 150 | #define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) 151 | #define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";" 152 | #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 153 | #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" 154 | #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" 155 | #define unit_init_m4n4(c1,c2,c3,c4) \ 156 | "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" 157 | #define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) 158 | #define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) 159 | #define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" 160 | #define unit_save_m4n2(c1,c2) \ 161 | "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ 162 | "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ 163 | "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ 164 | "leaq (%5,%3,2),%5;" 165 | #define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) 166 | #define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) 167 | #define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) 168 | #define SAVE_L_m4n12 SAVE_L_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) 169 | #define SAVE_R_m4n4 unit_save_m4n2(%%xmm4,%%xmm5) unit_save_m4n2(%%xmm6,%%xmm7) 170 | #define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) 171 | #define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) 172 | #define COMPUTE_L_m4(ndim,sim) \ 173 | INIT_m4n##ndim\ 174 | "movq %%r13,%4; movq %%r14,%1;"\ 175 | #ndim""#sim"442:\n\t"\ 176 | "testq %4,%4; jz "#ndim""#sim"443f;"\ 177 | KERNEL_k1m4n##ndim(%1)\ 178 | "decq %4; jmp "#ndim""#sim"442b;"\ 179 | #ndim""#sim"443:\n\t"\ 180 | SAVE_L_m4n##ndim "addq $16,%2;" 181 | #define COMPUTE_R_m4(ndim,sim) \ 182 | "subq %%r12,%0;"\ 183 | INIT_m4n##ndim\ 184 | "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ 185 | #ndim""#sim"442:\n\t"\ 186 | "testq %4,%4; jz "#ndim""#sim"443f;"\ 187 | KERNEL_k1m4n##ndim(%%r15)\ 188 | "decq %4; jmp "#ndim""#sim"442b;"\ 189 | #ndim""#sim"443:\n\t"\ 190 | SAVE_R_m4n##ndim 191 | #define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) 192 | 
#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) 193 | #define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) 194 | #define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) 195 | #define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) 196 | #define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955) 197 | #define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955) 198 | #define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955) 199 | #define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim 200 | #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 201 | #define KERNEL_k1m2n1(b_addr) \ 202 | "vmovsd (%0),%%xmm1; addq $8,%0;"\ 203 | "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ 204 | "addq $4,"#b_addr";" 205 | #define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" 206 | #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" 207 | #define KERNEL_k1m2n2(b_addr) \ 208 | "vmovsd (%0),%%xmm1; addq $8,%0;"\ 209 | "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ 210 | "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ 211 | "addq $8,"#b_addr";" 212 | #define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" 213 | #define INIT_m2n4 INIT_m2n2 214 | #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" 215 | #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" 216 | #define KERNEL_k1m2n4(b_addr) \ 217 | "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ 218 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ 219 | "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ 220 | "addq $8,%0;" 221 | #define KERNEL_k1m2n8(b_addr) \ 222 | "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ 223 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ 224 | "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ 225 | "addq $8,%0;" 226 | #define KERNEL_k1m2n12(b_addr) \ 227 | "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ 228 | "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ 229 | "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ 230 | "addq $8,%0;" 231 | #define unit_save_m2n4(c1,c2) \ 232 | "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ 233 | "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ 234 | "leaq (%5,%3,2),%5;"\ 235 | "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ 236 | "leaq (%5,%3,2),%5;" 237 | #define SAVE_L_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) 238 | #define SAVE_L_m2n8 SAVE_L_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) 239 | #define SAVE_L_m2n12 SAVE_L_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) 240 | #define SAVE_R_m2n4 unit_save_m2n4(%%xmm4,%%xmm5) 241 | #define SAVE_R_m2n8 SAVE_R_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) 242 | #define SAVE_R_m2n12 SAVE_R_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) 243 | #define COMPUTE_L_m2(ndim,sim) \ 244 | INIT_m2n##ndim\ 245 | "movq %%r13,%4; movq %%r14,%1;"\ 246 | #ndim""#sim"222:\n\t"\ 247 | "testq %4,%4; jz 
"#ndim""#sim"223f;"\ 248 | KERNEL_k1m2n##ndim(%1)\ 249 | "decq %4; jmp "#ndim""#sim"222b;"\ 250 | #ndim""#sim"223:\n\t"\ 251 | SAVE_L_m2n##ndim "addq $8,%2;" 252 | #define COMPUTE_R_m2(ndim,sim) \ 253 | "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\ 254 | INIT_m2n##ndim\ 255 | "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ 256 | #ndim""#sim"222:\n\t"\ 257 | "testq %4,%4; jz "#ndim""#sim"223f;"\ 258 | KERNEL_k1m2n##ndim(%%r15)\ 259 | "decq %4; jmp "#ndim""#sim"222b;"\ 260 | #ndim""#sim"223:\n\t"\ 261 | SAVE_R_m2n##ndim 262 | #define COMPUTE_m2_n1 COMPUTE_L_m2(1,77877) 263 | #define COMPUTE_m2_n2 COMPUTE_L_m2(2,77877) 264 | #define COMPUTE_m2_n4 COMPUTE_L_m2(4,77877) 265 | #define COMPUTE_m2_n8 COMPUTE_L_m2(8,77877) 266 | #define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877) 267 | #define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977) 268 | #define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977) 269 | #define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977) 270 | #define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim 271 | #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 272 | #define KERNEL_k1m1n1(b_addr) \ 273 | "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\ 274 | "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ 275 | "addq $4,%0;" 276 | #define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" 277 | #define INIT_m1n2 INIT_m1n1 278 | #define KERNEL_k1m1n2(b_addr) \ 279 | "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\ 280 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ 281 | "addq $4,%0;" 282 | #define SAVE_L_m1n2 \ 283 | "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ 284 | "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" 285 | #define INIT_m1n4 INIT_m1n2 286 | #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" 287 | #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" 288 | #define KERNEL_k1m1n4(b_addr) \ 289 | "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ 290 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ 291 | "addq $4,%0;" 292 | #define KERNEL_k1m1n8(b_addr) \ 293 | "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ 294 | "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ 295 | "addq $4,%0;" 296 | #define KERNEL_k1m1n12(b_addr) \ 297 | "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ 298 | "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ 299 | "addq $4,%0;" 300 | #define unit_save_m1n4(c1) \ 301 | "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ 302 | "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ 303 | "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ 304 | "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ 305 | "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" 306 | #define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) 307 | #define SAVE_L_m1n8 SAVE_L_m1n4 unit_save_m1n4(%%xmm5) 308 | #define SAVE_L_m1n12 SAVE_L_m1n8 unit_save_m1n4(%%xmm6) 309 | #define SAVE_R_m1n4 unit_save_m1n4(%%xmm4) 310 | #define SAVE_R_m1n8 SAVE_R_m1n4 unit_save_m1n4(%%xmm5) 311 | #define SAVE_R_m1n12 SAVE_R_m1n8 
unit_save_m1n4(%%xmm6) 312 | #define COMPUTE_L_m1(ndim,sim) \ 313 | INIT_m1n##ndim\ 314 | "movq %%r13,%4; movq %%r14,%1;"\ 315 | #ndim""#sim"112:\n\t"\ 316 | "testq %4,%4; jz "#ndim""#sim"113f;"\ 317 | KERNEL_k1m1n##ndim(%1)\ 318 | "decq %4; jmp "#ndim""#sim"112b;"\ 319 | #ndim""#sim"113:\n\t"\ 320 | SAVE_L_m1n##ndim "addq $4,%2;" 321 | #define COMPUTE_R_m1(ndim,sim) \ 322 | "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\ 323 | INIT_m1n##ndim\ 324 | "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ 325 | #ndim""#sim"112:\n\t"\ 326 | "testq %4,%4; jz "#ndim""#sim"113f;"\ 327 | KERNEL_k1m1n##ndim(%%r15)\ 328 | "decq %4; jmp "#ndim""#sim"112b;"\ 329 | #ndim""#sim"113:\n\t"\ 330 | SAVE_R_m1n##ndim 331 | #define COMPUTE_m1_n1 COMPUTE_L_m1(1,99899) 332 | #define COMPUTE_m1_n2 COMPUTE_L_m1(2,99899) 333 | #define COMPUTE_m1_n4 COMPUTE_L_m1(4,99899) 334 | #define COMPUTE_m1_n8 COMPUTE_L_m1(8,99899) 335 | #define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899) 336 | #define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999) 337 | #define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999) 338 | #define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999) 339 | #define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim 340 | #define COMPUTE(ndim) {\ 341 | __asm__ __volatile__(\ 342 | "vbroadcastss (%6),%%zmm0;"\ 343 | "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ 344 | "cmpq $16,%7;jb 33101"#ndim"f;"\ 345 | "33109"#ndim":\n\t"\ 346 | COMPUTE_m16(ndim)\ 347 | "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\ 348 | "33101"#ndim":\n\t"\ 349 | "cmpq $8,%7;jb 33102"#ndim"f;"\ 350 | COMPUTE_m8(ndim)\ 351 | "subq $8,%7;"\ 352 | "33102"#ndim":\n\t"\ 353 | "cmpq $4,%7;jb 33103"#ndim"f;"\ 354 | COMPUTE_m4(ndim)\ 355 | "subq $4,%7;"\ 356 | "33103"#ndim":\n\t"\ 357 | "cmpq $2,%7;jb 33104"#ndim"f;"\ 358 | COMPUTE_m2(ndim)\ 359 | "subq $2,%7;"\ 360 | "33104"#ndim":\n\t"\ 361 | "testq %7,%7;jz 33105"#ndim"f;"\ 362 | COMPUTE_m1(ndim)\ 363 | "33105"#ndim":\n\t"\ 364 | "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ 365 | :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\ 366 | ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ 367 | "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ 368 | "cc","memory");\ 369 | a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\ 370 | } 371 | static inline void CNAME(unsigned m, unsigned n, unsigned k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, unsigned LDC){ 372 | int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; 373 | int64_t M = (int64_t)m, K = (int64_t)k; 374 | unsigned n_count = n; 375 | float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA; 376 | for(;n_count>23;n_count-=24) COMPUTE(24) 377 | for(;n_count>19;n_count-=20) COMPUTE(20) 378 | for(;n_count>15;n_count-=16) COMPUTE(16) 379 | for(;n_count>11;n_count-=12) COMPUTE(12) 380 | for(;n_count>7;n_count-=8) COMPUTE(8) 381 | for(;n_count>3;n_count-=4) COMPUTE(4) 382 | for(;n_count>1;n_count-=2) COMPUTE(2) 383 | if(n_count>0) COMPUTE(1) 384 | } 385 | 386 | static inline void sgemm_tcopy_16(float *src, float *dst, unsigned lead_dim, unsigned dim_first, unsigned dim_second){ 387 | if(dim_first==0 || dim_second==0) return; 388 | unsigned 
count_first,count_second; 389 | float *tosrc,*todst; 390 | for(count_second=0;count_second15;count_first-=16){ 394 | _mm512_storeu_ps(todst,_mm512_loadu_ps(tosrc)); 395 | tosrc+=16;todst+=16*dim_second; 396 | } 397 | todst -= count_second * 8; 398 | for(;count_first>7;count_first-=8){ 399 | _mm256_storeu_ps(todst,_mm256_loadu_ps(tosrc)); 400 | tosrc+=8;todst+=8*dim_second; 401 | } 402 | todst -= count_second * 4; 403 | for(;count_first>3;count_first-=4){ 404 | _mm_storeu_ps(todst,_mm_loadu_ps(tosrc)); 405 | tosrc+=4;todst+=4*dim_second; 406 | } 407 | todst -= count_second * 2; 408 | for(;count_first>1;count_first-=2){ 409 | *todst=*tosrc;todst[1]=tosrc[1]; 410 | tosrc+=2;todst+=2*dim_second; 411 | } 412 | todst -= count_second; 413 | if(count_first>0) *todst=*tosrc; 414 | } 415 | } 416 | 417 | static inline void sgemm_ncopy_16(float *src, float *dst, unsigned lead_dim, unsigned dim_first, unsigned dim_second){ 418 | if(dim_first==0 || dim_second==0) return; 419 | unsigned count_first,count_second,tosrc_inc; 420 | float *tosrc1,*tosrc2,*tosrc3,*tosrc4,*tosrc5,*tosrc6,*tosrc7,*tosrc8,*tosrc9,*tosrc10,*tosrc11,*tosrc12,*tosrc13,*tosrc14,*tosrc15,*tosrc16; 421 | float *todst=dst; 422 | tosrc1=src;tosrc2=tosrc1+lead_dim;tosrc3=tosrc2+lead_dim;tosrc4=tosrc3+lead_dim; 423 | tosrc5=tosrc4+lead_dim;tosrc6=tosrc5+lead_dim;tosrc7=tosrc6+lead_dim;tosrc8=tosrc7+lead_dim; 424 | tosrc9=tosrc8+lead_dim;tosrc10=tosrc9+lead_dim;tosrc11=tosrc10+lead_dim;tosrc12=tosrc11+lead_dim; 425 | tosrc13=tosrc12+lead_dim;tosrc14=tosrc13+lead_dim;tosrc15=tosrc14+lead_dim;tosrc16=tosrc15+lead_dim; 426 | tosrc_inc=16*lead_dim-dim_first; 427 | for(count_second=dim_second;count_second>15;count_second-=16){ 428 | for(count_first=0;count_first7;count_second-=8){ 446 | for(count_first=0;count_first3;count_second-=4){ 458 | for(count_first=0;count_first1;count_second-=2){ 467 | for(count_first=0;count_first0){ 474 | for(count_first=0;count_first3;count_first-=4){ 488 | _mm_storeu_ps(todst,_mm_loadu_ps(tosrc)); 489 | tosrc+=4;todst+=4*dim_second; 490 | } 491 | todst -= count_second * 2; 492 | for(;count_first>1;count_first-=2){ 493 | *todst=*tosrc;todst[1]=tosrc[1]; 494 | tosrc+=2;todst+=2*dim_second; 495 | } 496 | todst -= count_second; 497 | if(count_first>0) *todst=*tosrc; 498 | } 499 | } 500 | static inline void sgemm_ncopy_4(float *src, float *dst, unsigned lead_dim, unsigned dim_first, unsigned dim_second){ 501 | if(dim_first==0 || dim_second==0) return; 502 | unsigned count_first,count_second,tosrc_inc; 503 | float *tosrc1,*tosrc2,*tosrc3,*tosrc4; 504 | float *todst=dst; 505 | tosrc1=src;tosrc2=tosrc1+lead_dim;tosrc3=tosrc2+lead_dim;tosrc4=tosrc3+lead_dim; 506 | tosrc_inc=4*lead_dim-dim_first; 507 | for(count_second=dim_second;count_second>3;count_second-=4){ 508 | for(count_first=0;count_first1;count_second-=2){ 517 | for(count_first=0;count_first0){ 524 | for(count_first=0;count_first 531 | void inline sgemm(float alpha,float *a,float *b,float *c){ 532 | const unsigned BLOCKDIM=256; 533 | float *b_buffer = (float *)aligned_alloc(64,BLOCKDIM*n*sizeof(float)); 534 | float *a_buffer = (float *)aligned_alloc(4096,BLOCKDIM*BLOCKDIM*sizeof(float)); 535 | float *a_current_pos,*b_current_pos=b; 536 | unsigned m_count,k_count,k_subdim,m_subdim; 537 | if(beta==0) memset(c,0,m*n*sizeof(float)); 538 | for(k_count=0;k_count BLOCKDIM) k_subdim = BLOCKDIM; 541 | if(!transb) { sgemm_ncopy_4(b_current_pos,b_buffer,ldb,k_subdim,n); b_current_pos += BLOCKDIM; } 542 | else { sgemm_tcopy_4(b_current_pos,b_buffer,ldb,n,k_subdim); 
b_current_pos += (int64_t)ldb * BLOCKDIM; } 543 | if(!transa) a_current_pos = a + (int64_t)k_count * (int64_t)lda; 544 | else a_current_pos = a + k_count; 545 | for(m_count=0;m_count<m;m_count+=BLOCKDIM){ 546 | m_subdim = m - m_count; 547 | if(m_subdim > BLOCKDIM) m_subdim = BLOCKDIM; 548 | if(!transa) { sgemm_tcopy_16(a_current_pos,a_buffer,lda,m_subdim,k_subdim); a_current_pos += BLOCKDIM; } 549 | else { sgemm_ncopy_16(a_current_pos,a_buffer,lda,k_subdim,m_subdim); a_current_pos += (int64_t)lda * BLOCKDIM; } 550 | CNAME(m_subdim,n,k_subdim,alpha,a_buffer,b_buffer,c+m_count,ldc); 551 | } 552 | } 553 | free(a_buffer); free(b_buffer); 554 | } 555 | --------------------------------------------------------------------------------
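For reference, the blocked driver above boils down to: pack one panel of B with sgemm_ncopy_4 (or sgemm_tcopy_4 when B is transposed), pack one panel of A with sgemm_tcopy_16, and let the CNAME micro-kernel accumulate alpha*A*B into C. The sketch below is a minimal single-tile usage example and is not part of sgemm256.hpp: the helper name sgemm_single_block is made up, and it assumes column-major A (m x k), B (k x n) and C (m x n) with m <= 256 and k <= 256, so one packed block of A and of B covers the whole product, exactly like a single (k_count, m_count) iteration of the driver.

// Hypothetical helper; assumes sgemm256.hpp has been included.
// Computes C += alpha * A * B for column-major A (m x k), B (k x n), C (m x n),
// with m <= 256 and k <= 256 so a single packed tile of A and of B is enough.
static void sgemm_single_block(unsigned m, unsigned n, unsigned k,
                               float alpha, float *A, float *B, float *C){
    float *a_buf = (float*)aligned_alloc(4096, 256*256*sizeof(float));     // packed A tile
    float *b_buf = (float*)aligned_alloc(64, 256*(size_t)n*sizeof(float)); // packed B panels
    sgemm_ncopy_4(B, b_buf, k, k, n);           // pack B (ldb = k) into 4-column panels
    sgemm_tcopy_16(A, a_buf, m, m, k);          // pack A (lda = m) into 16-row panels
    CNAME(m, n, k, alpha, a_buf, b_buf, C, m);  // micro-kernel: C += alpha * A * B, ldc = m
    free(a_buf); free(b_buf);
}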