├── README.md └── mm.c /README.md: -------------------------------------------------------------------------------- 1 | # samplematrixcode 2 | Sample matrix multiply code to show affect of blocking and data alignment 3 | The code mm.c accompanies two papers at software.intel.com that discuss memory layout and performance. 4 | A simple matrix multiply is reordered and blocked to show performance improvement 5 | An exercise is included to show the impact on performance when matrices are not aligned on cacheline boundaries. 6 | -------------------------------------------------------------------------------- /mm.c: -------------------------------------------------------------------------------- 1 | // A simple matrix multiply code to show affect of ordering and blocking 2 | // to compile this use gcc -O2 mm.c -lrt or icc -O2 mm.c -lrt 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MATSIZE 8000 10 | #define BLOCKSIZE 8 11 | 12 | void setmat() ; 13 | void fillmat() ; 14 | void abasicmm() ; 15 | void abettermm() ; 16 | void ablockmm() ; 17 | void checkmatmult() ; 18 | 19 | int main(int argc, char *argv[]) 20 | { 21 | 22 | double *a, *b, *c, *aa ; 23 | unsigned int n ; 24 | unsigned i, j, k, iInner, jInner, kInner, blockSize ; 25 | struct timespec ts1, ts2, ts3, ts4, ts5, ts6, ts7 ; 26 | 27 | printf("hello code beginning\n") ; 28 | n = MATSIZE ; // default settings 29 | blockSize = BLOCKSIZE ; 30 | if (argc != 3) 31 | { 32 | printf("input matrix size and blocksize\n") ; 33 | exit(0); 34 | } 35 | n = atoi(argv[1]) ; 36 | blockSize = atoi(argv[2]) ; 37 | printf("matrix size %d blocksize %d\n", n,blockSize) ; 38 | if (n%blockSize) 39 | { 40 | printf("for this simple example matrix size must be a multiple of the block size.\n Please re-start \n") ; 41 | exit(0); 42 | } 43 | // allocate matrices 44 | a = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; 45 | b = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; 46 | c = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; 47 | aa = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; 48 | if (aa == NULL) // cheap check only the last allocation checked. 49 | { 50 | printf("insufficient memory \n") ; 51 | exit(0) ; 52 | } 53 | 54 | // fill matrices 55 | 56 | setmat(n, n, a) ; 57 | setmat(n, n, aa) ; 58 | 59 | 60 | srand(4.16) ; // set random seed (change to go off time stamp to make it better 61 | 62 | fillmat(n,n,b) ; 63 | fillmat(n,n,c) ; 64 | 65 | clock_gettime(CLOCK_REALTIME, &ts1) ; 66 | // multiply matrices 67 | abasicmm (n,n,a,b,c) ; 68 | 69 | clock_gettime(CLOCK_REALTIME, &ts2) ; 70 | 71 | setmat(n, n, a) ; 72 | 73 | clock_gettime(CLOCK_REALTIME, &ts3) ; 74 | 75 | abettermm (n,n,a,b,c) ; 76 | 77 | clock_gettime(CLOCK_REALTIME, &ts4) ; 78 | 79 | ablockmm (n, n, aa, b, c, blockSize) ; 80 | 81 | clock_gettime(CLOCK_REALTIME, &ts5) ; 82 | 83 | printf("matrix multplies complete \n") ; fflush(stdout) ; 84 | 85 | /**/ 86 | checkmatmult(n,n,a,aa) ; 87 | 88 | { 89 | double t1, t2, t3, tmp ; 90 | t1 = ts2.tv_sec-ts1.tv_sec; 91 | tmp = ts2.tv_nsec-ts1.tv_nsec; 92 | tmp /= 1.0e+09 ; 93 | t1 += tmp ; 94 | printf("ijk ordering basic time %lf\n",t1) ; 95 | t2 = ts4.tv_sec-ts3.tv_sec; 96 | tmp = ts4.tv_nsec-ts3.tv_nsec; 97 | tmp /= 1.0e+09 ; 98 | t2 += tmp ; 99 | printf("ikj ordering bette time %lf\n",t2) ; 100 | t3 = ts5.tv_sec-ts4.tv_sec; 101 | tmp = ts5.tv_nsec-ts4.tv_nsec; 102 | tmp /= 1.0e+09 ; 103 | t3 += tmp ; 104 | printf("ikj blocked time %lf\n",t3) ; 105 | 106 | } 107 | 108 | } 109 | 110 | void setmat(int n, int m, double a[n][m]) 111 | { 112 | int i, j ; 113 | 114 | for (i=0;i