├── README.md
├── figures
    ├── compare_MMult-1x4-3_MMult-1x4-4.png
    ├── compare_MMult-1x4-3_MMult-1x4-5.png
    ├── compare_MMult-1x4-3_MMult-4x4-3.png
    ├── compare_MMult-1x4-4_MMult-1x4-5.png
    ├── compare_MMult-1x4-4_MMult-4x4-4.png
    ├── compare_MMult-1x4-5_MMult-1x4-6.png
    ├── compare_MMult-1x4-5_MMult-4x4-5.png
    ├── compare_MMult-1x4-6_MMult-1x4-7.png
    ├── compare_MMult-1x4-6_MMult-4x4-6.png
    ├── compare_MMult-1x4-7_MMult-1x4-8.png
    ├── compare_MMult-1x4-7_MMult-4x4-7.png
    ├── compare_MMult-1x4-8_MMult-1x4-9.png
    ├── compare_MMult-1x4-9_MMult-4x4-10.png
    ├── compare_MMult-4x4-10_MMult-4x4-11.png
    ├── compare_MMult-4x4-11_MMult-4x4-12.png
    ├── compare_MMult-4x4-11_MMult-4x4-13.png
    ├── compare_MMult-4x4-12_MMult-4x4-13.png
    ├── compare_MMult-4x4-13_MMult-4x4-14.png
    ├── compare_MMult-4x4-13_MMult-4x4-15.png
    ├── compare_MMult-4x4-13_MMult_4x4_15.png
    ├── compare_MMult-4x4-14_MMult-4x4-15.png
    ├── compare_MMult-4x4-3_MMult-4x4-4.png
    ├── compare_MMult-4x4-4_MMult-4x4-5.png
    ├── compare_MMult-4x4-5_MMult-4x4-6.png
    ├── compare_MMult-4x4-6_MMult-4x4-7.png
    ├── compare_MMult-4x4-7_MMult-4x4-8.png
    ├── compare_MMult-4x4-8_MMult-4x4-9.png
    ├── compare_MMult-4x4-9_MMult-4x4-10.png
    ├── compare_MMult0_MMult-1x4-5.png
    ├── compare_MMult0_MMult-1x4-9.png
    ├── compare_MMult0_MMult-4x4-10.png
    ├── compare_MMult0_MMult-4x4-11.png
    ├── compare_MMult0_MMult-4x4-13.png
    ├── compare_MMult0_MMult-4x4-15.png
    ├── compare_MMult0_MMult-4x4-5.png
    ├── compare_MMult0_MMult0.png
    ├── compare_MMult0_MMult1.png
    ├── compare_MMult0_MMult2.png
    ├── compare_MMult0_MMult_4x4_15.png
    ├── compare_MMult0_vs_MMult0.png
    ├── compare_MMult1_MMult2.png
    ├── compare_MMult2_MMult-1x4-3.png
    ├── compare_MMult2_MMult-4x4-3.png
    ├── graph_10_vs_11.png
    ├── graph_1_vs_2.png
    ├── graph_2_vs_3.png
    ├── graph_3_vs_4.png
    ├── graph_4_vs_5.png
    ├── graph_5_vs_6.png
    ├── graph_6_vs_7.png
    ├── graph_7_vs_8.png
    ├── graph_7_vs_9.png
    ├── graph_8_vs_10.png
    └── graph_8_vs_9.png
└── src
    ├── HowToOptimizeGemm.tar.gz
    ├── HowToOptimizeGemm
        ├── MMult0.c
        ├── MMult1.c
        ├── PlotAll.m
        ├── PlotAll.py
        ├── REF_MMult.c
        ├── compare_matrices.c
        ├── copy_matrix.c
        ├── dclock.c
        ├── makefile
        ├── parameters.h
        ├── print_matrix.c
        ├── proc_parameters.m
        ├── random_matrix.c
        └── test_MMult.c
    ├── MMult1.c
    ├── MMult2.c
    ├── MMult_1x4_3.c
    ├── MMult_1x4_4.c
    ├── MMult_1x4_5.c
    ├── MMult_1x4_6.c
    ├── MMult_1x4_7.c
    ├── MMult_1x4_8.c
    ├── MMult_1x4_9.c
    ├── MMult_4x4_10.c
    ├── MMult_4x4_11.c
    ├── MMult_4x4_12.c
    ├── MMult_4x4_13.c
    ├── MMult_4x4_14.c
    ├── MMult_4x4_15.c
    ├── MMult_4x4_3.c
    ├── MMult_4x4_4.c
    ├── MMult_4x4_5.c
    ├── MMult_4x4_6.c
    ├── MMult_4x4_7.c
    ├── MMult_4x4_8.c
    └── MMult_4x4_9.c


/README.md:
--------------------------------------------------------------------------------
 1 | # How To Optimize Gemm wiki pages
 2 | https://github.com/flame/how-to-optimize-gemm/wiki
 3 | 
 4 | Copyright by Prof. Robert van de Geijn (rvdg@cs.utexas.edu).
 5 | 
 6 | Adapted to Github Markdown Wiki by Jianyu Huang (jianyu@cs.utexas.edu).
 7 | 
 8 | # Table of contents
 9 | 
10 |   * [The GotoBLAS/BLIS Approach to Optimizing Matrix-Matrix Multiplication - Step-by-Step](../../wiki#the-gotoblasblis-approach-to-optimizing-matrix-matrix-multiplication---step-by-step)
11 |   * [NOTICE ON ACADEMIC HONESTY](../../wiki#notice-on-academic-honesty)
12 |   * [References](../../wiki#references)
13 |   * [Set Up](../../wiki#set-up)
14 |   * [Step-by-step optimizations](../../wiki#step-by-step-optimizations)
15 |   * [Computing four elements of C at a time](../../wiki#computing-four-elements-of-c-at-a-time)
16 |     * [Hiding computation in a subroutine](../../wiki#hiding-computation-in-a-subroutine)
17 |     * [Computing four elements at a time](../../wiki#computing-four-elements-at-a-time)
18 |     * [Further optimizing](../../wiki#further-optimizing)
19 |   * [Computing a 4 x 4 block of C at a time](../../wiki#computing-a-4-x-4-block-of-c-at-a-time)
20 |     * [Repeating the same optimizations](../../wiki#repeating-the-same-optimizations)
21 |     * [Further optimizing](../../wiki#further-optimizing-1)
22 |     * [Blocking to maintain performance](../../wiki#blocking-to-maintain-performance)
23 |     * [Packing into contiguous memory](../../wiki#packing-into-contiguous-memory)
24 |   * [Acknowledgement](../../wiki#acknowledgement)
25 | 
26 | # Related Links
27 | * [BLISlab: A Sandbox for Optimizing GEMM](https://github.com/flame/blislab)
28 | * [GEMM: From Pure C to SSE Optimized Micro Kernels](http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/)
29 | 
30 | # Acknowledgement
31 | This material was partially sponsored by grants from the National Science Foundation (Awards ACI-1148125/1340293 and ACI-1550493).
32 | 
33 | _Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation (NSF)._
34 | 


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-3_MMult-1x4-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-3_MMult-1x4-4.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-3_MMult-1x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-3_MMult-1x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-3_MMult-4x4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-3_MMult-4x4-3.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-4_MMult-1x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-4_MMult-1x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-4_MMult-4x4-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-4_MMult-4x4-4.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-5_MMult-1x4-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-5_MMult-1x4-6.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-5_MMult-4x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-5_MMult-4x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-6_MMult-1x4-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-6_MMult-1x4-7.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-6_MMult-4x4-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-6_MMult-4x4-6.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-7_MMult-1x4-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-7_MMult-1x4-8.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-7_MMult-4x4-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-7_MMult-4x4-7.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-8_MMult-1x4-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-8_MMult-1x4-9.png


--------------------------------------------------------------------------------
/figures/compare_MMult-1x4-9_MMult-4x4-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-1x4-9_MMult-4x4-10.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-10_MMult-4x4-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-10_MMult-4x4-11.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-11_MMult-4x4-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-11_MMult-4x4-12.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-11_MMult-4x4-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-11_MMult-4x4-13.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-12_MMult-4x4-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-12_MMult-4x4-13.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-13_MMult-4x4-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-13_MMult-4x4-14.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-13_MMult-4x4-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-13_MMult-4x4-15.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-13_MMult_4x4_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-13_MMult_4x4_15.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-14_MMult-4x4-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-14_MMult-4x4-15.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-3_MMult-4x4-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-3_MMult-4x4-4.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-4_MMult-4x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-4_MMult-4x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-5_MMult-4x4-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-5_MMult-4x4-6.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-6_MMult-4x4-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-6_MMult-4x4-7.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-7_MMult-4x4-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-7_MMult-4x4-8.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-8_MMult-4x4-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-8_MMult-4x4-9.png


--------------------------------------------------------------------------------
/figures/compare_MMult-4x4-9_MMult-4x4-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult-4x4-9_MMult-4x4-10.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-1x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-1x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-1x4-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-1x4-9.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-4x4-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-4x4-10.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-4x4-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-4x4-11.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-4x4-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-4x4-13.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-4x4-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-4x4-15.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult-4x4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult-4x4-5.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult0.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult1.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult2.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_MMult_4x4_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_MMult_4x4_15.png


--------------------------------------------------------------------------------
/figures/compare_MMult0_vs_MMult0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult0_vs_MMult0.png


--------------------------------------------------------------------------------
/figures/compare_MMult1_MMult2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult1_MMult2.png


--------------------------------------------------------------------------------
/figures/compare_MMult2_MMult-1x4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult2_MMult-1x4-3.png


--------------------------------------------------------------------------------
/figures/compare_MMult2_MMult-4x4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/compare_MMult2_MMult-4x4-3.png


--------------------------------------------------------------------------------
/figures/graph_10_vs_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_10_vs_11.png


--------------------------------------------------------------------------------
/figures/graph_1_vs_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_1_vs_2.png


--------------------------------------------------------------------------------
/figures/graph_2_vs_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_2_vs_3.png


--------------------------------------------------------------------------------
/figures/graph_3_vs_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_3_vs_4.png


--------------------------------------------------------------------------------
/figures/graph_4_vs_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_4_vs_5.png


--------------------------------------------------------------------------------
/figures/graph_5_vs_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_5_vs_6.png


--------------------------------------------------------------------------------
/figures/graph_6_vs_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_6_vs_7.png


--------------------------------------------------------------------------------
/figures/graph_7_vs_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_7_vs_8.png


--------------------------------------------------------------------------------
/figures/graph_7_vs_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_7_vs_9.png


--------------------------------------------------------------------------------
/figures/graph_8_vs_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_8_vs_10.png


--------------------------------------------------------------------------------
/figures/graph_8_vs_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/figures/graph_8_vs_9.png


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/how-to-optimize-gemm/40b1ec685db23e63ed92f3f60b0fef4cd193455d/src/HowToOptimizeGemm.tar.gz


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/MMult0.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void MY_MMult( int m, int n, int k, double *a, int lda, 
10 |                                     double *b, int ldb,
11 |                                     double *c, int ldc )
12 | {
13 |   int i, j, p;
14 | 
15 |   for ( i=0; i<m; i++ ){        /* Loop over the rows of C */
16 |     for ( j=0; j<n; j++ ){        /* Loop over the columns of C */
17 |       for ( p=0; p<k; p++ ){        /* Update C( i,j ) with the inner
18 | 				       product of the ith row of A and
19 | 				       the jth column of B */
20 | 	C( i,j ) = C( i,j ) +  A( i,p ) * B( p,j );
21 |       }
22 |     }
23 |   }
24 | }
25 | 
26 | 
27 |   
28 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/MMult1.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot( int, double *, int, double *, double * );
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update the C( i,j ) with the inner product of the ith row of A
20 | 	 and the jth column of B */
21 | 
22 |       AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | /* Create macro to let X( i ) equal the ith element of x */
29 | 
30 | #define X(i) x[ (i)*incx ]
31 | 
32 | void AddDot( int k, double *x, int incx,  double *y, double *gamma )
33 | {
34 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
35 | 
36 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
37 |   */
38 |  
39 |   int p;
40 | 
41 |   for ( p=0; p<k; p++ ){
42 |     *gamma += X( p ) * y[ p ];     
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/PlotAll.m:
--------------------------------------------------------------------------------
 1 | %
 2 | % Clear all variables and close all graphs
 3 | %
 4 | 
 5 | clear all
 6 | close all
 7 | 
 8 | %
 9 | % Get max_gflops from /proc/cpuinfo by reading the parameters
10 | % set in file proc_parameters.m
11 | %
12 | 
13 | proc_parameters
14 | 
15 | max_gflops = nflops_per_cycle * nprocessors * GHz_of_processor;
16 | 
17 | %
18 | % Read in the first data set and plot it.
19 | %
20 | 
21 | output_old
22 | 
23 | version_old = version;
24 | 
25 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'bo-.;OLD;' );
26 | last = size( MY_MMult, 1 );
27 | 
28 | hold on
29 | 
30 | axis( [ 0 MY_MMult( last,1 ) 0 max_gflops ] );
31 | 
32 | xlabel( 'm = n = k' );
33 | ylabel( 'GFLOPS/sec.' );
34 | 
35 | %
36 | % Read in second data set and plot it.
37 | %
38 | 
39 | output_new
40 | 
41 | version_new = version
42 | 
43 | title_string = sprintf("OLD = %s, NEW = %s", version_old, version_new);
44 | 
45 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'r-*;NEW;' );
46 | 
47 | title( title_string );
48 | 
49 | filename = sprintf( "compare_%s_%s", version_old, version_new );
50 | 
51 | print( filename, '-dpng' );
52 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/PlotAll.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | 
 4 | # Indicate the number of floating point operations that can be executed
 5 | # per clock cycle
 6 | nflops_per_cycle = 4
 7 | 
 8 | # Indicate the number of processors being used (in case you are using a
 9 | # multicore or SMP)
10 | nprocessors = 1
11 | 
12 | # Indicate the clock speed of the processor.  On a Linux machine this info
13 | # can be found in the file /proc/cpuinfo
14 | #
15 | # Note: some processors have a "turbo boost" mode, which increases
16 | # the peak clock rate...
17 | #
18 | GHz_of_processor = 2.0
19 | 
20 | 
21 | class Parser:
22 |     def __init__(self, file_name) -> None:
23 |         self.attrs = {}
24 |         with open(file_name) as file:
25 |             self.toks = file.read().split()
26 |             self.toksi = 0
27 |             file.close()
28 |             self.attrs = self.parse()
29 | 
30 |     def next(self):
31 |         tok = self.toks[self.toksi]
32 |         self.toksi += 1
33 |         return tok
34 | 
35 |     def get_var_name(self):
36 |         return self.next()
37 | 
38 |     def get_symbol(self, sym):
39 |         tok = self.next()
40 |         assert(tok == sym)
41 |         return tok
42 | 
43 |     def get_value(self):
44 |         value = None
45 |         tok = self.next()
46 |         if tok == '[':
47 |             # list
48 |             value = []
49 |             tok = self.next()
50 |             while not tok.startswith(']'):
51 |                 value.append(float(tok))
52 |                 tok = self.next()
53 |         elif tok.startswith("'"):
54 |             value = tok[1:-2]
55 |         
56 |         assert value != None
57 |         return value
58 |     
59 |     def parse(self):
60 |         res = {}
61 |         while self.toksi < len(self.toks):
62 |             var = self.get_var_name()
63 |             self.get_symbol('=')
64 |             val = self.get_value()
65 |             res[var] = val
66 |         return res
67 |     
68 |     def __getattr__(self, name):
69 |         return self.attrs[name]
70 | 
71 | old = Parser("output_old.m")
72 | new = Parser("output_new.m")
73 | 
74 | #print(old)
75 | #print(new)
76 | 
77 | old_data = np.array(old.MY_MMult).reshape(-1, 3)
78 | new_data = np.array(new.MY_MMult).reshape(-1, 3)
79 | 
80 | max_gflops = nflops_per_cycle * nprocessors * GHz_of_processor;
81 | 
82 | fig, ax = plt.subplots()
83 | ax.plot(old_data[:,0], old_data[:,1], 'bo-.', label='old:' + old.version)
84 | ax.plot(new_data[:,0], new_data[:,1], 'r-*', label='new:' + new.version)
85 | 
86 | ax.set(xlabel='m = n = k', ylabel='GFLOPS/sec.',
87 |        title="OLD = {}, NEW = {}".format(old.version, new.version))
88 | ax.grid()
89 | ax.legend()
90 | 
91 | ax.set_xlim([old_data[0,0], old_data[-1,0]])
92 | ax.set_ylim([0, max_gflops])
93 | 
94 | # fig.savefig("test.png")
95 | plt.show()


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/REF_MMult.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void REF_MMult( int m, int n, int k, double *a, int lda, 
10 |                                     double *b, int ldb,
11 |                                     double *c, int ldc )
12 | {
13 |   int i, j, p;
14 | 
15 |   for ( i=0; i<m; i++ ){
16 |     for ( j=0; j<n; j++ ){
17 |       for ( p=0; p<k; p++ ){
18 | 	C( i,j ) = C( i,j ) +  A( i,p ) * B( p,j );
19 |       }
20 |     }
21 |   }
22 | }
23 | 
24 | 
25 |   
26 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/compare_matrices.c:
--------------------------------------------------------------------------------
 1 | #define A( i, j ) a[ (j)*lda + (i) ]
 2 | #define B( i, j ) b[ (j)*ldb + (i) ]
 3 | #define abs( x ) ( (x) < 0.0 ? -(x) : (x) )
 4 | 
 5 | double compare_matrices( int m, int n, double *a, int lda, double *b, int ldb )
 6 | {
 7 |   int i, j;
 8 |   double max_diff = 0.0, diff;
 9 | 
10 |   for ( j=0; j<n; j++ )
11 |     for ( i=0; i<m; i++ ){
12 |       diff = abs( A( i,j ) - B( i,j ) );
13 |       max_diff = ( diff > max_diff ? diff : max_diff );
14 |     }
15 | 
16 |   return max_diff;
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/copy_matrix.c:
--------------------------------------------------------------------------------
 1 | #define A( i, j ) a[ (j)*lda + (i) ]
 2 | #define B( i, j ) b[ (j)*ldb + (i) ]
 3 | 
 4 | void copy_matrix( int m, int n, double *a, int lda, double *b, int ldb )
 5 | {
 6 |   int i, j;
 7 | 
 8 |   for ( j=0; j<n; j++ )
 9 |     for ( i=0; i<m; i++ )
10 |       B( i,j ) = A( i,j );
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/dclock.c:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <time.h>
 3 | 
 4 | static double gtod_ref_time_sec = 0.0;
 5 | 
 6 | /* Adapted from the bl2_clock() routine in the BLIS library */
 7 | 
 8 | double dclock()
 9 | {
10 |         double         the_time, norm_sec;
11 |         struct timeval tv;
12 | 
13 |         gettimeofday( &tv, NULL );
14 | 
15 |         if ( gtod_ref_time_sec == 0.0 )
16 |                 gtod_ref_time_sec = ( double ) tv.tv_sec;
17 | 
18 |         norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
19 | 
20 |         the_time = norm_sec + tv.tv_usec * 1.0e-6;
21 | 
22 |         return the_time;
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/makefile:
--------------------------------------------------------------------------------
 1 | OLD  := MMult0
 2 | NEW  := MMult0
 3 | #
 4 | # sample makefile
 5 | #
 6 | 
 7 | CC         := gcc
 8 | LINKER     := $(CC)
 9 | CFLAGS     := -O2 -Wall -msse3
10 | LDFLAGS    := -lm
11 | 
12 | UTIL       := copy_matrix.o \
13 |               compare_matrices.o \
14 |               random_matrix.o \
15 |               dclock.o \
16 |               REF_MMult.o \
17 |               print_matrix.o
18 | 
19 | TEST_OBJS  := test_MMult.o $(NEW).o 
20 | 
21 | %.o: %.c
22 | 	$(CC) $(CFLAGS) -c $< -o $@
23 | %.o: %.c
24 | 	$(CC) $(CFLAGS) -c $< -o $@
25 | 
26 | all: 
27 | 	make clean;
28 | 	make test_MMult.x
29 | 
30 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h
31 | 	$(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \
32 |         $(BLAS_LIB) -o $(TEST_BIN) $@ 
33 | 
34 | run:	
35 | 	make all
36 | 	export OMP_NUM_THREADS=1
37 | 	export GOTO_NUM_THREADS=1
38 | 	echo "version = '$(NEW)';" > output_$(NEW).m
39 | 	./test_MMult.x >> output_$(NEW).m
40 | 	cp output_$(OLD).m output_old.m
41 | 	cp output_$(NEW).m output_new.m
42 | 
43 | clean:
44 | 	rm -f *.o *~ core *.x
45 | 
46 | cleanall:
47 | 	rm -f *.o *~ core *.x output*.m *.eps *.png
48 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/parameters.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )"
 3 | The below parameters set this range of values that p takes on 
 4 | */   
 5 | #define PFIRST 40
 6 | #define PLAST  800
 7 | #define PINC   40
 8 | 
 9 | /* 
10 | In the test driver, the m, n, and k dimensions are set to the below 
11 | values.  If the value equals "-1" then that dimension is bound to the
12 | index p, given above.
13 | */
14 | 
15 | #define M -1
16 | #define N -1
17 | #define K -1
18 | 
19 | /* 
20 | In the test driver, each experiment is repeated NREPEATS times and
21 | the best time from these repeats is used to compute the performance
22 | */
23 | 
24 | #define NREPEATS 2
25 | 
26 | /* 
27 | Matrices A, B, and C are stored in two dimensional arrays with
28 | row dimensions that are greater than or equal to the row dimension
29 | of the matrix.  This row dimension of the array is known as the 
30 | "leading dimension" and determines the stride (the number of 
31 | double precision numbers) when one goes from one element in a row
32 | to the next.  Having this number larger than the row dimension of
33 | the matrix tends to adversely affect performance.  LDX equals the
34 | leading dimension of the array that stores matrix X.  If LDX=-1 
35 | then the leading dimension is set to the row dimension of matrix X.
36 | */
37 | 
38 | #define LDA 1000
39 | #define LDB 1000
40 | #define LDC 1000
41 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/print_matrix.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define A( i, j ) a[ (j)*lda + (i) ]
 4 | 
 5 | void print_matrix( int m, int n, double *a, int lda )
 6 | {
 7 |   int i, j;
 8 | 
 9 |   for ( j=0; j<n; j++ ){
10 |     for ( i=0; i<m; i++ )
11 |       printf("%le ", A( i,j ) );
12 |     printf("\n");
13 |   }
14 |   printf("\n");
15 | }
16 | 
17 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/proc_parameters.m:
--------------------------------------------------------------------------------
 1 | % Indicate the number of floating point operations that can be executed
 2 | % per clock cycle
 3 | %
 4 | 
 5 | nflops_per_cycle = 4;
 6 | 
 7 | %
 8 | % Indicate the number of processors being used (in case you are using a
 9 | % multicore or SMP)
10 | %
11 | 
12 | nprocessors = 1;
13 | 
14 | %
15 | % Indicate the clock speed of the processor.  On a Linux machine this info
16 | % can be found in the file /proc/cpuinfo
17 | %
18 | % Note: some processors have a "turbo boost" mode, which increases
19 | % the peak clock rate...
20 | %
21 | 
22 | GHz_of_processor = 2.6;
23 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/random_matrix.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | #define A( i,j ) a[ (j)*lda + (i) ]
 4 | 
 5 | void random_matrix( int m, int n, double *a, int lda )
 6 | {
 7 |   double drand48();
 8 |   int i,j;
 9 | 
10 |   for ( j=0; j<n; j++ )
11 |     for ( i=0; i<m; i++ )
12 |       A( i,j ) = 2.0 * drand48( ) - 1.0;
13 | }
14 | 


--------------------------------------------------------------------------------
/src/HowToOptimizeGemm/test_MMult.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // #include <malloc.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "parameters.h"
 6 | 
 7 | void REF_MMult(int, int, int, double *, int, double *, int, double *, int );
 8 | void MY_MMult(int, int, int, double *, int, double *, int, double *, int );
 9 | void copy_matrix(int, int, double *, int, double *, int );
10 | void random_matrix(int, int, double *, int);
11 | double compare_matrices( int, int, double *, int, double *, int );
12 | 
13 | double dclock();
14 | 
15 | int main()
16 | {
17 |   int 
18 |     p, 
19 |     m, n, k,
20 |     lda, ldb, ldc, 
21 |     rep;
22 | 
23 |   double
24 |     dtime, dtime_best,        
25 |     gflops, 
26 |     diff;
27 | 
28 |   double 
29 |     *a, *b, *c, *cref, *cold;    
30 |   
31 |   printf( "MY_MMult = [\n" );
32 |     
33 |   for ( p=PFIRST; p<=PLAST; p+=PINC ){
34 |     m = ( M == -1 ? p : M );
35 |     n = ( N == -1 ? p : N );
36 |     k = ( K == -1 ? p : K );
37 | 
38 |     gflops = 2.0 * m * n * k * 1.0e-09;
39 | 
40 |     lda = ( LDA == -1 ? m : LDA );
41 |     ldb = ( LDB == -1 ? k : LDB );
42 |     ldc = ( LDC == -1 ? m : LDC );
43 | 
44 |     /* Allocate space for the matrices */
45 |     /* Note: I create an extra column in A to make sure that
46 |        prefetching beyond the matrix does not cause a segfault */
47 |     a = ( double * ) malloc( lda * (k+1) * sizeof( double ) );  
48 |     b = ( double * ) malloc( ldb * n * sizeof( double ) );
49 |     c = ( double * ) malloc( ldc * n * sizeof( double ) );
50 |     cold = ( double * ) malloc( ldc * n * sizeof( double ) );
51 |     cref = ( double * ) malloc( ldc * n * sizeof( double ) );
52 | 
53 |     /* Generate random matrices A, B, Cold */
54 |     random_matrix( m, k, a, lda );
55 |     random_matrix( k, n, b, ldb );
56 |     random_matrix( m, n, cold, ldc );
57 | 
58 |     copy_matrix( m, n, cold, ldc, cref, ldc );
59 | 
60 |     /* Run the reference implementation so the answers can be compared */
61 | 
62 |     REF_MMult( m, n, k, a, lda, b, ldb, cref, ldc );
63 | 
64 |     /* Time the "optimized" implementation */
65 |     for ( rep=0; rep<NREPEATS; rep++ ){
66 |       copy_matrix( m, n, cold, ldc, c, ldc );
67 | 
68 |       /* Time your implementation */
69 |       dtime = dclock();
70 | 
71 |       MY_MMult( m, n, k, a, lda, b, ldb, c, ldc );
72 |       
73 |       dtime = dclock() - dtime;
74 | 
75 |       if ( rep==0 )
76 | 	dtime_best = dtime;
77 |       else
78 | 	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
79 |     }
80 | 
81 |     diff = compare_matrices( m, n, c, ldc, cref, ldc );
82 | 
83 |     printf( "%d %le %le \n", p, gflops / dtime_best, diff );
84 |     fflush( stdout );
85 | 
86 |     free( a );
87 |     free( b );
88 |     free( c );
89 |     free( cold );
90 |     free( cref );
91 |   }
92 | 
93 |   printf( "];\n" );
94 | 
95 |   exit( 0 );
96 | }
97 | 
98 | 


--------------------------------------------------------------------------------
/src/MMult1.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot( int, double *, int, double *, double * );
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update the C( i,j ) with the inner product of the ith row of A
20 | 	 and the jth column of B */
21 | 
22 |       AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | /* Create macro to let X( i ) equal the ith element of x */
29 | 
30 | #define X(i) x[ (i)*incx ]
31 | 
32 | void AddDot( int k, double *x, int incx,  double *y, double *gamma )
33 | {
34 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
35 | 
36 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
37 |   */
38 |  
39 |   int p;
40 | 
41 |   for ( p=0; p<k; p++ ){
42 |     *gamma += X( p ) * y[ p ];     
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/MMult2.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot( int, double *, int, double *, double * );
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update the C( i,j ) with the inner product of the ith row of A
20 | 	 and the jth column of B */
21 | 
22 |       AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
23 | 
24 |       /* Update the C( i,j+1 ) with the inner product of the ith row of A
25 | 	 and the (j+1)th column of B */
26 | 
27 |       AddDot( k, &A( i,0 ), lda, &B( 0,j+1 ), &C( i,j+1 ) );
28 | 
29 |       /* Update the C( i,j+2 ) with the inner product of the ith row of A
30 | 	 and the (j+2)th column of B */
31 | 
32 |       AddDot( k, &A( i,0 ), lda, &B( 0,j+2 ), &C( i,j+2 ) );
33 | 
34 |       /* Update the C( i,j+3 ) with the inner product of the ith row of A
35 | 	 and the (j+1)th column of B */
36 | 
37 |       AddDot( k, &A( i,0 ), lda, &B( 0,j+3 ), &C( i,j+3 ) );
38 |     }
39 |   }
40 | }
41 | 
42 | 
43 | /* Create macro to let X( i ) equal the ith element of x */
44 | 
45 | #define X(i) x[ (i)*incx ]
46 | 
47 | void AddDot( int k, double *x, int incx,  double *y, double *gamma )
48 | {
49 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
50 | 
51 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
52 |   */
53 |  
54 |   int p;
55 | 
56 |   for ( p=0; p<k; p++ ){
57 |     *gamma += X( p ) * y[ p ];     
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_3.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot( int, double *, int, double *, double * );
10 | void AddDot1x4( int, double *, int,  double *, int, double *, int );
11 | 
12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
13 |                                     double *b, int ldb,
14 |                                     double *c, int ldc )
15 | {
16 |   int i, j;
17 | 
18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
19 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
21 | 	 one routine (four inner products) */
22 | 
23 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
24 |     }
25 |   }
26 | }
27 | 
28 | 
29 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
30 | {
31 |   /* So, this routine computes four elements of C: 
32 | 
33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
34 | 
35 |      Notice that this routine is called with c = C( i, j ) in the
36 |      previous routine, so these are actually the elements 
37 | 
38 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
39 | 	  
40 |      in the original matrix C */ 
41 | 
42 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
43 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
44 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
45 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
46 | }
47 | 
48 | 
49 | /* Create macro to let X( i ) equal the ith element of x */
50 | 
51 | #define X(i) x[ (i)*incx ]
52 | 
53 | void AddDot( int k, double *x, int incx,  double *y, double *gamma )
54 | {
55 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
56 | 
57 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
58 |   */
59 |  
60 |   int p;
61 | 
62 |   for ( p=0; p<k; p++ ){
63 |     *gamma += X( p ) * y[ p ];     
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_4.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot1x4( int, double *, int,  double *, int, double *, int );
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
20 | 	 one routine (four inner products) */
21 | 
22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
29 | {
30 |   /* So, this routine computes four elements of C: 
31 | 
32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
33 | 
34 |      Notice that this routine is called with c = C( i, j ) in the
35 |      previous routine, so these are actually the elements 
36 | 
37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
38 | 	  
39 |      in the original matrix C.
40 | 
41 |      In this version, we "inline" AddDot */ 
42 | 
43 |   int p;
44 | 
45 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
46 |   for ( p=0; p<k; p++ ){
47 |     C( 0, 0 ) += A( 0, p ) * B( p, 0 );     
48 |   }
49 | 
50 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
51 |   for ( p=0; p<k; p++ ){
52 |     C( 0, 1 ) += A( 0, p ) * B( p, 1 );     
53 |   }
54 | 
55 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
56 |   for ( p=0; p<k; p++ ){
57 |     C( 0, 2 ) += A( 0, p ) * B( p, 2 );     
58 |   }
59 | 
60 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
61 |   for ( p=0; p<k; p++ ){
62 |     C( 0, 3 ) += A( 0, p ) * B( p, 3 );     
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_5.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot1x4( int, double *, int,  double *, int, double *, int );
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
20 | 	 one routine (four inner products) */
21 | 
22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
29 | {
30 |   /* So, this routine computes four elements of C: 
31 | 
32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
33 | 
34 |      Notice that this routine is called with c = C( i, j ) in the
35 |      previous routine, so these are actually the elements 
36 | 
37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
38 | 	  
39 |      in the original matrix C.
40 | 
41 |      In this version, we merge the four loops, computing four inner
42 |      products simultaneously. */
43 | 
44 |   int p;
45 | 
46 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
47 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
48 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
49 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
50 |   for ( p=0; p<k; p++ ){
51 |     C( 0, 0 ) += A( 0, p ) * B( p, 0 );     
52 |     C( 0, 1 ) += A( 0, p ) * B( p, 1 );     
53 |     C( 0, 2 ) += A( 0, p ) * B( p, 2 );     
54 |     C( 0, 3 ) += A( 0, p ) * B( p, 3 );     
55 |   }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_6.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot1x4( int, double *, int,  double *, int, double *, int )
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
20 | 	 one routine (four inner products) */
21 | 
22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
29 | {
30 |   /* So, this routine computes four elements of C: 
31 | 
32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
33 | 
34 |      Notice that this routine is called with c = C( i, j ) in the
35 |      previous routine, so these are actually the elements 
36 | 
37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
38 | 	  
39 |      in the original matrix C.
40 | 
41 |      In this version, we accumulate in registers and put A( 0, p ) in a register */
42 | 
43 |   int p;
44 |   register double 
45 |     /* hold contributions to
46 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) */
47 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
48 |     /* holds A( 0, p ) */
49 |        a_0p_reg;
50 |     
51 |   c_00_reg = 0.0; 
52 |   c_01_reg = 0.0; 
53 |   c_02_reg = 0.0; 
54 |   c_03_reg = 0.0;
55 |  
56 |   for ( p=0; p<k; p++ ){
57 |     a_0p_reg = A( 0, p );
58 | 
59 |     c_00_reg += a_0p_reg * B( p, 0 );     
60 |     c_01_reg += a_0p_reg * B( p, 1 );     
61 |     c_02_reg += a_0p_reg * B( p, 2 );     
62 |     c_03_reg += a_0p_reg * B( p, 3 );     
63 |   }
64 | 
65 |   C( 0, 0 ) += c_00_reg; 
66 |   C( 0, 1 ) += c_01_reg; 
67 |   C( 0, 2 ) += c_02_reg; 
68 |   C( 0, 3 ) += c_03_reg;
69 | }
70 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_7.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot1x4( int, double *, int,  double *, int, double *, int )
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
20 | 	 one routine (four inner products) */
21 | 
22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
29 | {
30 |   /* So, this routine computes four elements of C: 
31 | 
32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
33 | 
34 |      Notice that this routine is called with c = C( i, j ) in the
35 |      previous routine, so these are actually the elements 
36 | 
37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
38 | 	  
39 |      in the original matrix C.
40 | 
41 |      In this version, we use pointer to track where in four columns of B we are */
42 | 
43 |   int p;
44 |   register double 
45 |     /* hold contributions to
46 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) */
47 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
48 |     /* holds A( 0, p ) */
49 |        a_0p_reg;
50 |   double 
51 |     /* Point to the current elements in the four columns of B */
52 |     *bp0_pntr, *bp1_pntr, *bp2_pntr, *bp3_pntr; 
53 |     
54 |   bp0_pntr = &B( 0, 0 );
55 |   bp1_pntr = &B( 0, 1 );
56 |   bp2_pntr = &B( 0, 2 );
57 |   bp3_pntr = &B( 0, 3 );
58 | 
59 |   c_00_reg = 0.0; 
60 |   c_01_reg = 0.0; 
61 |   c_02_reg = 0.0; 
62 |   c_03_reg = 0.0;
63 |  
64 |   for ( p=0; p<k; p++ ){
65 |     a_0p_reg = A( 0, p );
66 | 
67 |     c_00_reg += a_0p_reg * *bp0_pntr++;
68 |     c_01_reg += a_0p_reg * *bp1_pntr++;
69 |     c_02_reg += a_0p_reg * *bp2_pntr++;
70 |     c_03_reg += a_0p_reg * *bp3_pntr++;
71 |   }
72 | 
73 |   C( 0, 0 ) += c_00_reg; 
74 |   C( 0, 1 ) += c_01_reg; 
75 |   C( 0, 2 ) += c_02_reg; 
76 |   C( 0, 3 ) += c_03_reg;
77 | }
78 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_8.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in column-major order */
 2 | 
 3 | #define A(i,j) a[ (j)*lda + (i) ]
 4 | #define B(i,j) b[ (j)*ldb + (i) ]
 5 | #define C(i,j) c[ (j)*ldc + (i) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot1x4( int, double *, int,  double *, int, double *, int )
10 | 
11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
12 |                                     double *b, int ldb,
13 |                                     double *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
20 | 	 one routine (four inner products) */
21 | 
22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
29 | {
30 |   /* So, this routine computes four elements of C: 
31 | 
32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
33 | 
34 |      Notice that this routine is called with c = C( i, j ) in the
35 |      previous routine, so these are actually the elements 
36 | 
37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
38 | 	  
39 |      in the original matrix C.
40 | 
41 |      We now unroll the loop */
42 | 
43 |   int p;
44 |   register double 
45 |     /* hold contributions to
46 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) */
47 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
48 |     /* holds A( 0, p ) */
49 |        a_0p_reg;
50 |   double 
51 |     /* Point to the current elements in the four columns of B */
52 |     *bp0_pntr, *bp1_pntr, *bp2_pntr, *bp3_pntr; 
53 |     
54 |   bp0_pntr = &B( 0, 0 );
55 |   bp1_pntr = &B( 0, 1 );
56 |   bp2_pntr = &B( 0, 2 );
57 |   bp3_pntr = &B( 0, 3 );
58 | 
59 |   c_00_reg = 0.0; 
60 |   c_01_reg = 0.0; 
61 |   c_02_reg = 0.0; 
62 |   c_03_reg = 0.0;
63 |  
64 |   for ( p=0; p<k; p+=4 ){
65 |     a_0p_reg = A( 0, p );
66 | 
67 |     c_00_reg += a_0p_reg * *bp0_pntr++;
68 |     c_01_reg += a_0p_reg * *bp1_pntr++;
69 |     c_02_reg += a_0p_reg * *bp2_pntr++;
70 |     c_03_reg += a_0p_reg * *bp3_pntr++;
71 | 
72 |     a_0p_reg = A( 0, p+1 );
73 | 
74 |     c_00_reg += a_0p_reg * *bp0_pntr++;
75 |     c_01_reg += a_0p_reg * *bp1_pntr++;
76 |     c_02_reg += a_0p_reg * *bp2_pntr++;
77 |     c_03_reg += a_0p_reg * *bp3_pntr++;
78 | 
79 |     a_0p_reg = A( 0, p+2 );
80 | 
81 |     c_00_reg += a_0p_reg * *bp0_pntr++;
82 |     c_01_reg += a_0p_reg * *bp1_pntr++;
83 |     c_02_reg += a_0p_reg * *bp2_pntr++;
84 |     c_03_reg += a_0p_reg * *bp3_pntr++;
85 | 
86 |     a_0p_reg = A( 0, p+3 );
87 | 
88 |     c_00_reg += a_0p_reg * *bp0_pntr++;
89 |     c_01_reg += a_0p_reg * *bp1_pntr++;
90 |     c_02_reg += a_0p_reg * *bp2_pntr++;
91 |     c_03_reg += a_0p_reg * *bp3_pntr++;
92 |   }
93 | 
94 |   C( 0, 0 ) += c_00_reg; 
95 |   C( 0, 1 ) += c_01_reg; 
96 |   C( 0, 2 ) += c_02_reg; 
97 |   C( 0, 3 ) += c_03_reg;
98 | }
99 | 


--------------------------------------------------------------------------------
/src/MMult_1x4_9.c:
--------------------------------------------------------------------------------
  1 | /* Create macros so that the matrices are stored in column-major order */
  2 | 
  3 | #define A(i,j) a[ (j)*lda + (i) ]
  4 | #define B(i,j) b[ (j)*ldb + (i) ]
  5 | #define C(i,j) c[ (j)*ldc + (i) ]
  6 | 
  7 | /* Routine for computing C = A * B + C */
  8 | 
  9 | void AddDot1x4( int, double *, int,  double *, int, double *, int )
 10 | 
 11 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 12 |                                     double *b, int ldb,
 13 |                                     double *c, int ldc )
 14 | {
 15 |   int i, j;
 16 | 
 17 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
 19 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 20 | 	 one routine (four inner products) */
 21 | 
 22 |       AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 23 |     }
 24 |   }
 25 | }
 26 | 
 27 | 
 28 | void AddDot1x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 29 | {
 30 |   /* So, this routine computes four elements of C: 
 31 | 
 32 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 33 | 
 34 |      Notice that this routine is called with c = C( i, j ) in the
 35 |      previous routine, so these are actually the elements 
 36 | 
 37 |            C( i, j ), C( i, j+1 ), C( i, j+2 ), C( i, j+3 ) 
 38 | 	  
 39 |      in the original matrix C.
 40 | 
 41 |      We next use indirect addressing */
 42 | 
 43 |   int p;
 44 |   register double 
 45 |     /* hold contributions to
 46 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) */
 47 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
 48 |     /* holds A( 0, p ) */
 49 |        a_0p_reg;
 50 |   double 
 51 |     /* Point to the current elements in the four columns of B */
 52 |     *bp0_pntr, *bp1_pntr, *bp2_pntr, *bp3_pntr; 
 53 |     
 54 |   bp0_pntr = &B( 0, 0 );
 55 |   bp1_pntr = &B( 0, 1 );
 56 |   bp2_pntr = &B( 0, 2 );
 57 |   bp3_pntr = &B( 0, 3 );
 58 | 
 59 |   c_00_reg = 0.0; 
 60 |   c_01_reg = 0.0; 
 61 |   c_02_reg = 0.0; 
 62 |   c_03_reg = 0.0;
 63 |  
 64 |   for ( p=0; p<k; p+=4 ){
 65 |     a_0p_reg = A( 0, p );
 66 | 
 67 |     c_00_reg += a_0p_reg * *bp0_pntr;
 68 |     c_01_reg += a_0p_reg * *bp1_pntr;
 69 |     c_02_reg += a_0p_reg * *bp2_pntr;
 70 |     c_03_reg += a_0p_reg * *bp3_pntr;
 71 | 
 72 |     a_0p_reg = A( 0, p+1 );
 73 | 
 74 |     c_00_reg += a_0p_reg * *(bp0_pntr+1);
 75 |     c_01_reg += a_0p_reg * *(bp1_pntr+1);
 76 |     c_02_reg += a_0p_reg * *(bp2_pntr+1);
 77 |     c_03_reg += a_0p_reg * *(bp3_pntr+1);
 78 | 
 79 |     a_0p_reg = A( 0, p+2 );
 80 | 
 81 |     c_00_reg += a_0p_reg * *(bp0_pntr+2);
 82 |     c_01_reg += a_0p_reg * *(bp1_pntr+2);
 83 |     c_02_reg += a_0p_reg * *(bp2_pntr+2);
 84 |     c_03_reg += a_0p_reg * *(bp3_pntr+2);
 85 | 
 86 |     a_0p_reg = A( 0, p+3 );
 87 | 
 88 |     c_00_reg += a_0p_reg * *(bp0_pntr+3);
 89 |     c_01_reg += a_0p_reg * *(bp1_pntr+3);
 90 |     c_02_reg += a_0p_reg * *(bp2_pntr+3);
 91 |     c_03_reg += a_0p_reg * *(bp3_pntr+3);
 92 | 
 93 |     bp0_pntr+=4;
 94 |     bp1_pntr+=4;
 95 |     bp2_pntr+=4;
 96 |     bp3_pntr+=4;
 97 |   }
 98 | 
 99 |   C( 0, 0 ) += c_00_reg; 
100 |   C( 0, 1 ) += c_01_reg; 
101 |   C( 0, 2 ) += c_02_reg; 
102 |   C( 0, 3 ) += c_03_reg;
103 | }
104 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_10.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | 
 12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 13 |                                     double *b, int ldb,
 14 |                                     double *c, int ldc )
 15 | {
 16 |   int i, j;
 17 | 
 18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 21 | 	 one routine (four inner products) */
 22 | 
 23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 24 |     }
 25 |   }
 26 | }
 27 | 
 28 | #include <mmintrin.h>
 29 | #include <xmmintrin.h>  // SSE
 30 | #include <pmmintrin.h>  // SSE2
 31 | #include <emmintrin.h>  // SSE3
 32 | 
 33 | typedef union
 34 | {
 35 |   __m128d v;
 36 |   double d[2];
 37 | } v2df_t;
 38 | 
 39 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 40 | {
 41 |   /* So, this routine computes a 4x4 block of matrix A
 42 | 
 43 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 44 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 45 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 46 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 47 | 
 48 |      Notice that this routine is called with c = C( i, j ) in the
 49 |      previous routine, so these are actually the elements 
 50 | 
 51 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 52 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 53 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 54 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 55 | 	  
 56 |      in the original matrix C 
 57 | 
 58 |      And now we use vector registers and instructions */
 59 | 
 60 |   int p;
 61 | 
 62 |   v2df_t
 63 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
 64 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
 65 |     a_0p_a_1p_vreg,
 66 |     a_2p_a_3p_vreg,
 67 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
 68 | 
 69 |   double 
 70 |     /* Point to the current elements in the four columns of B */
 71 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
 72 |     
 73 |   b_p0_pntr = &B( 0, 0 );
 74 |   b_p1_pntr = &B( 0, 1 );
 75 |   b_p2_pntr = &B( 0, 2 );
 76 |   b_p3_pntr = &B( 0, 3 );
 77 | 
 78 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
 79 |   c_01_c_11_vreg.v = _mm_setzero_pd();
 80 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
 81 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
 82 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
 83 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
 84 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
 85 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
 86 | 
 87 |   for ( p=0; p<k; p++ ){
 88 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) &A( 0, p ) );
 89 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) &A( 2, p ) );
 90 | 
 91 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b_p0_pntr++ );   /* load and duplicate */
 92 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) b_p1_pntr++ );   /* load and duplicate */
 93 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) b_p2_pntr++ );   /* load and duplicate */
 94 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) b_p3_pntr++ );   /* load and duplicate */
 95 | 
 96 |     /* First row and second rows */
 97 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
 98 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
 99 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
100 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
101 | 
102 |     /* Third and fourth rows */
103 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
104 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
105 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
106 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
107 |   }
108 | 
109 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
110 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
111 | 
112 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
113 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
114 | 
115 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
116 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
117 | 
118 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
119 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
120 | }
121 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_11.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Block sizes */
  9 | #define mc 256
 10 | #define kc 128
 11 | 
 12 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 13 | 
 14 | /* Routine for computing C = A * B + C */
 15 | 
 16 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 17 | 
 18 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 19 |                                     double *b, int ldb,
 20 |                                     double *c, int ldc )
 21 | {
 22 |   int i, j, p, pb, ib;
 23 | 
 24 |   /* This time, we compute a mc x n block of C by a call to the InnerKernel */
 25 | 
 26 |   for ( p=0; p<k; p+=kc ){
 27 |     pb = min( k-p, kc );
 28 |     for ( i=0; i<m; i+=mc ){
 29 |       ib = min( m-i, mc );
 30 |       InnerKernel( ib, n, pb, &A( i,p ), lda, &B(p, 0 ), ldb, &C( i,0 ), ldc );
 31 |     }
 32 |   }
 33 | }
 34 | 
 35 | void InnerKernel( int m, int n, int k, double *a, int lda, 
 36 |                                        double *b, int ldb,
 37 |                                        double *c, int ldc )
 38 | {
 39 |   int i, j;
 40 | 
 41 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 42 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 43 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 44 | 	 one routine (four inner products) */
 45 | 
 46 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 47 |     }
 48 |   }
 49 | }
 50 | 
 51 | #include <mmintrin.h>
 52 | #include <xmmintrin.h>  // SSE
 53 | #include <pmmintrin.h>  // SSE2
 54 | #include <emmintrin.h>  // SSE3
 55 | 
 56 | typedef union
 57 | {
 58 |   __m128d v;
 59 |   double d[2];
 60 | } v2df_t;
 61 | 
 62 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 63 | {
 64 |   /* So, this routine computes a 4x4 block of matrix A
 65 | 
 66 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 67 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 68 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 69 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 70 | 
 71 |      Notice that this routine is called with c = C( i, j ) in the
 72 |      previous routine, so these are actually the elements 
 73 | 
 74 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 75 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 76 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 77 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 78 | 	  
 79 |      in the original matrix C 
 80 | 
 81 |      And now we use vector registers and instructions */
 82 | 
 83 |   int p;
 84 |   v2df_t
 85 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
 86 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
 87 |     a_0p_a_1p_vreg,
 88 |     a_2p_a_3p_vreg,
 89 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
 90 | 
 91 |   double 
 92 |     /* Point to the current elements in the four columns of B */
 93 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
 94 |     
 95 |   b_p0_pntr = &B( 0, 0 );
 96 |   b_p1_pntr = &B( 0, 1 );
 97 |   b_p2_pntr = &B( 0, 2 );
 98 |   b_p3_pntr = &B( 0, 3 );
 99 | 
100 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
101 |   c_01_c_11_vreg.v = _mm_setzero_pd();
102 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
103 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
104 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
105 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
106 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
107 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
108 | 
109 |   for ( p=0; p<k; p++ ){
110 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) &A( 0, p ) );
111 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) &A( 2, p ) );
112 | 
113 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b_p0_pntr++ );   /* load and duplicate */
114 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) b_p1_pntr++ );   /* load and duplicate */
115 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) b_p2_pntr++ );   /* load and duplicate */
116 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) b_p3_pntr++ );   /* load and duplicate */
117 | 
118 |     /* First row and second rows */
119 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
120 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
121 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
122 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
123 | 
124 |     /* Third and fourth rows */
125 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
126 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
127 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
128 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
129 |   }
130 | 
131 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
132 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
133 | 
134 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
135 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
136 | 
137 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
138 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
139 | 
140 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
141 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
142 | }
143 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_12.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Block sizes */
  9 | #define mc 256
 10 | #define kc 128
 11 | 
 12 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 13 | 
 14 | /* Routine for computing C = A * B + C */
 15 | 
 16 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 17 | void PackMatrixA( int, double *, int, double * );
 18 | 
 19 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 20 |                                     double *b, int ldb,
 21 |                                     double *c, int ldc )
 22 | {
 23 |   int i, p, pb, ib;
 24 | 
 25 |   /* This time, we compute a mc x n block of C by a call to the InnerKernel */
 26 | 
 27 |   for ( p=0; p<k; p+=kc ){
 28 |     pb = min( k-p, kc );
 29 |     for ( i=0; i<m; i+=mc ){
 30 |       ib = min( m-i, mc );
 31 |       InnerKernel( ib, n, pb, &A( i,p ), lda, &B(p, 0 ), ldb, &C( i,0 ), ldc );
 32 |     }
 33 |   }
 34 | }
 35 | 
 36 | void InnerKernel( int m, int n, int k, double *a, int lda, 
 37 |                                        double *b, int ldb,
 38 |                                        double *c, int ldc )
 39 | {
 40 |   int i, j;
 41 |   double 
 42 |     packedA[ m * k ];
 43 | 
 44 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 45 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 46 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 47 | 	 one routine (four inner products) */
 48 |       PackMatrixA( k, &A( i, 0 ), lda, &packedA[ i*k ] );
 49 |       AddDot4x4( k, &packedA[ i*k ], 4, &B( 0,j ), ldb, &C( i,j ), ldc );
 50 |     }
 51 |   }
 52 | }
 53 | 
 54 | void PackMatrixA( int k, double *a, int lda, double *a_to )
 55 | {
 56 |   int j;
 57 | 
 58 |   for( j=0; j<k; j++){  /* loop over columns of A */
 59 |     double 
 60 |       *a_ij_pntr = &A( 0, j );
 61 | 
 62 |     *a_to++ = *a_ij_pntr;
 63 |     *a_to++ = *(a_ij_pntr+1);
 64 |     *a_to++ = *(a_ij_pntr+2);
 65 |     *a_to++ = *(a_ij_pntr+3);
 66 |   }
 67 | }
 68 | 
 69 | #include <mmintrin.h>
 70 | #include <xmmintrin.h>  // SSE
 71 | #include <pmmintrin.h>  // SSE2
 72 | #include <emmintrin.h>  // SSE3
 73 | 
 74 | typedef union
 75 | {
 76 |   __m128d v;
 77 |   double d[2];
 78 | } v2df_t;
 79 | 
 80 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 81 | {
 82 |   /* So, this routine computes a 4x4 block of matrix A
 83 | 
 84 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 85 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 86 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 87 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 88 | 
 89 |      Notice that this routine is called with c = C( i, j ) in the
 90 |      previous routine, so these are actually the elements 
 91 | 
 92 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 93 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 94 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 95 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 96 | 	  
 97 |      in the original matrix C 
 98 | 
 99 |      And now we use vector registers and instructions */
100 | 
101 |   int p;
102 |   v2df_t
103 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
104 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
105 |     a_0p_a_1p_vreg,
106 |     a_2p_a_3p_vreg,
107 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
108 | 
109 |   double 
110 |     /* Point to the current elements in the four columns of B */
111 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
112 |     
113 |   b_p0_pntr = &B( 0, 0 );
114 |   b_p1_pntr = &B( 0, 1 );
115 |   b_p2_pntr = &B( 0, 2 );
116 |   b_p3_pntr = &B( 0, 3 );
117 | 
118 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
119 |   c_01_c_11_vreg.v = _mm_setzero_pd();
120 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
121 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
122 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
123 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
124 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
125 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
126 | 
127 |   for ( p=0; p<k; p++ ){
128 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) &A( 0, p ) );
129 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) &A( 2, p ) );
130 | 
131 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b_p0_pntr++ );   /* load and duplicate */
132 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) b_p1_pntr++ );   /* load and duplicate */
133 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) b_p2_pntr++ );   /* load and duplicate */
134 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) b_p3_pntr++ );   /* load and duplicate */
135 | 
136 |     /* First row and second rows */
137 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
138 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
139 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
140 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
141 | 
142 |     /* Third and fourth rows */
143 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
144 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
145 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
146 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
147 |   }
148 | 
149 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
150 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
151 | 
152 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
153 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
154 | 
155 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
156 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
157 | 
158 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
159 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
160 | }
161 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_13.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Block sizes */
  9 | #define mc 256
 10 | #define kc 128
 11 | 
 12 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 13 | 
 14 | /* Routine for computing C = A * B + C */
 15 | 
 16 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 17 | void PackMatrixA( int, double *, int, double * );
 18 | 
 19 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 20 |                                     double *b, int ldb,
 21 |                                     double *c, int ldc )
 22 | {
 23 |   int i, p, pb, ib;
 24 | 
 25 |   /* This time, we compute a mc x n block of C by a call to the InnerKernel */
 26 | 
 27 |   for ( p=0; p<k; p+=kc ){
 28 |     pb = min( k-p, kc );
 29 |     for ( i=0; i<m; i+=mc ){
 30 |       ib = min( m-i, mc );
 31 |       InnerKernel( ib, n, pb, &A( i,p ), lda, &B(p, 0 ), ldb, &C( i,0 ), ldc );
 32 |     }
 33 |   }
 34 | }
 35 | 
 36 | void InnerKernel( int m, int n, int k, double *a, int lda, 
 37 |                                        double *b, int ldb,
 38 |                                        double *c, int ldc )
 39 | {
 40 |   int i, j;
 41 |   double 
 42 |     packedA[ m * k ];
 43 | 
 44 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 45 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 46 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 47 | 	 one routine (four inner products) */
 48 |       if ( j == 0 ) PackMatrixA( k, &A( i, 0 ), lda, &packedA[ i*k ] );
 49 |       AddDot4x4( k, &packedA[ i*k ], 4, &B( 0,j ), ldb, &C( i,j ), ldc );
 50 |     }
 51 |   }
 52 | }
 53 | 
 54 | void PackMatrixA( int k, double *a, int lda, double *a_to )
 55 | {
 56 |   int j;
 57 | 
 58 |   for( j=0; j<k; j++){  /* loop over columns of A */
 59 |     double 
 60 |       *a_ij_pntr = &A( 0, j );
 61 | 
 62 |     *a_to++ = *a_ij_pntr;
 63 |     *a_to++ = *(a_ij_pntr+1);
 64 |     *a_to++ = *(a_ij_pntr+2);
 65 |     *a_to++ = *(a_ij_pntr+3);
 66 |   }
 67 | }
 68 | 
 69 | #include <mmintrin.h>
 70 | #include <xmmintrin.h>  // SSE
 71 | #include <pmmintrin.h>  // SSE2
 72 | #include <emmintrin.h>  // SSE3
 73 | 
 74 | typedef union
 75 | {
 76 |   __m128d v;
 77 |   double d[2];
 78 | } v2df_t;
 79 | 
 80 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 81 | {
 82 |   /* So, this routine computes a 4x4 block of matrix A
 83 | 
 84 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 85 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 86 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 87 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 88 | 
 89 |      Notice that this routine is called with c = C( i, j ) in the
 90 |      previous routine, so these are actually the elements 
 91 | 
 92 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 93 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 94 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 95 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 96 | 	  
 97 |      in the original matrix C 
 98 | 
 99 |      And now we use vector registers and instructions */
100 | 
101 |   int p;
102 |   v2df_t
103 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
104 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
105 |     a_0p_a_1p_vreg,
106 |     a_2p_a_3p_vreg,
107 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
108 | 
109 |   double 
110 |     /* Point to the current elements in the four columns of B */
111 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
112 |     
113 |   b_p0_pntr = &B( 0, 0 );
114 |   b_p1_pntr = &B( 0, 1 );
115 |   b_p2_pntr = &B( 0, 2 );
116 |   b_p3_pntr = &B( 0, 3 );
117 | 
118 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
119 |   c_01_c_11_vreg.v = _mm_setzero_pd();
120 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
121 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
122 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
123 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
124 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
125 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
126 | 
127 |   for ( p=0; p<k; p++ ){
128 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) a );
129 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) ( a+2 ) );
130 |     a += 4;
131 | 
132 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b_p0_pntr++ );   /* load and duplicate */
133 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) b_p1_pntr++ );   /* load and duplicate */
134 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) b_p2_pntr++ );   /* load and duplicate */
135 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) b_p3_pntr++ );   /* load and duplicate */
136 | 
137 |     /* First row and second rows */
138 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
139 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
140 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
141 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
142 | 
143 |     /* Third and fourth rows */
144 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
145 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
146 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
147 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
148 |   }
149 | 
150 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
151 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
152 | 
153 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
154 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
155 | 
156 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
157 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
158 | 
159 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
160 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
161 | }
162 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_14.c:
--------------------------------------------------------------------------------
  1 | /* Create macros so that the matrices are stored in column-major order */
  2 | 
  3 | #define A(i,j) a[ (j)*lda + (i) ]
  4 | #define B(i,j) b[ (j)*ldb + (i) ]
  5 | #define C(i,j) c[ (j)*ldc + (i) ]
  6 | 
  7 | /* Block sizes */
  8 | #define mc 256
  9 | #define kc 128
 10 | 
 11 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 12 | 
 13 | /* Routine for computing C = A * B + C */
 14 | 
 15 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 16 | void PackMatrixA( int, double *, int, double * );
 17 | void PackMatrixB( int, double *, int, double * );
 18 | void InnerKernel( int, int, int, double *, int, double *, int, double *, int, int );
 19 | 
 20 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 21 |                                     double *b, int ldb,
 22 |                                     double *c, int ldc )
 23 | {
 24 |   int i, p, pb, ib;
 25 | 
 26 |   /* This time, we compute a mc x n block of C by a call to the InnerKernel */
 27 | 
 28 |   for ( p=0; p<k; p+=kc ){
 29 |     pb = min( k-p, kc );
 30 |     for ( i=0; i<m; i+=mc ){
 31 |       ib = min( m-i, mc );
 32 |       InnerKernel( ib, n, pb, &A( i,p ), lda, &B(p, 0 ), ldb, &C( i,0 ), ldc, i==0 );
 33 |     }
 34 |   }
 35 | }
 36 | 
 37 | void InnerKernel( int m, int n, int k, double *a, int lda, 
 38 |                                        double *b, int ldb,
 39 |                                        double *c, int ldc, int first_time )
 40 | {
 41 |   int i, j;
 42 |   double 
 43 |     packedA[ m * k ], packedB[ k*n ];
 44 | 
 45 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 46 |     PackMatrixB( k, &B( 0, j ), ldb, &packedB[ j*k ] );
 47 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 48 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 49 | 	 one routine (four inner products) */
 50 |       if ( j == 0 ) 
 51 | 	PackMatrixA( k, &A( i, 0 ), lda, &packedA[ i*k ] );
 52 |       AddDot4x4( k, &packedA[ i*k ], 4, &packedB[ j*k ], k, &C( i,j ), ldc );
 53 |     }
 54 |   }
 55 | }
 56 | 
 57 | void PackMatrixA( int k, double *a, int lda, double *a_to )
 58 | {
 59 |   int j;
 60 | 
 61 |   for( j=0; j<k; j++){  /* loop over columns of A */
 62 |     double 
 63 |       *a_ij_pntr = &A( 0, j );
 64 | 
 65 |     *a_to     = *a_ij_pntr;
 66 |     *(a_to+1) = *(a_ij_pntr+1);
 67 |     *(a_to+2) = *(a_ij_pntr+2);
 68 |     *(a_to+3) = *(a_ij_pntr+3);
 69 | 
 70 |     a_to += 4;
 71 |   }
 72 | }
 73 | 
 74 | void PackMatrixB( int k, double *b, int ldb, double *b_to )
 75 | {
 76 |   int i;
 77 |   double 
 78 |     *b_i0_pntr = &B( 0, 0 ), *b_i1_pntr = &B( 0, 1 ),
 79 |     *b_i2_pntr = &B( 0, 2 ), *b_i3_pntr = &B( 0, 3 );
 80 | 
 81 |   for( i=0; i<k; i++){  /* loop over rows of B */
 82 |     *b_to++ = *b_i0_pntr++;
 83 |     *b_to++ = *b_i1_pntr++;
 84 |     *b_to++ = *b_i2_pntr++;
 85 |     *b_to++ = *b_i3_pntr++;
 86 |   }
 87 | }
 88 | 
 89 | #include <mmintrin.h>
 90 | #include <xmmintrin.h>  // SSE
 91 | #include <pmmintrin.h>  // SSE2
 92 | #include <emmintrin.h>  // SSE3
 93 | 
 94 | typedef union
 95 | {
 96 |   __m128d v;
 97 |   double d[2];
 98 | } v2df_t;
 99 | 
100 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
101 | {
102 |   /* So, this routine computes a 4x4 block of matrix A
103 | 
104 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
105 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
106 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
107 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
108 | 
109 |      Notice that this routine is called with c = C( i, j ) in the
110 |      previous routine, so these are actually the elements 
111 | 
112 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
113 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
114 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
115 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
116 | 	  
117 |      in the original matrix C 
118 | 
119 |      And now we use vector registers and instructions */
120 | 
121 |   int p;
122 |   v2df_t
123 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
124 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
125 |     a_0p_a_1p_vreg,
126 |     a_2p_a_3p_vreg,
127 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
128 | 
129 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
130 |   c_01_c_11_vreg.v = _mm_setzero_pd();
131 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
132 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
133 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
134 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
135 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
136 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
137 | 
138 |   for ( p=0; p<k; p++ ){
139 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) a );
140 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) ( a+2 ) );
141 |     a += 4;
142 | 
143 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b );       /* load and duplicate */
144 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) (b+1) );   /* load and duplicate */
145 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) (b+2) );   /* load and duplicate */
146 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) (b+3) );   /* load and duplicate */
147 | 
148 |     b += 4;
149 | 
150 |     /* First row and second rows */
151 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
152 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
153 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
154 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
155 | 
156 |     /* Third and fourth rows */
157 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
158 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
159 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
160 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
161 |   }
162 | 
163 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
164 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
165 | 
166 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
167 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
168 | 
169 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
170 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
171 | 
172 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
173 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
174 | }
175 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_15.c:
--------------------------------------------------------------------------------
  1 | /* Create macros so that the matrices are stored in column-major order */
  2 | 
  3 | #define A(i,j) a[ (j)*lda + (i) ]
  4 | #define B(i,j) b[ (j)*ldb + (i) ]
  5 | #define C(i,j) c[ (j)*ldc + (i) ]
  6 | 
  7 | /* Block sizes */
  8 | #define mc 256
  9 | #define kc 128
 10 | #define nb 1000
 11 | 
 12 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 13 | 
 14 | /* Routine for computing C = A * B + C */
 15 | 
 16 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 17 | void PackMatrixA( int, double *, int, double * );
 18 | void PackMatrixB( int, double *, int, double * );
 19 | void InnerKernel( int, int, int, double *, int, double *, int, double *, int, int );
 20 | 
 21 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 22 |                                     double *b, int ldb,
 23 |                                     double *c, int ldc )
 24 | {
 25 |   int i, p, pb, ib;
 26 | 
 27 |   /* This time, we compute a mc x n block of C by a call to the InnerKernel */
 28 | 
 29 |   for ( p=0; p<k; p+=kc ){
 30 |     pb = min( k-p, kc );
 31 |     for ( i=0; i<m; i+=mc ){
 32 |       ib = min( m-i, mc );
 33 |       InnerKernel( ib, n, pb, &A( i,p ), lda, &B(p, 0 ), ldb, &C( i,0 ), ldc, i==0 );
 34 |     }
 35 |   }
 36 | }
 37 | 
 38 | void InnerKernel( int m, int n, int k, double *a, int lda, 
 39 |                                        double *b, int ldb,
 40 |                                        double *c, int ldc, int first_time )
 41 | {
 42 |   int i, j;
 43 |   double 
 44 |     packedA[ m * k ];
 45 |   static double 
 46 |     packedB[ kc*nb ];    /* Note: using a static buffer is not thread safe... */
 47 | 
 48 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 49 |     if ( first_time )
 50 |       PackMatrixB( k, &B( 0, j ), ldb, &packedB[ j*k ] );
 51 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 52 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 53 | 	 one routine (four inner products) */
 54 |       if ( j == 0 ) 
 55 | 	PackMatrixA( k, &A( i, 0 ), lda, &packedA[ i*k ] );
 56 |       AddDot4x4( k, &packedA[ i*k ], 4, &packedB[ j*k ], k, &C( i,j ), ldc );
 57 |     }
 58 |   }
 59 | }
 60 | 
 61 | void PackMatrixA( int k, double *a, int lda, double *a_to )
 62 | {
 63 |   int j;
 64 | 
 65 |   for( j=0; j<k; j++){  /* loop over columns of A */
 66 |     double 
 67 |       *a_ij_pntr = &A( 0, j );
 68 | 
 69 |     *a_to     = *a_ij_pntr;
 70 |     *(a_to+1) = *(a_ij_pntr+1);
 71 |     *(a_to+2) = *(a_ij_pntr+2);
 72 |     *(a_to+3) = *(a_ij_pntr+3);
 73 | 
 74 |     a_to += 4;
 75 |   }
 76 | }
 77 | 
 78 | void PackMatrixB( int k, double *b, int ldb, double *b_to )
 79 | {
 80 |   int i;
 81 |   double 
 82 |     *b_i0_pntr = &B( 0, 0 ), *b_i1_pntr = &B( 0, 1 ),
 83 |     *b_i2_pntr = &B( 0, 2 ), *b_i3_pntr = &B( 0, 3 );
 84 | 
 85 |   for( i=0; i<k; i++){  /* loop over rows of B */
 86 |     *b_to++ = *b_i0_pntr++;
 87 |     *b_to++ = *b_i1_pntr++;
 88 |     *b_to++ = *b_i2_pntr++;
 89 |     *b_to++ = *b_i3_pntr++;
 90 |   }
 91 | }
 92 | 
 93 | #include <mmintrin.h>
 94 | #include <xmmintrin.h>  // SSE
 95 | #include <pmmintrin.h>  // SSE2
 96 | #include <emmintrin.h>  // SSE3
 97 | 
 98 | typedef union
 99 | {
100 |   __m128d v;
101 |   double d[2];
102 | } v2df_t;
103 | 
104 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
105 | {
106 |   /* So, this routine computes a 4x4 block of matrix A
107 | 
108 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
109 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
110 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
111 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
112 | 
113 |      Notice that this routine is called with c = C( i, j ) in the
114 |      previous routine, so these are actually the elements 
115 | 
116 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
117 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
118 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
119 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
120 | 	  
121 |      in the original matrix C 
122 | 
123 |      And now we use vector registers and instructions */
124 | 
125 |   int p;
126 |   v2df_t
127 |     c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
128 |     c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
129 |     a_0p_a_1p_vreg,
130 |     a_2p_a_3p_vreg,
131 |     b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg; 
132 | 
133 |   c_00_c_10_vreg.v = _mm_setzero_pd();   
134 |   c_01_c_11_vreg.v = _mm_setzero_pd();
135 |   c_02_c_12_vreg.v = _mm_setzero_pd(); 
136 |   c_03_c_13_vreg.v = _mm_setzero_pd(); 
137 |   c_20_c_30_vreg.v = _mm_setzero_pd();   
138 |   c_21_c_31_vreg.v = _mm_setzero_pd();  
139 |   c_22_c_32_vreg.v = _mm_setzero_pd();   
140 |   c_23_c_33_vreg.v = _mm_setzero_pd(); 
141 | 
142 |   for ( p=0; p<k; p++ ){
143 |     a_0p_a_1p_vreg.v = _mm_load_pd( (double *) a );
144 |     a_2p_a_3p_vreg.v = _mm_load_pd( (double *) ( a+2 ) );
145 |     a += 4;
146 | 
147 |     b_p0_vreg.v = _mm_loaddup_pd( (double *) b );       /* load and duplicate */
148 |     b_p1_vreg.v = _mm_loaddup_pd( (double *) (b+1) );   /* load and duplicate */
149 |     b_p2_vreg.v = _mm_loaddup_pd( (double *) (b+2) );   /* load and duplicate */
150 |     b_p3_vreg.v = _mm_loaddup_pd( (double *) (b+3) );   /* load and duplicate */
151 | 
152 |     b += 4;
153 | 
154 |     /* First row and second rows */
155 |     c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
156 |     c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
157 |     c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
158 |     c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;
159 | 
160 |     /* Third and fourth rows */
161 |     c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
162 |     c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
163 |     c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
164 |     c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
165 |   }
166 | 
167 |   C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];  
168 |   C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0]; 
169 | 
170 |   C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];  
171 |   C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1]; 
172 | 
173 |   C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];  
174 |   C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0]; 
175 | 
176 |   C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];  
177 |   C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1]; 
178 | }
179 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_3.c:
--------------------------------------------------------------------------------
 1 | 
 2 | /* Create macros so that the matrices are stored in column-major order */
 3 | 
 4 | #define A(i,j) a[ (j)*lda + (i) ]
 5 | #define B(i,j) b[ (j)*ldb + (i) ]
 6 | #define C(i,j) c[ (j)*ldc + (i) ]
 7 | 
 8 | /* Routine for computing C = A * B + C */
 9 | 
10 | void AddDot( int, double *, int, double *, double * );
11 | 
12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
13 |                                     double *b, int ldb,
14 |                                     double *c, int ldc )
15 | {
16 |   int i, j;
17 | 
18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
21 | 	 one routine (four inner products) */
22 | 
23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
24 |     }
25 |   }
26 | }
27 | 
28 | 
29 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
30 | {
31 |   /* So, this routine computes a 4x4 block of matrix A
32 | 
33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
34 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
35 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
36 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
37 | 
38 |      Notice that this routine is called with c = C( i, j ) in the
39 |      previous routine, so these are actually the elements 
40 | 
41 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
42 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
43 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
44 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
45 | 	  
46 |      in the original matrix C */ 
47 | 
48 |   /* First row */
49 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
50 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
51 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
52 |   AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
53 | 
54 |   /* Second row */
55 |   AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );
56 |   AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );
57 |   AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );
58 |   AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );
59 | 
60 |   /* Third row */
61 |   AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );
62 |   AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );
63 |   AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );
64 |   AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );
65 | 
66 |   /* Four row */
67 |   AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );
68 |   AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );
69 |   AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );
70 |   AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );
71 | }
72 | 
73 | 
74 | /* Create macro to let X( i ) equal the ith element of x */
75 | 
76 | #define X(i) x[ (i)*incx ]
77 | 
78 | void AddDot( int k, double *x, int incx,  double *y, double *gamma )
79 | {
80 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
81 | 
82 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
83 |   */
84 |  
85 |   int p;
86 | 
87 |   for ( p=0; p<k; p++ ){
88 |     *gamma += X( p ) * y[ p ];     
89 |   }
90 | }
91 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_4.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | void AddDot( int, double *, int, double *, double * );
 12 | 
 13 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 14 |                                     double *b, int ldb,
 15 |                                     double *c, int ldc )
 16 | {
 17 |   int i, j;
 18 | 
 19 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 20 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 21 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 22 | 	 one routine (four inner products) */
 23 | 
 24 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 25 |     }
 26 |   }
 27 | }
 28 | 
 29 | 
 30 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 31 | {
 32 |   /* So, this routine computes a 4x4 block of matrix A
 33 | 
 34 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 35 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 36 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 37 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 38 | 
 39 |      Notice that this routine is called with c = C( i, j ) in the
 40 |      previous routine, so these are actually the elements 
 41 | 
 42 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 43 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 44 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 45 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 46 | 	  
 47 |      in the original matrix C 
 48 | 
 49 |      In this version, we "inline" AddDot */ 
 50 | 
 51 |   int p;
 52 | 
 53 |   /* First row */
 54 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
 55 |   for ( p=0; p<k; p++ ){
 56 |     C( 0, 0 ) += A( 0, p ) * B( p, 0 );     
 57 |   }
 58 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
 59 |   for ( p=0; p<k; p++ ){
 60 |     C( 0, 1 ) += A( 0, p ) * B( p, 1 );     
 61 |   }
 62 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
 63 |   for ( p=0; p<k; p++ ){
 64 |     C( 0, 2 ) += A( 0, p ) * B( p, 2 );     
 65 |   }
 66 |   //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
 67 |   for ( p=0; p<k; p++ ){
 68 |     C( 0, 3 ) += A( 0, p ) * B( p, 3 );     
 69 |   }
 70 | 
 71 |   /* Second row */
 72 |   //  AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );
 73 |   for ( p=0; p<k; p++ ){
 74 |     C( 1, 0 ) += A( 1, p ) * B( p, 0 );     
 75 |   }
 76 |   //  AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );
 77 |   for ( p=0; p<k; p++ ){
 78 |     C( 1, 1 ) += A( 1, p ) * B( p, 1 );     
 79 |   }
 80 |   //  AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );
 81 |   for ( p=0; p<k; p++ ){
 82 |     C( 1, 2 ) += A( 1, p ) * B( p, 2 );     
 83 |   }
 84 |   //  AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );
 85 |   for ( p=0; p<k; p++ ){
 86 |     C( 1, 3 ) += A( 1, p ) * B( p, 3 );     
 87 |   }
 88 | 
 89 |   /* Third row */
 90 |   //  AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );
 91 |   for ( p=0; p<k; p++ ){
 92 |     C( 2, 0 ) += A( 2, p ) * B( p, 0 );     
 93 |   }
 94 |   //  AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );
 95 |   for ( p=0; p<k; p++ ){
 96 |     C( 2, 1 ) += A( 2, p ) * B( p, 1 );     
 97 |   }
 98 |   //  AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );
 99 |   for ( p=0; p<k; p++ ){
100 |     C( 2, 2 ) += A( 2, p ) * B( p, 2 );     
101 |   }
102 |   //  AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );
103 |   for ( p=0; p<k; p++ ){
104 |     C( 2, 3 ) += A( 2, p ) * B( p, 3 );     
105 |   }
106 | 
107 |   /* Four row */
108 |   //  AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );
109 |   for ( p=0; p<k; p++ ){
110 |     C( 3, 0 ) += A( 3, p ) * B( p, 0 );     
111 |   }
112 |   //  AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );
113 |   for ( p=0; p<k; p++ ){
114 |     C( 3, 1 ) += A( 3, p ) * B( p, 1 );     
115 |   }
116 |   //  AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );
117 |   for ( p=0; p<k; p++ ){
118 |     C( 3, 2 ) += A( 3, p ) * B( p, 2 );     
119 |   }
120 |   //  AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );
121 |   for ( p=0; p<k; p++ ){
122 |     C( 3, 3 ) += A( 3, p ) * B( p, 3 );     
123 |   }
124 | }
125 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_5.c:
--------------------------------------------------------------------------------
 1 | 
 2 | /* Create macros so that the matrices are stored in column-major order */
 3 | 
 4 | #define A(i,j) a[ (j)*lda + (i) ]
 5 | #define B(i,j) b[ (j)*ldb + (i) ]
 6 | #define C(i,j) c[ (j)*ldc + (i) ]
 7 | 
 8 | /* Routine for computing C = A * B + C */
 9 | 
10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
11 | void AddDot( int, double *, int, double *, double * );
12 | 
13 | void MY_MMult( int m, int n, int k, double *a, int lda, 
14 |                                     double *b, int ldb,
15 |                                     double *c, int ldc )
16 | {
17 |   int i, j;
18 | 
19 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
20 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
21 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
22 | 	 one routine (four inner products) */
23 | 
24 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
25 |     }
26 |   }
27 | }
28 | 
29 | 
30 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
31 | {
32 |   /* So, this routine computes a 4x4 block of matrix A
33 | 
34 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
35 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
36 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
37 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
38 | 
39 |      Notice that this routine is called with c = C( i, j ) in the
40 |      previous routine, so these are actually the elements 
41 | 
42 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
43 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
44 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
45 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
46 | 	  
47 |      in the original matrix C 
48 | 
49 |      In this version, we merge each set of four loops, computing four
50 |      inner products simultaneously. */
51 | 
52 |   int p;
53 | 
54 |   for ( p=0; p<k; p++ ){
55 |     /* First row */
56 |     C( 0, 0 ) += A( 0, p ) * B( p, 0 );     
57 |     C( 0, 1 ) += A( 0, p ) * B( p, 1 );     
58 |     C( 0, 2 ) += A( 0, p ) * B( p, 2 );     
59 |     C( 0, 3 ) += A( 0, p ) * B( p, 3 );     
60 | 
61 |     /* Second row */
62 |     C( 1, 0 ) += A( 1, p ) * B( p, 0 );     
63 |     C( 1, 1 ) += A( 1, p ) * B( p, 1 );     
64 |     C( 1, 2 ) += A( 1, p ) * B( p, 2 );     
65 |     C( 1, 3 ) += A( 1, p ) * B( p, 3 );     
66 | 
67 |     /* Third row */
68 |     C( 2, 0 ) += A( 2, p ) * B( p, 0 );     
69 |     C( 2, 1 ) += A( 2, p ) * B( p, 1 );     
70 |     C( 2, 2 ) += A( 2, p ) * B( p, 2 );     
71 |     C( 2, 3 ) += A( 2, p ) * B( p, 3 );     
72 | 
73 |     /* Fourth row */
74 |     C( 3, 0 ) += A( 3, p ) * B( p, 0 );     
75 |     C( 3, 1 ) += A( 3, p ) * B( p, 1 );     
76 |     C( 3, 2 ) += A( 3, p ) * B( p, 2 );     
77 |     C( 3, 3 ) += A( 3, p ) * B( p, 3 );     
78 |   }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_6.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | 
 12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 13 |                                     double *b, int ldb,
 14 |                                     double *c, int ldc )
 15 | {
 16 |   int i, j;
 17 | 
 18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 21 | 	 one routine (four inner products) */
 22 | 
 23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 24 |     }
 25 |   }
 26 | }
 27 | 
 28 | 
 29 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 30 | {
 31 |   /* So, this routine computes a 4x4 block of matrix A
 32 | 
 33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 34 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 35 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 36 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 37 | 
 38 |      Notice that this routine is called with c = C( i, j ) in the
 39 |      previous routine, so these are actually the elements 
 40 | 
 41 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 42 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 43 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 44 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 45 | 	  
 46 |      in the original matrix C 
 47 | 
 48 |      In this version, we accumulate in registers and put A( 0, p ) in a register */
 49 | 
 50 |   int p;
 51 |   register double 
 52 |     /* hold contributions to
 53 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) 
 54 |        C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ) 
 55 |        C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ) 
 56 |        C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 )   */
 57 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
 58 |        c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
 59 |        c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
 60 |        c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
 61 |     /* hold 
 62 |        A( 0, p ) 
 63 |        A( 1, p ) 
 64 |        A( 2, p ) 
 65 |        A( 3, p ) */
 66 |        a_0p_reg,
 67 |        a_1p_reg,
 68 |        a_2p_reg,
 69 |        a_3p_reg;
 70 | 
 71 |   c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
 72 |   c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
 73 |   c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
 74 |   c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;
 75 | 
 76 |   for ( p=0; p<k; p++ ){
 77 |     a_0p_reg = A( 0, p );
 78 |     a_1p_reg = A( 1, p );
 79 |     a_2p_reg = A( 2, p );
 80 |     a_3p_reg = A( 3, p );
 81 | 
 82 |     /* First row */
 83 |     c_00_reg += a_0p_reg * B( p, 0 );     
 84 |     c_01_reg += a_0p_reg * B( p, 1 );     
 85 |     c_02_reg += a_0p_reg * B( p, 2 );     
 86 |     c_03_reg += a_0p_reg * B( p, 3 );     
 87 | 
 88 |     /* Second row */
 89 |     c_10_reg += a_1p_reg * B( p, 0 );     
 90 |     c_11_reg += a_1p_reg * B( p, 1 );     
 91 |     c_12_reg += a_1p_reg * B( p, 2 );     
 92 |     c_13_reg += a_1p_reg * B( p, 3 );     
 93 | 
 94 |     /* Third row */
 95 |     c_20_reg += a_2p_reg * B( p, 0 );     
 96 |     c_21_reg += a_2p_reg * B( p, 1 );     
 97 |     c_22_reg += a_2p_reg * B( p, 2 );     
 98 |     c_23_reg += a_2p_reg * B( p, 3 );     
 99 | 
100 |     /* Four row */
101 |     c_30_reg += a_3p_reg * B( p, 0 );     
102 |     c_31_reg += a_3p_reg * B( p, 1 );     
103 |     c_32_reg += a_3p_reg * B( p, 2 );     
104 |     c_33_reg += a_3p_reg * B( p, 3 );     
105 |   }
106 | 
107 |   C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;
108 |   C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;
109 |   C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;
110 |   C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
111 | }
112 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_7.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | 
 12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 13 |                                     double *b, int ldb,
 14 |                                     double *c, int ldc )
 15 | {
 16 |   int i, j;
 17 | 
 18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 21 | 	 one routine (four inner products) */
 22 | 
 23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 24 |     }
 25 |   }
 26 | }
 27 | 
 28 | 
 29 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 30 | {
 31 |   /* So, this routine computes a 4x4 block of matrix A
 32 | 
 33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 34 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 35 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 36 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 37 | 
 38 |      Notice that this routine is called with c = C( i, j ) in the
 39 |      previous routine, so these are actually the elements 
 40 | 
 41 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 42 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 43 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 44 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 45 | 	  
 46 |      in the original matrix C 
 47 | 
 48 |      In this version, we use pointer to track where in four columns of B we are */
 49 | 
 50 |   int p;
 51 |   register double 
 52 |     /* hold contributions to
 53 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) 
 54 |        C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ) 
 55 |        C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ) 
 56 |        C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 )   */
 57 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
 58 |        c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
 59 |        c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
 60 |        c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
 61 |     /* hold 
 62 |        A( 0, p ) 
 63 |        A( 1, p ) 
 64 |        A( 2, p ) 
 65 |        A( 3, p ) */
 66 |        a_0p_reg,
 67 |        a_1p_reg,
 68 |        a_2p_reg,
 69 |        a_3p_reg;
 70 |   double 
 71 |     /* Point to the current elements in the four columns of B */
 72 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
 73 | 
 74 |   c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
 75 |   c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
 76 |   c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
 77 |   c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;
 78 | 
 79 |   for ( p=0; p<k; p++ ){
 80 |     a_0p_reg = A( 0, p );
 81 |     a_1p_reg = A( 1, p );
 82 |     a_2p_reg = A( 2, p );
 83 |     a_3p_reg = A( 3, p );
 84 | 	  
 85 |     b_p0_pntr = &B( p, 0 );
 86 |     b_p1_pntr = &B( p, 1 );
 87 |     b_p2_pntr = &B( p, 2 );
 88 |     b_p3_pntr = &B( p, 3 );	  
 89 | 
 90 |     /* First row */
 91 |     c_00_reg += a_0p_reg * *b_p0_pntr;     
 92 |     c_01_reg += a_0p_reg * *b_p1_pntr;     
 93 |     c_02_reg += a_0p_reg * *b_p2_pntr;     
 94 |     c_03_reg += a_0p_reg * *b_p3_pntr;     
 95 | 
 96 |     /* Second row */
 97 |     c_10_reg += a_1p_reg * *b_p0_pntr;     
 98 |     c_11_reg += a_1p_reg * *b_p1_pntr;     
 99 |     c_12_reg += a_1p_reg * *b_p2_pntr;     
100 |     c_13_reg += a_1p_reg * *b_p3_pntr;     
101 | 
102 |     /* Third row */
103 |     c_20_reg += a_2p_reg * *b_p0_pntr;     
104 |     c_21_reg += a_2p_reg * *b_p1_pntr;     
105 |     c_22_reg += a_2p_reg * *b_p2_pntr;     
106 |     c_23_reg += a_2p_reg * *b_p3_pntr;     
107 | 
108 |     /* Four row */
109 |     c_30_reg += a_3p_reg * *b_p0_pntr++;     
110 |     c_31_reg += a_3p_reg * *b_p1_pntr++;     
111 |     c_32_reg += a_3p_reg * *b_p2_pntr++;     
112 |     c_33_reg += a_3p_reg * *b_p3_pntr++;     
113 |   }
114 | 
115 |   C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;
116 |   C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;
117 |   C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;
118 |   C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
119 | }
120 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_8.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | 
 12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 13 |                                     double *b, int ldb,
 14 |                                     double *c, int ldc )
 15 | {
 16 |   int i, j;
 17 | 
 18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 21 | 	 one routine (four inner products) */
 22 | 
 23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 24 |     }
 25 |   }
 26 | }
 27 | 
 28 | 
 29 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 30 | {
 31 |   /* So, this routine computes a 4x4 block of matrix A
 32 | 
 33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 34 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 35 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 36 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 37 | 
 38 |      Notice that this routine is called with c = C( i, j ) in the
 39 |      previous routine, so these are actually the elements 
 40 | 
 41 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 42 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 43 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 44 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 45 | 	  
 46 |      in the original matrix C 
 47 | 
 48 |      In this version, we use registers for elements in the current row
 49 |      of B as well */
 50 | 
 51 |   int p;
 52 |   register double 
 53 |     /* hold contributions to
 54 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) 
 55 |        C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ) 
 56 |        C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ) 
 57 |        C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 )   */
 58 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
 59 |        c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
 60 |        c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
 61 |        c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
 62 |     /* hold 
 63 |        A( 0, p ) 
 64 |        A( 1, p ) 
 65 |        A( 2, p ) 
 66 |        A( 3, p ) */
 67 |        a_0p_reg,
 68 |        a_1p_reg,
 69 |        a_2p_reg,
 70 |        a_3p_reg,
 71 |        b_p0_reg,
 72 |        b_p1_reg,
 73 |        b_p2_reg,
 74 |        b_p3_reg;
 75 | 
 76 |   double 
 77 |     /* Point to the current elements in the four columns of B */
 78 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
 79 |     
 80 |   b_p0_pntr = &B( 0, 0 );
 81 |   b_p1_pntr = &B( 0, 1 );
 82 |   b_p2_pntr = &B( 0, 2 );
 83 |   b_p3_pntr = &B( 0, 3 );
 84 | 
 85 |   c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
 86 |   c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
 87 |   c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
 88 |   c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;
 89 | 
 90 |   for ( p=0; p<k; p++ ){
 91 |     a_0p_reg = A( 0, p );
 92 |     a_1p_reg = A( 1, p );
 93 |     a_2p_reg = A( 2, p );
 94 |     a_3p_reg = A( 3, p );
 95 | 
 96 |     b_p0_reg = *b_p0_pntr++;
 97 |     b_p1_reg = *b_p1_pntr++;
 98 |     b_p2_reg = *b_p2_pntr++;
 99 |     b_p3_reg = *b_p3_pntr++;
100 | 
101 |     /* First row */
102 |     c_00_reg += a_0p_reg * b_p0_reg;
103 |     c_01_reg += a_0p_reg * b_p1_reg;
104 |     c_02_reg += a_0p_reg * b_p2_reg;
105 |     c_03_reg += a_0p_reg * b_p3_reg;
106 | 
107 |     /* Second row */
108 |     c_10_reg += a_1p_reg * b_p0_reg;
109 |     c_11_reg += a_1p_reg * b_p1_reg;
110 |     c_12_reg += a_1p_reg * b_p2_reg;
111 |     c_13_reg += a_1p_reg * b_p3_reg;
112 | 
113 |     /* Third row */
114 |     c_20_reg += a_2p_reg * b_p0_reg;
115 |     c_21_reg += a_2p_reg * b_p1_reg;
116 |     c_22_reg += a_2p_reg * b_p2_reg;
117 |     c_23_reg += a_2p_reg * b_p3_reg;
118 | 
119 |     /* Four row */
120 |     c_30_reg += a_3p_reg * b_p0_reg;
121 |     c_31_reg += a_3p_reg * b_p1_reg;
122 |     c_32_reg += a_3p_reg * b_p2_reg;
123 |     c_33_reg += a_3p_reg * b_p3_reg;
124 |   }
125 | 
126 |   C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;
127 |   C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;
128 |   C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;
129 |   C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
130 | }
131 | 


--------------------------------------------------------------------------------
/src/MMult_4x4_9.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /* Create macros so that the matrices are stored in column-major order */
  3 | 
  4 | #define A(i,j) a[ (j)*lda + (i) ]
  5 | #define B(i,j) b[ (j)*ldb + (i) ]
  6 | #define C(i,j) c[ (j)*ldc + (i) ]
  7 | 
  8 | /* Routine for computing C = A * B + C */
  9 | 
 10 | void AddDot4x4( int, double *, int, double *, int, double *, int );
 11 | 
 12 | void MY_MMult( int m, int n, int k, double *a, int lda, 
 13 |                                     double *b, int ldb,
 14 |                                     double *c, int ldc )
 15 | {
 16 |   int i, j;
 17 | 
 18 |   for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
 19 |     for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */
 20 |       /* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
 21 | 	 one routine (four inner products) */
 22 | 
 23 |       AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
 24 |     }
 25 |   }
 26 | }
 27 | 
 28 | 
 29 | void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
 30 | {
 31 |   /* So, this routine computes a 4x4 block of matrix A
 32 | 
 33 |            C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).  
 34 |            C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).  
 35 |            C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).  
 36 |            C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).  
 37 | 
 38 |      Notice that this routine is called with c = C( i, j ) in the
 39 |      previous routine, so these are actually the elements 
 40 | 
 41 |            C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 ) 
 42 |            C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 ) 
 43 |            C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 ) 
 44 |            C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 ) 
 45 | 	  
 46 |      in the original matrix C 
 47 | 
 48 |      A simple rearrangement to prepare for the use of vector registers */
 49 | 
 50 |   int p;
 51 |   register double 
 52 |     /* hold contributions to
 53 |        C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ) 
 54 |        C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ) 
 55 |        C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ) 
 56 |        C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 )   */
 57 |        c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
 58 |        c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
 59 |        c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
 60 |        c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
 61 |     /* hold 
 62 |        A( 0, p ) 
 63 |        A( 1, p ) 
 64 |        A( 2, p ) 
 65 |        A( 3, p ) */
 66 |        a_0p_reg,
 67 |        a_1p_reg,
 68 |        a_2p_reg,
 69 |        a_3p_reg,
 70 |        b_p0_reg,
 71 |        b_p1_reg,
 72 |        b_p2_reg,
 73 |        b_p3_reg;
 74 | 
 75 |   double 
 76 |     /* Point to the current elements in the four columns of B */
 77 |     *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 
 78 |     
 79 |   b_p0_pntr = &B( 0, 0 );
 80 |   b_p1_pntr = &B( 0, 1 );
 81 |   b_p2_pntr = &B( 0, 2 );
 82 |   b_p3_pntr = &B( 0, 3 );
 83 | 
 84 |   c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
 85 |   c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
 86 |   c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
 87 |   c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;
 88 | 
 89 |   for ( p=0; p<k; p++ ){
 90 |     a_0p_reg = A( 0, p );
 91 |     a_1p_reg = A( 1, p );
 92 |     a_2p_reg = A( 2, p );
 93 |     a_3p_reg = A( 3, p );
 94 | 
 95 |     b_p0_reg = *b_p0_pntr++;
 96 |     b_p1_reg = *b_p1_pntr++;
 97 |     b_p2_reg = *b_p2_pntr++;
 98 |     b_p3_reg = *b_p3_pntr++;
 99 | 
100 |     /* First row and second rows */
101 |     c_00_reg += a_0p_reg * b_p0_reg;
102 |     c_10_reg += a_1p_reg * b_p0_reg;
103 | 
104 |     c_01_reg += a_0p_reg * b_p1_reg;
105 |     c_11_reg += a_1p_reg * b_p1_reg;
106 | 
107 |     c_02_reg += a_0p_reg * b_p2_reg;
108 |     c_12_reg += a_1p_reg * b_p2_reg;
109 | 
110 |     c_03_reg += a_0p_reg * b_p3_reg;
111 |     c_13_reg += a_1p_reg * b_p3_reg;
112 | 
113 |     /* Third and fourth rows */
114 |     c_20_reg += a_2p_reg * b_p0_reg;
115 |     c_30_reg += a_3p_reg * b_p0_reg;
116 | 
117 |     c_21_reg += a_2p_reg * b_p1_reg;
118 |     c_31_reg += a_3p_reg * b_p1_reg;
119 | 
120 |     c_22_reg += a_2p_reg * b_p2_reg;
121 |     c_32_reg += a_3p_reg * b_p2_reg;
122 | 
123 |     c_23_reg += a_2p_reg * b_p3_reg;
124 |     c_33_reg += a_3p_reg * b_p3_reg;
125 |   }
126 | 
127 |   C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;
128 |   C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;
129 |   C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;
130 |   C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
131 | }
132 | 


--------------------------------------------------------------------------------