├── NOTICE
├── platform_inc
    ├── generic_gcc_opt.inc
    ├── generic_gcc_debug.inc
    ├── xc30_gcc_debug.inc
    ├── xc30_gcc_opt.inc
    └── generic_mpicc_opt.inc
├── matrix_utils.h
├── Makefile
├── utils.h
├── level1.h
├── README.md
├── main.c
├── fileparse.c
├── matrix_utils.c
├── utils.c
├── level1.c
├── LICENSE
├── cg.c
└── blas_op.c


/NOTICE:
--------------------------------------------------------------------------------
1 | Adept Kernel Benchmarks, MPI port
2 | Copyright 2015 The University of Edinburgh
3 | 
4 | This product includes software developed at
5 | EPCC, The University of Edinburgh (http://www.epcc.ed.ac.uk/).
6 | 


--------------------------------------------------------------------------------
/platform_inc/generic_gcc_opt.inc:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# Generic platform, optimised build: compile through the mpicc wrapper at -O3.
CC = mpicc
CFLAGS += -O3
# No extra preprocessor macros are needed on this platform.
DMACROS +=


--------------------------------------------------------------------------------
/platform_inc/generic_gcc_debug.inc:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# Generic platform, debug build: mpicc wrapper with debug symbols (-g) on top
# of -O3.
CC = mpicc
CFLAGS += -g -O3
# No extra preprocessor macros are needed on this platform.
DMACROS +=


--------------------------------------------------------------------------------
/platform_inc/xc30_gcc_debug.inc:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# XC30, debug build: the `cc` compiler driver with debug symbols (-g) on top
# of -O3. The two flags must be separated by a space; "-g-O3" is a single
# malformed option.
CC = cc
CFLAGS += -g -O3
# NORAW is passed to the C sources as a preprocessor macro.
DMACROS += -DNORAW


--------------------------------------------------------------------------------
/platform_inc/xc30_gcc_opt.inc:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# XC30, optimised build: the `cc` compiler driver at -O3.
CC = cc
CFLAGS += -O3
# NORAW is passed to the C sources as a preprocessor macro.
DMACROS += -DNORAW


--------------------------------------------------------------------------------
/platform_inc/generic_mpicc_opt.inc:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# Generic platform with CC=mpicc, optimised build (-O3).
CC=mpicc
CFLAGS+= -O3
# No extra macros or linker flags are needed; the appends are placeholders.
DMACROS+=
LDFLAGS+=


--------------------------------------------------------------------------------
/matrix_utils.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2015 The University of Edinburgh. */
 2 | 
 3 | /* 
 4 | * This software was developed as part of the                       
 5 | * EC FP7 funded project Adept (Project ID: 610490)                 
 6 | * www.adept-project.eu                                            
 7 | */
 8 | 
 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
10 | /* you may not use this file except in compliance with the License. */
11 | /* You may obtain a copy of the License at */
12 | 
13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
14 | 
15 | /* Unless required by applicable law or agreed to in writing, software */
16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
18 | /* See the License for the specific language governing permissions and */
19 | /* limitations under the License. */
20 | 
#ifndef MATRIX_UTILS_H
#define MATRIX_UTILS_H

/*
 * Read the header of the Matrix Market file `fn` and store the number of
 * rows, columns and non-zero elements through `rows`, `cols` and `nonzeros`.
 * The values are also printed to stdout. The process exits with status 1 if
 * the file cannot be opened or read.
 */
void get_matrix_size(char *fn, int *rows, int *cols, int *nonzeros);

/*
 * Convert the matrix stored in Matrix Market (COO) format in file `fn` to
 * CSR, writing into the caller-provided `row_idx`, `col_idx` and `values`
 * arrays.
 * NOTE(review): m, n and nz are presumably the rows, columns and non-zeros
 * reported by get_matrix_size(), and the arrays must presumably be sized
 * accordingly -- confirm against the implementation.
 */
void mm_to_csr(char *fn, int m, int n, int nz, int *row_idx, int *col_idx, double *values);

#endif /* MATRIX_UTILS_H */
23 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2015 The University of Edinburgh.
 2 | # 
 3 | # This software was developed as part of the                       
 4 | # EC FP7 funded project Adept (Project ID: 610490)                 
 5 | #     www.adept-project.eu                                            
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
# Default to the mpicc wrapper, but allow setting CC in the shell (or via the
# command line) to another compiler. make predefines CC, so a plain
# "ifndef CC" would never fire; test where the value came from instead.
ifeq ($(origin CC),default)
  CC = mpicc
endif

# Set OPT=debug to select the debug platform include (adds -g to CFLAGS).
OPT ?= opt

# Platform name; selects the platform_inc/<ARCH>_<CC>_<OPT>.inc fragment.
ARCH ?= generic

# The platform fragment may override CC and append to CFLAGS/DMACROS/LDFLAGS.
# A fragment must exist for the chosen ARCH/CC/OPT combination.
include platform_inc/${ARCH}_${CC}_${OPT}.inc

# NOTE(review): every file listed here must be present next to this Makefile
# (stencil.c included) or the build stops with "No rule to make target".
SOURCES = main.c level1.c blas_op.c utils.c stencil.c fileparse.c matrix_utils.c cg.c
LDFLAGS+= -lm
EXE = kernel

# all and clean are commands, not files.
.PHONY: all clean

all: $(EXE)

# Compile and link all sources in a single compiler invocation.
$(EXE): $(SOURCES)
	$(CC) $(CFLAGS) -o $(EXE) $(SOURCES) $(DMACROS) $(LDFLAGS)

clean:
	rm -rf *~ *.o $(EXE)
46 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2015 The University of Edinburgh. */
 2 | 
 3 | /* 
 4 | * This software was developed as part of the                       
 5 | * EC FP7 funded project Adept (Project ID: 610490)                 
 6 | * www.adept-project.eu                                            
 7 | */
 8 | 
 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
10 | /* you may not use this file except in compliance with the License. */
11 | /* You may obtain a copy of the License at */
12 | 
13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
14 | 
15 | /* Unless required by applicable law or agreed to in writing, software */
16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
18 | /* See the License for the specific language governing permissions and */
19 | /* limitations under the License. */
20 | 
#ifndef UTILS_H
#define UTILS_H

#include <signal.h>
#include <time.h> /* struct timespec and the CLOCK_MONOTONIC* macros tested below */

/*
 * Stop flag of a signal-safe type.
 * NOTE(review): presumably set by interrupt_handler() -- confirm in utils.c.
 * NOTE(review): this is a tentative definition in a header, so every
 * translation unit that includes it defines `stop`; that only links when the
 * toolchain merges common symbols. Prefer `extern` here plus one definition
 * in a .c file once all users are known.
 */
volatile sig_atomic_t stop;


#ifdef __MACH__
#include <mach/clock.h>
#include <mach/mach.h>
#define CLOCK 0

/*
 * Replacement declared for Mach systems.
 * NOTE(review): may clash with a clock_gettime() provided by the system
 * headers on newer macOS releases -- confirm.
 */
void clock_gettime (void*, struct timespec *);

#else

/* Use the raw monotonic clock when the platform headers offer it. */
#ifdef CLOCK_MONOTONIC_RAW
#define CLOCK CLOCK_MONOTONIC_RAW
#else
#define CLOCK CLOCK_MONOTONIC
#endif

#endif


/*
 * Timing helpers. fileparse() calls elapsed_time_hr(start, end, "Fileparse")
 * with two clock_gettime(CLOCK, ...) samples and a label.
 * NOTE(review): the remaining helpers are implemented outside this header;
 * their exact semantics should be confirmed there.
 */
double elapsed_time_hr(struct timespec, struct timespec, char *);
void loop_timer(unsigned long);
void loop_timer_nop(unsigned long);
void warmup_loop(unsigned long);
void interrupt_handler(int);
void discrete_elapsed_hr(struct timespec*, struct timespec*, int*, char*);
int sub_time_hr(struct timespec*, struct timespec*, struct timespec*);

#endif /* UTILS_H */
50 | 


--------------------------------------------------------------------------------
/level1.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2015 The University of Edinburgh. */
 2 | 
 3 | /* 
 4 | * This software was developed as part of the                       
 5 | * EC FP7 funded project Adept (Project ID: 610490)                 
 6 | * www.adept-project.eu                                            
 7 | */
 8 | 
 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
10 | /* you may not use this file except in compliance with the License. */
11 | /* You may obtain a copy of the License at */
12 | 
13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
14 | 
15 | /* Unless required by applicable law or agreed to in writing, software */
16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
18 | /* See the License for the specific language governing permissions and */
19 | /* limitations under the License. */
20 | 
#ifndef LEVEL1_H
#define LEVEL1_H

/*
 * Benchmark dispatcher. main() calls it as
 *   bench_level1(bench, size, rep, op, dt, algo)
 * i.e. benchmark name, problem size, repetition count, operation, data type
 * and algorithm variant.
 * NOTE(review): main() holds the repetition count in an unsigned long, but the
 * third parameter here is unsigned int, so large values are narrowed.
 */
void bench_level1(char *, unsigned int, unsigned int, char *, char *, char *);

/*
 * BLAS-type kernels, one variant per data type.
 * NOTE(review): the unsigned argument is presumably the problem size --
 * confirm in the implementing source file.
 */

/* Dot product */
int int_dot_product(unsigned int);
int float_dot_product(unsigned int);
int double_dot_product(unsigned int);

/* Scalar multiplication */
int int_scalar_mult(unsigned int);
int float_scalar_mult(unsigned int);
int double_scalar_mult(unsigned int);

/* Euclidean norm */
int double_norm(unsigned int);
int float_norm(unsigned int);
int int_norm(unsigned int);

/* AXPY */
int int_axpy(unsigned int);
int float_axpy(unsigned int);
int double_axpy(unsigned int);

/* Dense matrix-vector product */
int int_dmatvec_product(unsigned int);
int float_dmatvec_product(unsigned int);
int double_dmatvec_product(unsigned int);

/* Sparse matrix-vector product (note the unsigned long argument) */
int double_spmatvec_product(unsigned long);
int float_spmatvec_product(unsigned long);

/* Stencil kernels: 27- and 19-point, then 9- and 5-point */
void double_stencil27(unsigned int);
void float_stencil27(unsigned int);
void int_stencil27(unsigned int);

void double_stencil19(unsigned int);
void float_stencil19(unsigned int);
void int_stencil19(unsigned int);

void double_stencil9(unsigned int);
void float_stencil9(unsigned int);
void int_stencil9(unsigned int);

void double_stencil5(unsigned int);
void float_stencil5(unsigned int);
void int_stencil5(unsigned int);

/* "overlapped" algorithm variants of the same stencils */
void double_stencil27_overlapped(unsigned int);
void float_stencil27_overlapped(unsigned int);
void int_stencil27_overlapped(unsigned int);

void double_stencil19_overlapped(unsigned int);
void float_stencil19_overlapped(unsigned int);
void int_stencil19_overlapped(unsigned int);

void double_stencil9_overlapped(unsigned int);
void float_stencil9_overlapped(unsigned int);
void int_stencil9_overlapped(unsigned int);

void double_stencil5_overlapped(unsigned int);
void float_stencil5_overlapped(unsigned int);
void int_stencil5_overlapped(unsigned int);

/* File parsing benchmark; the argument is the number of rows per MPI process. */
void fileparse(unsigned int);

/* Conjugate gradient, "normal" and "mixed" algorithm variants */
int conjugate_gradient(unsigned int);
int conjugate_gradient_mixed(unsigned int);

#endif /* LEVEL1_H */
82 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015 The University of Edinburgh.
 2 |  
 3 | This software was developed as part of the                       
 4 | EC FP7 funded project Adept (Project ID: 610490)                 
 5 |     http://www.adept-project.eu                                            
 6 | 
 7 | Licensed under the Apache License, Version 2.0 (the "License");
 8 | you may not use this file except in compliance with the License.
 9 | You may obtain a copy of the License at
10 | 
11 |     http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 | Unless required by applicable law or agreed to in writing, software
14 | distributed under the License is distributed on an "AS IS" BASIS,
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | See the License for the specific language governing permissions and
17 | limitations under the License.
18 | 
19 | 
20 | # Adept Kernel Benchmarks - MPI
21 | 
22 | This README describes the MPI parallel kernel benchmarks. They are implemented in C.
23 | 
24 | ## Citation & Further Information
25 | If you would like to cite this work, please cite:
26 | Nick Johnson et al., "Adept Deliverable D2.3 - Updated Report on Adept Benchmarks", September 2015.
27 | available at http://www.adept-project.eu/images/Deliverables/Adept%20D2.3.pdf
28 | 
29 | ## BLAS-type benchmarks
30 | 
31 | In our BLAS-type benchmarks we implement a few of the most common linear algebra computations.
32 | 
33 | #### AXPY
34 | This benchmark takes two vectors `x` and `y`, and the scalar `a`, and computes:
35 | ``` 
36 |   y = a * x + y
37 | ```
38 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double).
39 | 
40 | #### Dot product 
41 | The dot product benchmark multiplies two vectors x and y of length n and returns a scalar:
42 | ```
 43 |   result = x_1 y_1 + x_2 y_2 + ... + x_n y_n
44 | ```
45 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double).
46 | 
47 | #### Scalar multiplication
 48 | This benchmark scales the vector x by a fixed scalar a:
49 | ```
50 |   x = a * x
51 | ```
52 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double).
53 | 
54 | #### Euclidean norm
 55 | This benchmark computes the Euclidean norm of vector x:
56 | ```
 57 |   || x || = sqrt ( |x_1|^2 + |x_2|^2 + ... + |x_n|^2 )
58 | ```
59 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double).
60 |   
61 | #### Dense matrix-vector multiplication
 62 | This benchmark multiplies a square dense matrix A with a vector x to compute vector y:
63 | ```
64 | y = A * x
65 | ```
66 | Both A and x are randomly generated. The user can choose the size of the data structures (where size*size equals the number of elements in the matrix), as well as their data type (int, float or double).
67 | 
68 | #### Sparse matrix-vector multiplication
 69 | This benchmark multiplies a square sparse matrix A with a vector x to compute vector y:
70 | ```
71 | y = A * x
72 | ```
73 | A is represented in CSR format and read from an input file. The vector x is randomly generated. The size of the matrix is fixed by the input file (which the user can substitute for a different matrix). The user can choose the data type to be used (float or double).
74 | 
75 | #### Sparse matrix-matrix multiplication
 76 | This benchmark multiplies two square sparse matrices A and B to compute matrix C:
77 | ```
78 | C = A * B
79 | ```
80 | A and B are both represented in CSR format and read from an input file. The size of the matrices is fixed by the input file (which the user can substitute for a different matrix). The user can choose the data type to be used (float or double).
81 |   
82 | ## Stencil computation
83 | 
84 | The stencil benchmarks compute values for each element in a 2D or 3D grid based on the values of their nearest neighbours.
85 |  
86 | #### 2D grid: 5-point and 9-point Stencil
 87 | On a 2D grid, the 5-point stencil computes the value of A[i][j] by taking the values to the left, right, above and below the current position and scaling them by a constant. The 9-point stencil is similar, but also includes the diagonals.
88 | The user can choose the data type to be used in the grid (int, float or double).
89 |  
90 | #### 3D grid: 19-point and 27-point Stencil 
91 | The 19-point and 27-point stencils are analogous to the 5 and 9 point stencil, but they operate in a 3D space. 
92 | The user can choose the data type to be used in the grid (int, float or double).
93 | 
94 | ## File parsing
 95 | The file parsing benchmark creates a file filled with sequences of random characters, as well as a fixed search phrase (here: "AdeptProject"). The benchmark then searches through the file and counts the occurrences of the search phrase.
96 | The user can determine the size of the file by passing the number of lines to be created (using size).


--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <getopt.h>
 24 | #include <limits.h>
 25 | 
 26 | #include "level1.h"
 27 | 
 28 | #include <mpi.h>
 29 | 
 30 | void usage();
 31 | 
 32 | int main(int argc, char **argv) {
 33 | 
 34 |     int c;
 35 |     int world_size, world_rank;
 36 |     MPI_Init(&argc, &argv);
 37 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 38 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 39 |     if (world_rank == 0) {
 40 |         printf("Running with %d MPI processes.\n", world_size);
 41 |     }
 42 | 
 43 |     char *bench = "blas_op";
 44 |     unsigned int size = 200;
 45 |     unsigned long rep = ULONG_MAX;
 46 |     char *op = "dot_product";
 47 |     char *dt = "int";
 48 |     char *algo = "normal";
 49 | 
 50 |     static struct option option_list[] ={
 51 |         {"bench", required_argument, NULL, 'b'},
 52 |         {"size", required_argument, NULL, 's'},
 53 |         {"reps", required_argument, NULL, 'r'},
 54 |         {"op", required_argument, NULL, 'o'},
 55 |         {"dtype", required_argument, NULL, 'd'},
 56 | 	{"algo", required_argument, NULL, 'a'},
 57 |         {"help", no_argument, NULL, 'h'},
 58 |         {0, 0, 0, 0}
 59 |     };
 60 | 
 61 |     while ((c = getopt_long(argc, argv, "b:s:r:o:d:a:h", option_list, NULL)) != -1) {
 62 |         switch (c) {
 63 |             case 'b':
 64 |                 bench = optarg;
 65 |                 if (world_rank == 0) {
 66 |                     printf("Benchmark is %s.\n", bench);
 67 |                 }
 68 |                 break;
 69 |             case 's':
 70 |                 size = atoi(optarg);
 71 |                 if (world_rank == 0) {
 72 |                     printf("Size is %d.\n", size);
 73 |                 }
 74 |                 break;
 75 |             case 'r':
 76 |                 rep = atol(optarg);
 77 |                 printf("Number of repetitions %lu.\n", rep);
 78 |                 break;
 79 |             case 'o':
 80 |                 op = optarg;
 81 |                 if (world_rank == 0) {
 82 |                     printf("Operation %s\n", op);
 83 |                 }
 84 |                 break;
 85 |             case 'd':
 86 |                 dt = optarg;
 87 |                 if (world_rank == 0) {
 88 |                     printf("Data type is %s\n", dt);
 89 |                 }
 90 |                 break;
 91 | 	    case 'a':
 92 | 		algo = optarg;
 93 | 		if (world_rank == 0) {
 94 | 		    printf("Algorithm is %s\n", algo);
 95 | 		}
 96 | 		break;
 97 |             case 'h':
 98 |                 if (world_rank == 0) {
 99 |                     usage();
100 |                 }
101 |                 return 0;
102 |             default:
103 |                 if (world_rank == 0) {
104 |                     printf("Undefined.\n");
105 |                 }
106 |                 return 0;
107 |         }
108 |     }
109 | 
110 |     bench_level1(bench, size, rep, op, dt, algo);
111 |     MPI_Finalize();
112 |     return 0;
113 | 
114 | }
115 | 
/*
 * Write the command-line help for the kernel benchmarks to stdout.
 * The text is kept in a table and emitted piece by piece, in order.
 */
void usage() {
    static const char *const help_text[] = {
        "Usage for KERNEL benchmarks:\n\n",
        "\t -b, --bench NAME \t name of the benchmark - possible values are blas_op, stencil, fileparse and cg.\n",
        "\t -s, --size N \t\t vector length. Default is 200. For fileparse benchmark this is the number of rows.\n",
        "\t -r, --reps N \t\t number of repetitions. Default value is ULONG_MAX.\n",
        "\t -o, --op TYPE \t\t TYPE of operation.\n",
        "\t\t\t\t --> for blas_op benchmark: \"dot_product\", \"scalar_mult\", \"dmatvec_product\", \"norm\", \"spmv\" and \"axpy\". Default is \"dot_product\".\n",
        "\t\t\t\t --> for stencil benchmark: \"27\", \"19\", \"9\" and \"5\". Default is \"27\".\n",
        "\t -d, --dtype DATATYPE \t DATATYPE to be used - possible values are int, long, float, double. Default is int.\n",
        "\t -a, --algo ALGORITHM \t ALGORITHM to be used. Default is normal.\n",
        "\t\t\t\t --> for cg possible values are normal, mixed.\n",
        "\t\t\t\t --> for stencil possible values are normal, overlapped.\n",
        "\t -h, --help \t\t Displays this help.\n",
        "\n\n"
    };
    const size_t pieces = sizeof help_text / sizeof help_text[0];
    size_t idx;

    for (idx = 0; idx < pieces; idx++) {
        fputs(help_text[idx], stdout);
    }
}
131 | 


--------------------------------------------------------------------------------
/fileparse.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | #include <time.h>
 25 | #include <unistd.h>
 26 | #include <mpi.h>
 27 | #include <ctype.h>
 28 | 
 29 | #include "utils.h"
 30 | #include "level1.h"
 31 | 
 32 | int create_line(char*, size_t, char*, unsigned int);
 33 | int seek_match(char*, size_t, char*, unsigned int);
 34 | 
 35 | void fileparse(unsigned int num_rows) {
 36 | 
 37 |     int world_size, world_rank;
 38 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 39 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 40 | 
 41 | 
 42 |     char search_phrase[] = "AdeptProject";
 43 |     size_t sp_len = strlen(search_phrase);
 44 | 
 45 |     unsigned int desired_line_len = 81;
 46 |     char line[desired_line_len];
 47 | 
 48 |     srand(time(NULL)); // Set seed
 49 | 
 50 |     int i = 0;
 51 |     int r = 0;
 52 |     int m = 0;
 53 |     int mismatch = 0;
 54 |     int r_count = 0;
 55 |     int m_count = 0;
 56 |     struct timespec start, end;
 57 | 
 58 |     /* p_num_rows is the number of rows across all processes */
 59 |     unsigned int p_num_rows;
 60 |     p_num_rows = (unsigned int) (num_rows * world_size);
 61 | 
 62 | 
 63 |     /* Generate (on the fly) the test file for the run */
 64 |     /* Make this single threaded for ease */
 65 |     if (world_rank == 0) {
 66 |         FILE* fp;
 67 |         fp = fopen("testfile", "w+");
 68 | 
 69 |         for (i = 0; i < p_num_rows; i++) {
 70 |             r = create_line(search_phrase, sp_len, line, desired_line_len);
 71 |             m = seek_match(search_phrase, sp_len, line, desired_line_len);
 72 |             if (r != m) {
 73 |                 mismatch++;
 74 |             }
 75 |             if (r == 0) {
 76 |                 r_count++;
 77 |             }
 78 |             if (m == 0) {
 79 |                 m_count++;
 80 |             }
 81 |             fprintf(fp, "%s\n", line);
 82 |         }
 83 |         fsync(fileno(fp));
 84 |         fclose(fp);
 85 | 
 86 |     }
 87 | 
 88 |     m = 0;
 89 | 
 90 | 
 91 |     MPI_Info info;
 92 |     MPI_Info_create(&info);
 93 |     MPI_File fh;
 94 |     MPI_Status status;
 95 | 
 96 |     /* For holding the data from the file before parsing */
 97 |     char *lb = (char*) malloc(sizeof (char)*num_rows * (desired_line_len + 1));
 98 |     char *lbp = NULL;
 99 | 
100 | 
101 |     MPI_Barrier(MPI_COMM_WORLD);
102 |     if (world_rank == 0) {
103 |         clock_gettime(CLOCK, &start);
104 |     }
105 |     m_count = 0;
106 | 
107 |     /* This part should use MPI-IO */
108 |     MPI_File_open(MPI_COMM_WORLD, "testfile", MPI_MODE_RDWR | MPI_MODE_CREATE, info, &fh);
109 |     MPI_File_read_at(fh, world_rank * num_rows * (desired_line_len + 1), lb, num_rows * (desired_line_len + 1), MPI_CHAR, &status);
110 |     for (i = 0; i < num_rows; i++) {
111 |         lbp = &lb[i * (desired_line_len + 1)];
112 |         m = seek_match(search_phrase, sp_len, lbp, desired_line_len);
113 |         if (m == 0) {
114 |             m_count++;
115 |         }
116 |     }
117 |     MPI_Barrier(MPI_COMM_WORLD);
118 |     MPI_File_close(&fh);
119 | 
120 |     if (world_rank == 0) {
121 |         clock_gettime(CLOCK, &end);
122 |         elapsed_time_hr(start, end, "Fileparse");
123 |     }
124 |     MPI_Barrier(MPI_COMM_WORLD);
125 |     unlink("testfile"); // Use this to ensure the generated file is removed from the system upon finish
126 | 
127 | }
128 | 
/*
 * Create a line of random alphanumeric characters.
 *
 * sp      search phrase that may be embedded in the line
 * sp_len  length of sp in characters
 * l       output buffer; must have room for ll + 1 bytes
 * ll      number of characters to generate
 *
 * On return l[0..ll-1] holds the line and l[ll] is the terminating NUL.
 * With probability 1/2 the phrase is copied into the line at a random start
 * position in [0, ll - sp_len - 1]. The phrase is only inserted when it is
 * strictly shorter than the line.
 *
 * Returns 0 if the phrase was inserted, 1 otherwise.
 */
int create_line(char* sp, size_t sp_len, char* l, unsigned int ll) {

    unsigned int i;
    int r;
    size_t start;

    for (i = 0; i < ll; i++) {
        /* Redraw until the 7-bit value is a letter or a digit. */
        do {
            r = rand() % 128;
        } while (!isalnum(r));
        l[i] = (char) r;
    }
    /* Terminate directly after the last generated character. */
    l[ll] = '\0';

    /* Coin flip: odd means "leave the line purely random". */
    if (rand() % 2 != 0) {
        return 1;
    }

    /* A phrase that does not fit cannot be inserted; this also keeps the
     * modulus below from being zero or wrapping around. */
    if (sp_len >= ll) {
        return 1;
    }

    start = (size_t) rand() % (ll - sp_len);
    memcpy(&l[start], sp, sp_len);

    return 0;
}
164 | 
/*
 * Naive matching algorithm.
 *
 * sp      phrase to look for
 * sp_len  length of sp in characters
 * l       line to search; need not be NUL-terminated
 * ll      number of characters of l to consider
 *
 * Every start position at which the whole phrase still fits inside the first
 * ll characters is tried, including the last one (ll - sp_len).
 *
 * Returns 0 if the phrase occurs in the line, 1 otherwise. A phrase longer
 * than the line never matches.
 */
int seek_match(char* sp, size_t sp_len, char* l, unsigned int ll) {

    size_t i;
    size_t positions;

    if (sp_len > ll) {
        /* Avoids the unsigned wrap-around of ll - sp_len. */
        return 1;
    }

    /* Number of candidate start positions; never reads at or beyond l[ll]. */
    positions = (sp_len == 0) ? (size_t) ll : (size_t) ll - sp_len + 1;

    for (i = 0; i < positions; i++) {
        /* Cheap first-character test before the full comparison. */
        if (l[i] == sp[0] && strncmp(&l[i], sp, sp_len) == 0) {
            return 0;
        }
    }

    return 1;
}
183 | 


--------------------------------------------------------------------------------
/matrix_utils.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | /*
 22 |  * Utility functions for sparse matrices.
 23 |  *
 24 |  * Currently focusses on reading in a sparse matrix file in
 25 |  * Matrix Market Format (http://math.nist.gov/MatrixMarket)
 26 |  * converting this to CSR.
 27 |  *
 28 |  */
 29 | 
 30 | #include <stdlib.h>
 31 | #include <stdio.h>
 32 | #include <string.h>
 33 | 
 34 | /*
 35 |  *
 36 |  * reads matrix market file header and get number of rows,
 37 |  * number of columns, and number of non-zero elements.
 38 |  *
 39 |  */
 40 | 
 41 | void get_matrix_size(char *fn, int *rows, int *cols, int *nonzeros) {
 42 |     FILE *f;
 43 |     char header[64];
 44 |     char *rv = NULL;
 45 | 
 46 |     if ((f = fopen(fn, "r")) == NULL) {
 47 |         printf("can't open file <%s> \n", fn);
 48 |         exit(1);
 49 |     }
 50 | 
 51 |     rv = fgets(header, sizeof (header), f);
 52 |     if (*rv == EOF) {
 53 |         printf("Error reading file.\n");
 54 |         exit(1);
 55 |     }
 56 |     rv = fgets(header, sizeof (header), f);
 57 |     if (*rv == EOF) {
 58 |         printf("Error reading file.\n");
 59 |         exit(1);
 60 |     } else {
 61 |         sscanf(header, "%d %d %d", rows, cols, nonzeros);
 62 |     }
 63 | 
 64 |     printf("Rows: %d, Columns: %d, Non-zeros: %d\n", *rows, *cols, *nonzeros);
 65 |     fclose(f);
 66 | 
 67 | }
 68 | 
 69 | /*
 70 |  *
 71 |  * convert a matrix in Matrix Market Format (COO) to CSR
 72 |  *
 73 |  */
 74 | void mm_to_csr(char *fn, int m, int n, int nz, int *row_idx, int *col_idx, double *values) {
 75 | 
 76 |     FILE *fin, *fout;
 77 |     int i, j;
 78 |     int base;
 79 |     char body[64];
 80 |     int row_idx_current, inc;
 81 |     char *rv = NULL;
 82 | 
 83 |     int *new_row_idx, *new_col_idx;
 84 |     double *new_values;
 85 | 
 86 |     if ((fin = fopen(fn, "r")) == NULL) {
 87 |         printf("can't open input file <%s> \n", fn);
 88 |         exit(1);
 89 |     }
 90 | 
 91 |     printf("here\n");
 92 | 
 93 |     if ((fout = fopen("matrix_in.csr", "w")) == NULL) {
 94 |         printf("can't open output file <%s> \n", fn);
 95 |         exit(1);
 96 |     }
 97 | 
 98 |     /* discard first two lines */
 99 |     rv = fgets(body, sizeof (body), fin);
100 |     if (*rv == EOF) {
101 |         printf("Error reading file.\n");
102 |         exit(1);
103 |     }
104 |     rv = fgets(body, sizeof (body), fin);
105 |     if (*rv == EOF) {
106 |         printf("Error reading file.\n");
107 |         exit(1);
108 |     }
109 | 
110 | 
111 |     base = 1;
112 |     i = 0;
113 | 
114 |     /* walk through the file line by line */
115 |     while (fgets(body, sizeof (body), fin)) {
116 |         sscanf(body, "%d %d %lf", &row_idx[i], &col_idx[i], &values[i]);
117 |         row_idx[i] -= base; /* adjust from 1-based to 0-based */
118 |         col_idx[i] -= base;
119 |         i++;
120 |     }
121 | 
122 |     fclose(fin);
123 | 
124 |     /* allocate space for new arrays which will hold the    */
125 |     /* newly ordered values, and the column and row indices */
126 |     new_row_idx = malloc(nz * sizeof (int));
127 |     new_col_idx = malloc(nz * sizeof (int));
128 |     new_values = malloc(nz * sizeof (double));
129 | 
130 |     /* set first values for all three arrays   */
131 |     /* as there is nothing to be done for them */
132 |     row_idx_current = row_idx[0];
133 |     new_row_idx[0] = row_idx[0];
134 |     new_col_idx[0] = col_idx[0];
135 |     new_values[0] = values[0];
136 | 
137 |     inc = 1;
138 | 
139 |     /* this is where the arrays are being reordered */
140 |     for (j = 1; j < nz; j++) {
141 |         for (i = 1; i < nz; i++) {
142 |             if (row_idx[i] == row_idx_current) {
143 |                 new_values[inc] = values[i];
144 |                 new_col_idx[inc] = col_idx[i];
145 |                 inc++;
146 |             }
147 |         }
148 |         new_row_idx[j] = inc;
149 |         row_idx_current++;
150 |     }
151 | 
152 |     fprintf(fout, "%d %d %d\n", nz, nz, m + 1);
153 | 
154 |     /* fprintf(fout, "Values:\n"); */
155 |     /* copy the new colum indices and values into the old arrays */
156 |     for (i = 0; i < nz; i++) {
157 |         values[i] = new_values[i];
158 |         fprintf(fout, "%f\n", new_values[i]);
159 |     }
160 | 
161 |     /* fprintf(fout, "\nColumn indices:\n"); */
162 |     /* copy the new colum indices and values into the old arrays */
163 |     for (i = 0; i < nz; i++) {
164 |         col_idx[i] = new_col_idx[i];
165 |         fprintf(fout, "%d\n", new_col_idx[i]);
166 |     }
167 | 
168 |     /* fprintf(fout, "\nRow pointers:\n"); */
169 |     /* copy the new row indices into the old array */
170 |     for (i = 0; i <= m; i++) {
171 |         row_idx[i] = new_row_idx[i];
172 |         fprintf(fout, "%d\n", new_row_idx[i]);
173 |     }
174 | 
175 |     /* free memory for the temporary new arrays */
176 |     free(new_row_idx);
177 |     free(new_col_idx);
178 |     free(new_values);
179 | 
180 |     fclose(fout);
181 | 
182 | }
183 | 


--------------------------------------------------------------------------------
/utils.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | #include <stdlib.h>
 22 | #include <stdio.h>
 23 | #include <time.h>
 24 | #include <sys/time.h>
 25 | #include <signal.h>
 26 | 
 27 | #include "utils.h"
 28 | 
 29 | #ifdef __MACH__
 30 | void clock_gettime (void* clk, struct timespec *ts){
 31 | 	clock_serv_t cclock;
 32 | 	mach_timespec_t mts;
 33 | 	host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
 34 | 	clock_get_time(cclock, &mts);
 35 | 	mach_port_deallocate(mach_task_self(), cclock);
 36 | 	ts->tv_sec = mts.tv_sec;
 37 | 	ts->tv_nsec = mts.tv_nsec;
 38 | }
 39 | #endif
 40 | 
 41 | 
 42 | double elapsed_time_hr(struct timespec t1, struct timespec t2, char * title){
 43 | 
 44 |   struct timespec elapsed;
 45 |   sub_time_hr(&elapsed, &t1, &t2);
 46 |   double elapsed_duration = 0;
 47 |   double elapsed_start = 0;
 48 |   double elapsed_end = 0;
 49 | 
 50 |   /* This could potentially lead to loss of precision dependant on the rounding in conversion to double */
 51 |   elapsed_duration = elapsed.tv_sec + ((double)elapsed.tv_nsec/1000000000);
 52 |   elapsed_start = t1.tv_sec + ((double)t1.tv_nsec/1000000000);
 53 |   elapsed_end = t2.tv_sec + ((double)t2.tv_nsec/1000000000);
 54 |   
 55 | 
 56 |   printf("\n--- %s\n", title);
 57 |   printf("--- Timings ------------------------------------------------------------------------\n");
 58 |   printf("|\n");
 59 |   printf("| Start: %.9lf   ", elapsed_start);
 60 |   printf("End: %.9lf   ", elapsed_end);
 61 |   printf("Duration: %.9lf s\n", elapsed_duration);
 62 |   printf("|\n");
 63 |   printf("------------------------------------------------------------------------------------\n");
 64 | 
 65 |   return 1.0; // Compatibility
 66 | }
 67 | 
 68 | void loop_timer(unsigned long limit){
 69 | 
 70 |   struct timespec t1, t2;
 71 |   int index;
 72 | 
 73 |   clock_gettime(CLOCK, &t1);
 74 |   for(index=0; index<limit; index++) {
 75 |     /* __asm__ ("nop"); */
 76 |   }
 77 |   clock_gettime(CLOCK, &t2);
 78 | 
 79 | 
 80 |   elapsed_time_hr(t1, t2, "Loop on its own");
 81 | }
 82 | 
 83 | void loop_timer_nop(unsigned long limit){
 84 | 
 85 |   struct timespec t1, t2;
 86 |   int index;
 87 | 
 88 |   clock_gettime(CLOCK, &t1);
 89 |   for(index=0; index<limit; index++) {
 90 |     __asm__ ("nop");
 91 |   }
 92 |   clock_gettime(CLOCK, &t2);
 93 | 
 94 | 
 95 |   elapsed_time_hr(t1, t2, "Loop on its own with nop");
 96 | }
 97 | 
 98 | void warmup_loop(unsigned long limit){
 99 | 
100 |   int index;
101 | 
102 |   for(index=0; index<limit; index++) {
103 |     __asm__ ("nop");
104 |   }
105 | 
106 | }
107 | 
108 | void interrupt_handler(int signum){
109 |   stop = 1;
110 | }
111 | 
/*
 * Print the mean duration of a series of timed intervals, corrected by
 * the mean measurement overhead.
 *
 *   oh_array   1000 (start, end) pairs, i.e. 2000 timestamps, from which
 *              the mean overhead is computed
 *   res_array  *iter (start, end) pairs holding the actual measurements
 *   iter       pointer to the number of measurement pairs
 *   title      heading printed above the timing box
 *
 * A measurement pair for which sub_time_hr() returns 1 is left out of the
 * mean.  An overhead pair for which it returns 1 triggers an error message
 * but is still added to the overhead sum.
 */
void discrete_elapsed_hr(struct timespec* oh_array, struct timespec* res_array, int* iter, char* title){

  struct timespec delta;
  double oh_total = 0;
  double sum = 0;
  int valid = 0;
  int k;

  /* The overhead array always holds 1000 pairs */
  for (k = 0; k < 1000; k++){
    struct timespec *pair = &oh_array[2*k];
    if (sub_time_hr(&delta, pair, pair + 1) == 1){
      printf("Error computing overhead for %s\n", title);
    }
    oh_total += (delta.tv_sec+((double)delta.tv_nsec/1000000000));
  }

  double mean_oh = (oh_total == 0) ? 0 : oh_total/1000;

  /* Sum the measured intervals, each reduced by the mean overhead */
  for (k = 0; k < *iter; k++){
    if (sub_time_hr(&delta, &res_array[2*k], &res_array[2*k+1]) == 1){
      continue;
    }
    sum += (delta.tv_sec+((double)delta.tv_nsec/1000000000)) - mean_oh;
    valid++;
  }
  double cumulative = sum/valid;

  printf("\n--- %s\n", title);
  printf("--- Timings ------------------------------------------------------------------------\n");
  printf("|\n");
  printf("| Iterations %d   ", *iter);
  printf("Mean time(adjusted) %.9lf s   ", cumulative);
  printf("Mean overhead %.9lf s\n", mean_oh);
  printf("|\n");
  printf("------------------------------------------------------------------------------------\n");

}
167 | 
/*
 * Compute *result = *end - *start.
 *
 * The difference is normalised so that result->tv_nsec lies in
 * [0, 999999999]; a negative interval therefore shows up as a negative
 * result->tv_sec.
 *
 *   result  receives the difference
 *   start   start of the interval
 *   end     end of the interval
 *
 * Returns 1 when end lies before start (the interval is negative) and 0
 * otherwise.  This includes the case where both times share the same
 * second and only the nanoseconds of end are smaller.
 */
int sub_time_hr(struct timespec* result, struct timespec* start, struct timespec* end)
{
  time_t sec = end->tv_sec - start->tv_sec;
  long nsec = end->tv_nsec - start->tv_nsec;

  /* borrow one second when the nanosecond part would be negative */
  if (nsec < 0) {
    sec -= 1;
    nsec += 1000000000;
  }

  result->tv_sec = sec;
  result->tv_nsec = nsec;

  /* after normalisation the sign of the interval is the sign of sec */
  return sec < 0;
}
181 | 


--------------------------------------------------------------------------------
/level1.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | #include <stdlib.h>
 22 | #include <stdio.h>
 23 | #include <string.h>
 24 | #include <mpi.h>
 25 | 
 26 | #include "level1.h"
 27 | 
 28 | 
 29 | /* Level 1 benchmark driver - calls appropriate function */
 30 | /* based on command line arguments.                      */
 31 | void bench_level1(char *b, unsigned int s, unsigned int r, char *o, char *dt, char *algo){
 32 | 
 33 |   int world_rank;
 34 |   MPI_Comm_rank(MPI_COMM_WORLD,&world_rank);
 35 | 
 36 | 
 37 |   /* BLAS operations */
 38 |   if(strcmp(b, "blas_op") == 0){
 39 | 
 40 |     if(strcmp(o, "dot_product") == 0){
 41 | 
 42 |       if(strcmp(dt, "int") == 0) int_dot_product(s);
 43 |       else if(strcmp(dt, "float") == 0) float_dot_product(s);
 44 |       else if(strcmp(dt, "double") == 0) double_dot_product(s);
 45 |       else if (world_rank==0){
 46 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
 47 |       }
 48 | 
 49 |     }
 50 | 
 51 |     else if(strcmp(o, "scalar_product") == 0){
 52 | 
 53 |       if(strcmp(dt, "int") == 0) int_scalar_mult(s);
 54 |       else if(strcmp(dt, "float") == 0) float_scalar_mult(s);
 55 |       else if(strcmp(dt, "double") == 0) double_scalar_mult(s);
 56 |       else if (world_rank==0){
 57 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
 58 |       }
 59 | 
 60 |     }
 61 | 
 62 |     else if(strcmp(o, "norm") == 0){
 63 | 
 64 |       if(strcmp(dt, "int") == 0) int_norm(s);
 65 |       else if(strcmp(dt, "float") == 0) float_norm(s);
 66 |       else if(strcmp(dt, "double") == 0) double_norm(s);
 67 |       else if (world_rank==0){
 68 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
 69 |       }
 70 | 
 71 |     }
 72 | 
 73 |     else if(strcmp(o, "axpy") == 0){
 74 | 
 75 |       if(strcmp(dt, "int") == 0) int_axpy(s);
 76 |       else if(strcmp(dt, "float") == 0) float_axpy(s);
 77 |       else if(strcmp(dt, "double") == 0) double_axpy(s);
 78 |       else if (world_rank==0){
 79 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
 80 |       }
 81 | 
 82 |     }
 83 | 
 84 |     else if(strcmp(o, "dmatvec_product") == 0){
 85 | 
 86 |       if(strcmp(dt, "int") == 0) int_dmatvec_product(s);
 87 |       else if(strcmp(dt, "float") == 0) float_dmatvec_product(s);
 88 |       else if(strcmp(dt, "double") == 0) double_dmatvec_product(s);
 89 |       else if (world_rank==0){
 90 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
 91 |       }
 92 | 
 93 |     }
 94 | 
 95 |     else if(strcmp(o, "spmv") == 0){
 96 | 
 97 |       if(strcmp(dt, "int") == 0) int_dmatvec_product(r);
 98 |       else if(strcmp(dt, "float") == 0) float_spmatvec_product(r);
 99 |       else if(strcmp(dt, "double") == 0) double_spmatvec_product(r);
100 |       else if (world_rank==0){
101 |         fprintf(stderr, "ERROR: check you are using a valid data type...\n");
102 |       }
103 | 
104 |     }
105 | 
106 |   }
107 | 
108 |   /* Stencil codes */
109 |   else if (strcmp(b, "stencil") == 0){
110 | 
111 |       if (strcmp(algo, "normal") == 0){
112 | 	  /* o is set to "dot_product" by default. Use this to check for a default */
113 | 	  if( strcmp(o, "27") == 0 || strcmp(o, "dot_product") == 0){
114 | 	      if(strcmp(dt, "double") == 0) double_stencil27(s);
115 | 	      else if (strcmp(dt, "float") == 0) float_stencil27(s);
116 | 	      else if (strcmp(dt, "int") == 0) int_stencil27(s);
117 | 	      else if (world_rank==0){
118 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
119 | 	      }
120 | 	  }
121 | 
122 | 	  else if(strcmp(o, "19") == 0){
123 | 	      if(strcmp(dt, "double") == 0) double_stencil19(s);
124 | 	      else if (strcmp(dt, "float") == 0) float_stencil19(s);
125 | 	      else if (strcmp(dt, "int") == 0) int_stencil19(s);
126 | 	      else if (world_rank==0){
127 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
128 | 	      }
129 | 	  }
130 | 
131 | 
132 | 	  else if(strcmp(o, "9") == 0){
133 | 	      if(strcmp(dt, "double") == 0) double_stencil9(s);
134 | 	      else if (strcmp(dt, "float") == 0) float_stencil9(s);
135 | 	      else if (strcmp(dt, "int") == 0) int_stencil9(s);
136 | 	      else if (world_rank==0){
137 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
138 | 	      }
139 | 	  }
140 | 
141 | 
142 | 	  else if(strcmp(o, "5") == 0){
143 | 	      if(strcmp(dt, "double") == 0) double_stencil5(s);
144 | 	      else if (strcmp(dt, "float") == 0) float_stencil5(s);
145 | 	      else if (strcmp(dt, "int") == 0) int_stencil5(s);
146 | 	      else if (world_rank==0){
147 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
148 | 	      }
149 | 	  }
150 | 
151 | 
152 | 	  else if (world_rank==0){
153 | 	      fprintf(stderr, "ERROR: check you are using a valid operation type...\n");
154 | 	  }
155 |       }
156 |       else if (strcmp(algo, "overlapped") == 0) {
157 | 	  /* o is set to "dot_product" by default. Use this to check for a default */
158 | 	  if( strcmp(o, "27") == 0 || strcmp(o, "dot_product") == 0){
159 | 	      if(strcmp(dt, "double") == 0) double_stencil27_overlapped(s);
160 | 	      else if (strcmp(dt, "float") == 0) float_stencil27_overlapped(s);
161 | 	      else if (strcmp(dt, "int") == 0) int_stencil27_overlapped(s);
162 | 	      else if (world_rank==0){
163 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
164 | 	      }
165 | 	  }
166 | 
167 | 	  else if(strcmp(o, "19") == 0){
168 | 	      if(strcmp(dt, "double") == 0) double_stencil19_overlapped(s);
169 | 	      else if (strcmp(dt, "float") == 0) float_stencil19_overlapped(s);
170 | 	      else if (strcmp(dt, "int") == 0) int_stencil19_overlapped(s);
171 | 	      else if (world_rank==0){
172 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
173 | 	      }
174 | 	  }
175 | 
176 | 
177 | 	  else if(strcmp(o, "9") == 0){
178 | 	      if(strcmp(dt, "double") == 0) double_stencil9_overlapped(s);
179 | 	      else if (strcmp(dt, "float") == 0) float_stencil9_overlapped(s);
180 | 	      else if (strcmp(dt, "int") == 0) int_stencil9_overlapped(s);
181 | 	      else if (world_rank==0){
182 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
183 | 	      }
184 | 	  }
185 | 
186 | 
187 | 	  else if(strcmp(o, "5") == 0){
188 | 	      if(strcmp(dt, "double") == 0) double_stencil5_overlapped(s);
189 | 	      else if (strcmp(dt, "float") == 0) float_stencil5_overlapped(s);
190 | 	      else if (strcmp(dt, "int") == 0) int_stencil5_overlapped(s);
191 | 	      else if (world_rank==0){
192 | 		  fprintf(stderr, "ERROR: check you are using a valid data type...\n");
193 | 	      }
194 | 	  }
195 | 
196 | 
197 | 	  else if (world_rank==0){
198 | 	      fprintf(stderr, "ERROR: check you are using a valid operation type...\n");
199 | 	  }
200 | 
201 |       }
202 |       else if (world_rank == 0) {
203 | 	  fprintf(stderr, "ERROR: check you are using a valid algorithm.\n");
204 |       }
205 |   }
206 | 
207 |   else if (strcmp(b, "fileparse") == 0){
208 | 
209 |       if(strcmp(o, "dot_product") == 0){
210 | 	  fileparse(s);
211 |       }
212 | 
213 |       else if (world_rank==0){
214 | 	  fprintf(stderr, "ERROR: check you are using a valid operation type...\n");
215 |       }
216 | 
217 |   }  
218 |   else if (strcmp(b, "cg") == 0) {
219 |       if (strcmp(algo, "mixed") == 0) {
220 | 	  conjugate_gradient_mixed(s);
221 |       }
222 |       else if (strcmp(algo, "normal") == 0) {
223 | 	  conjugate_gradient(s);
224 |       }
225 |       else fprintf(stderr, "ERROR: check you are using a valid algorithm...\n");
226 |   }
227 | 
228 |   else if (world_rank==0){
229 |     fprintf(stderr, "ERROR: check you are using a valid benchmark...\n");
230 |   }
231 | 
232 | 
233 | }
234 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/cg.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 The University of Edinburgh. */
  2 | 
  3 | /* 
  4 | * This software was developed as part of the                       
  5 | * EC FP7 funded project Adept (Project ID: 610490)                 
  6 | * www.adept-project.eu                                            
  7 | */
  8 | 
  9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
 10 | /* you may not use this file except in compliance with the License. */
 11 | /* You may obtain a copy of the License at */
 12 | 
 13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
 14 | 
 15 | /* Unless required by applicable law or agreed to in writing, software */
 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
 18 | /* See the License for the specific language governing permissions and */
 19 | /* limitations under the License. */
 20 | 
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <sys/time.h>
 24 | #include <time.h>
 25 | #include <math.h>
 26 | 
 27 | #include <mpi.h>
 28 | 
 29 | #include "utils.h"
 30 | 
/* Convergence tolerance of the double precision solver loops; it is squared
 * before being compared against the residual dot product r.r */
#define PCG_TOLERANCE 1e-3
/* Iteration limit: the solver loops keep running while k <= PCG_MAX_ITER */
#define PCG_MAX_ITER 1000
/* Tolerance of the single precision phase of the mixed precision solver;
 * also squared before being compared against r.r */
#define PCG_FLOAT_TOLERANCE 1e-2
 34 | 
 35 | /* Conjugate gradient benchmark */
 36 | 
 37 | 
 38 | /* struct for CSR matrix type */
 39 | typedef struct
 40 | {
 41 |   int     nrow;
 42 |   int     ncol;
 43 |   int     nzmax;
 44 |   int    *colIndex;
 45 |   int    *rowStart;
 46 |   double *values;
 47 | } CSRmatrix;
 48 | 
 49 | typedef struct
 50 | {
 51 |   int     nrow;
 52 |   int     ncol;
 53 |   int     nzmax;
 54 |   int    *colIndex;
 55 |   int    *rowStart;
 56 |   float  *values;
 57 | } CSRmatrixF;
 58 | 
 59 | /*
 60 |  * Sparse matrix and vector utility functions
 61 |  */
 62 | static void CSR_matrix_vector_mult(CSRmatrix *A, double *x, double *b)
 63 | {
 64 |   int i, j;
 65 |   for (i = 0; i < A->nrow; i++) {
 66 |     double sum = 0.0;
 67 |     for (j = A->rowStart[i]; j < A->rowStart[i+1]; j++) {
 68 |       sum += A->values[j] * x[A->colIndex[j]];
 69 |     }
 70 |     b[i] = sum;
 71 |   }
 72 | }
 73 | 
 74 | static void CSR_matrix_vector_multF(CSRmatrixF *A, float *x, float *b)
 75 | {
 76 |   int i, j;
 77 |   for (i = 0; i < A->nrow; i++) {
 78 |     float sum = 0.0;
 79 |     for (j = A->rowStart[i]; j < A->rowStart[i+1]; j++) {
 80 |       sum += A->values[j] * x[A->colIndex[j]];
 81 |     }
 82 |     b[i] = sum;
 83 |   }
 84 | }
 85 | 
 86 | static double dotProduct(double *v1, double *v2, int size)
 87 | {
 88 |   int i;
 89 |   double result = 0.0;
 90 |   double full_result;
 91 |   for (i = 0; i < size; i++) {
 92 |     result += v1[i] * v2[i];
 93 |   }
 94 |   MPI_Allreduce(&result, &full_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 95 |   return full_result;
 96 | }
 97 | 
 98 | static float dotProductF(float *v1, float *v2, int size)
 99 | {
100 |   int i;
101 |   float result = 0.0;
102 |   float full_result;
103 |   for (i = 0; i < size; i++) {
104 |     result += v1[i] * v2[i];
105 |   }
106 |   MPI_Allreduce(&result, &full_result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
107 |   return full_result;
108 | }
109 | 
/*
 * y = y + alpha * x on the first `size` elements (double precision).
 * x is only read, y is updated in place.
 */
static void vecAxpy(double *x, double *y, int size, double alpha)
{
  int idx = 0;
  while (idx < size) {
    y[idx] += alpha * x[idx];
    idx++;
  }
}
117 | 
/*
 * y = y + alpha * x on the first `size` elements (single precision).
 * x is only read, y is updated in place.
 */
static void vecAxpyF(float *x, float *y, int size, float alpha)
{
  int idx = 0;
  while (idx < size) {
    y[idx] += alpha * x[idx];
    idx++;
  }
}
125 | 
126 | 
/*
 * y = alpha * y + x on the first `size` elements (double precision).
 * x is only read, y is updated in place.
 */
static void vecAypx(double *x, double *y, int size, double alpha)
{
  int idx = 0;
  while (idx < size) {
    const double scaled = alpha * y[idx];
    y[idx] = scaled + x[idx];
    idx++;
  }
}
134 | 
/*
 * y = alpha * y + x on the first `size` elements (single precision).
 * x is only read, y is updated in place.
 */
static void vecAypxF(float *x, float *y, int size, float alpha)
{
  int idx = 0;
  while (idx < size) {
    const float scaled = alpha * y[idx];
    y[idx] = scaled + x[idx];
    idx++;
  }
}
142 | 
143 | 
144 | int conjugate_gradient(unsigned int s)
145 | {
146 |   CSRmatrix *A;
147 |   int i;
148 |   double *x, *b, *r, *p, *omega;
149 |   int k;
150 |   double r0, r1, beta, dot, alpha;
151 |   double tol = PCG_TOLERANCE * PCG_TOLERANCE;
152 | 
153 |   struct timespec start, end;
154 | 
155 |   int size, rank;
156 |   int local_s, local_start;
157 |   double *full_p;
158 | 
159 |   MPI_Comm_size(MPI_COMM_WORLD, &size);
160 |   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
161 | 
162 |   /* determine local size and starting position */
163 |   local_s = s / size;
164 |   local_start = local_s * rank;
165 | 
166 |   /*======================================================================
167 |    *
168 |    * generate a random matrix of size s x s
169 |    *
170 |    *======================================================================*/
171 |   A = malloc(sizeof(CSRmatrix));
172 |   A->nrow = local_s;
173 |   A->ncol = s;
174 |   A->nzmax = local_s;
175 |   A->colIndex = malloc(A->nzmax * sizeof(int));
176 |   A->rowStart = malloc((A->nrow+1) * sizeof(int));
177 |   A->values = malloc(A->nzmax * sizeof(double));
178 | 
179 |   /* generate structure for matrix */
180 |   for (i = 0; i < A->nrow; i++) {
181 |     A->rowStart[i] = i;
182 |     A->colIndex[i] = i + local_start;
183 |   }
184 |   A->rowStart[i] = i;
185 | 
186 |   /* now generate values for matrix */
187 |   srand((unsigned int)time(NULL));
188 | 
189 |   for (i = 0; i < A->nzmax; i++) {
190 |     A->values[i] = rand() / 32768.0;
191 |   }
192 | 
193 |   /*======================================================================
194 |    *
195 |    * Initialise vectors
196 |    *
197 |    *======================================================================*/
198 |   /* allocate vectors (unknowns, RHS and temporaries) */
199 |   x = malloc(local_s * sizeof(double));
200 |   b = malloc(local_s * sizeof(double));
201 |   r = malloc(local_s * sizeof(double));
202 |   p = malloc(local_s * sizeof(double));
203 |   omega = malloc(local_s * sizeof(double));
204 | 
205 |   full_p = malloc(s * sizeof(double));
206 | 
207 |   /* generate a random vector of size s for the unknowns */
208 |   for (i = 0; i < local_s; i++) {
209 |     x[i] = rand() / 32768.0;
210 |   }
211 | 
212 |   /* multiply matrix by vector to get RHS */
213 |   CSR_matrix_vector_mult(A, x, b);
214 | 
215 |   /* clear initial guess and initialise temporaries */
216 |   for (i = 0; i < local_s; i++) {
217 |     x[i] = 0.0;
218 | 
219 |     /* r = b - Ax; since x is 0, r = b */
220 |     r[i] = b[i];
221 | 
222 |     /* p = r ( = b)*/
223 |     p[i] = b[i];
224 | 
225 |     omega[i] = 0.0;
226 |   }
227 | 
228 | 
229 |   clock_gettime(CLOCK, &start);
230 | 
231 |   /* compute initial residual */
232 |   r1 = dotProduct(r, r, local_s);
233 |   r0 = r1;
234 | 
235 |   /*======================================================================
236 |    *
237 |    * Actual solver loop
238 |    *
239 |    *======================================================================*/
240 |   k = 0;
241 |   while ((r1 > tol) && (k <= PCG_MAX_ITER)) {
242 |     MPI_Allgather(p, local_s, MPI_DOUBLE, full_p, local_s, MPI_DOUBLE, MPI_COMM_WORLD);
243 | 
244 |     /* omega = Ap */
245 |     CSR_matrix_vector_mult(A, full_p, omega);
246 | 
247 |     /* dot = p . omega */
248 |     dot = dotProduct(p, omega, local_s);
249 | 
250 |     alpha = r1 / dot;
251 | 
252 |     /* x = x + alpha.p */
253 |     vecAxpy(p, x, local_s, alpha);
254 | 
255 |     /* r = r - alpha.omega */
256 |     vecAxpy(omega, r, local_s, -alpha);
257 | 
258 |     r0 = r1;
259 | 
260 |     /* r1 = r . r */
261 |     r1 = dotProduct(r, r, local_s);
262 | 
263 |     beta = r1 / r0;
264 | 
265 |     /* p = r + beta.p */
266 |     vecAypx(r, p, local_s, beta);
267 |     k++;
268 |   }
269 | 
270 |   clock_gettime(CLOCK, &end);
271 |   if (rank == 0) {
272 |       elapsed_time_hr(start, end, "Conjugate gradient solve.");
273 |   }
274 | 
275 |   /*======================================================================
276 |    *
277 |    * Free memory
278 |    *
279 |    *======================================================================*/
280 |   /* free the vectors */
281 |   free(omega);
282 |   free(p);
283 |   free(r);
284 |   free(b);
285 |   free(x);
286 |   free(full_p);
287 | 
288 |   /* free the matrix */
289 |   free(A->colIndex);
290 |   free(A->rowStart);
291 |   free(A->values);
292 |   free(A);
293 |   return 0;
294 | }
295 | 
296 | 
297 | /* mixed precision version */
298 | int conjugate_gradient_mixed(unsigned int s)
299 | {
300 |   CSRmatrix *A;
301 |   CSRmatrixF *AF;
302 |   int i;
303 |   double *x, *b, *r, *p, *omega;
304 |   float *xf, *bf, *rf, *pf, *omegaf;
305 |   int k;
306 |   double r0, r1, beta, dot, alpha;
307 |   float r0f, r1f, betaf, dotf, alphaf;
308 |   double tol = PCG_FLOAT_TOLERANCE * PCG_FLOAT_TOLERANCE;
309 | 
310 |   struct timespec start, end;
311 | 
312 |   int size, rank;
313 |   int local_s, local_start;
314 |   double *full_p;
315 |   float *full_pF;
316 | 
317 |   MPI_Comm_size(MPI_COMM_WORLD, &size);
318 |   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
319 | 
320 |   /* determine local size and starting position */
321 |   local_s = s / size;
322 |   local_start = local_s * rank;
323 | 
324 |   /*======================================================================
325 |    *
326 |    * generate a random matrix of size s x s
327 |    *
328 |    *======================================================================*/
329 |   A = malloc(sizeof(CSRmatrix));
330 |   A->nrow = local_s;
331 |   A->ncol = s;
332 |   A->nzmax = local_s;
333 |   A->colIndex = malloc(A->nzmax * sizeof(int));
334 |   A->rowStart = malloc((A->nrow+1) * sizeof(int));
335 |   A->values = malloc(A->nzmax * sizeof(double));
336 | 
337 |   AF = malloc(sizeof(CSRmatrixF));
338 |   AF->nrow = local_s;
339 |   AF->ncol = s;
340 |   AF->nzmax = local_s;
341 |   AF->colIndex = malloc(AF->nzmax * sizeof(int));
342 |   AF->rowStart = malloc((AF->nrow+1) * sizeof(int));
343 |   AF->values = malloc(AF->nzmax * sizeof(float));
344 | 
345 |   /* generate structure for matrix */
346 |   for (i = 0; i < A->nrow; i++) {
347 |     A->rowStart[i] = i;
348 |     A->colIndex[i] = i + local_start;
349 | 
350 |     AF->rowStart[i] = i;
351 |     AF->colIndex[i] = i + local_start;
352 |   }
353 |   A->rowStart[i] = i;
354 |   AF->rowStart[i] = i;
355 | 
356 |   /* now generate values for matrix */
357 |   srand((unsigned int)time(NULL));
358 | 
359 |   for (i = 0; i < A->nzmax; i++) {
360 |     A->values[i] = rand() / 32768.0;
361 |     AF->values[i] = (float)A->values[i];
362 |   }
363 | 
364 |   /*======================================================================
365 |    *
366 |    * Initialise vectors
367 |    *
368 |    *======================================================================*/
369 |   /* allocate vectors (unknowns, RHS and temporaries) */
370 |   x = malloc(local_s * sizeof(double));
371 |   b = malloc(local_s * sizeof(double));
372 |   r = malloc(local_s * sizeof(double));
373 |   p = malloc(local_s * sizeof(double));
374 |   omega = malloc(local_s * sizeof(double));
375 | 
376 |   full_p = malloc(s * sizeof(double));
377 | 
378 |   xf = malloc(local_s * sizeof(float));
379 |   bf = malloc(local_s * sizeof(float));
380 |   rf = malloc(local_s * sizeof(float));
381 |   pf = malloc(local_s * sizeof(float));
382 |   omegaf = malloc(local_s * sizeof(float));
383 | 
384 |   full_pF = malloc(s * sizeof(float));
385 | 
386 |   /* generate a random vector of size s for the unknowns */
387 |   for (i = 0; i < local_s; i++) {
388 |     x[i] = rand() / 32768.0;
389 |     xf[i] = (float)x[i];
390 |   }
391 | 
392 |   /* multiply matrix by vector to get RHS */
393 |   CSR_matrix_vector_mult(A, x, b);
394 |   CSR_matrix_vector_multF(AF, xf, bf);
395 | 
396 |   /* clear initial guess and initialise temporaries */
397 |   for (i = 0; i < local_s; i++) {
398 |     x[i] = 0.0;
399 |     xf[i] = 0.0;
400 | 
401 |     /* r = b - Ax; since x is 0, r = b */
402 |     r[i] = b[i];
403 |     rf[i] = bf[i];
404 | 
405 |     /* p = r ( = b)*/
406 |     p[i] = b[i];
407 |     pf[i] = bf[i];
408 | 
409 |     omega[i] = 0.0;
410 |     omegaf[i] = 0.0;
411 |   }
412 | 
413 | 
414 |   clock_gettime(CLOCK, &start);
415 | 
416 |   /* compute initial residual */
417 |   r1f = dotProductF(rf, rf, local_s);
418 |   r0f = r1f;
419 | 
420 |   /*======================================================================
421 |    *
422 |    * Actual solver loop (single precision)
423 |    *
424 |    *======================================================================*/
425 |   k = 0;
426 |   while ((r1f > tol) && (k <= PCG_MAX_ITER)) {
427 |     MPI_Allgather(pf, local_s, MPI_FLOAT, full_pF, local_s, MPI_FLOAT, MPI_COMM_WORLD);
428 | 
429 |     /* omega = Ap */
430 |     CSR_matrix_vector_multF(AF, full_pF, omegaf);
431 | 
432 |     /* dot = p . omega */
433 |     dotf = dotProductF(pf, omegaf, local_s);
434 | 
435 |     alphaf = r1f / dotf;
436 | 
437 |     /* x = x + alpha.p */
438 |     vecAxpyF(pf, xf, local_s, alphaf);
439 | 
440 |     /* r = r - alpha.omega */
441 |     vecAxpyF(omegaf, rf, local_s, -alphaf);
442 | 
443 |     r0f = r1f;
444 | 
445 |     /* r1 = r . r */
446 |     r1f = dotProductF(rf, rf, local_s);
447 | 
448 |     betaf = r1f / r0f;
449 | 
450 |     /* p = r + beta.p */
451 |     vecAypxF(rf, pf, local_s, betaf);
452 |     k++;
453 |   }
454 | 
455 |   /* convert for double precision iterations */
456 |   r1 = (double)r1f;
457 |   r0 = (double)r0f;
458 |   for (i = 0; i < local_s; i++) {
459 |       r[i] = (double)rf[i];
460 |       p[i] = (double)pf[i];
461 |       x[i] = (double)xf[i];
462 |   }
463 | 
464 |   tol = PCG_TOLERANCE * PCG_TOLERANCE;
465 | 
466 |   /*======================================================================
467 |    *
468 |    * Actual solver loop
469 |    *
470 |    *======================================================================*/
471 |   while ((r1 > tol) && (k <= PCG_MAX_ITER)) {
472 |     MPI_Allgather(p, local_s, MPI_DOUBLE, full_p, local_s, MPI_DOUBLE, MPI_COMM_WORLD);
473 | 
474 |     /* omega = Ap */
475 |     CSR_matrix_vector_mult(A, full_p, omega);
476 | 
477 |     /* dot = p . omega */
478 |     dot = dotProduct(p, omega, local_s);
479 | 
480 |     alpha = r1 / dot;
481 | 
482 |     /* x = x + alpha.p */
483 |     vecAxpy(p, x, local_s, alpha);
484 | 
485 |     /* r = r - alpha.omega */
486 |     vecAxpy(omega, r, local_s, -alpha);
487 | 
488 |     r0 = r1;
489 | 
490 |     /* r1 = r . r */
491 |     r1 = dotProduct(r, r, local_s);
492 | 
493 |     beta = r1 / r0;
494 | 
495 |     /* p = r + beta.p */
496 |     vecAypx(r, p, local_s, beta);
497 |     k++;
498 |   }
499 | 
500 |   clock_gettime(CLOCK, &end);
501 |   if (rank == 0) {
502 |       elapsed_time_hr(start, end, "Conjugate gradient solve.");
503 |   }
504 | 
505 |   /*======================================================================
506 |    *
507 |    * Free memory
508 |    *
509 |    *======================================================================*/
510 |   /* free the vectors */
511 |   free(omega);
512 |   free(p);
513 |   free(r);
514 |   free(b);
515 |   free(x);
516 |   free(full_p);
517 | 
518 |   free(omegaf);
519 |   free(pf);
520 |   free(rf);
521 |   free(bf);
522 |   free(xf);
523 |   free(full_pF);
524 | 
525 |   /* free the matrix */
526 |   free(A->colIndex);
527 |   free(A->rowStart);
528 |   free(A->values);
529 |   free(A);
530 | 
531 |   free(AF->colIndex);
532 |   free(AF->rowStart);
533 |   free(AF->values);
534 |   free(AF);
535 | 
536 |   return 0;
537 | }
538 | 


--------------------------------------------------------------------------------
/blas_op.c:
--------------------------------------------------------------------------------
   1 | /* Copyright (c) 2015 The University of Edinburgh. */
   2 | 
   3 | /* 
   4 | * This software was developed as part of the                       
   5 | * EC FP7 funded project Adept (Project ID: 610490)                 
   6 | * www.adept-project.eu                                            
   7 | */
   8 | 
   9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */
  10 | /* you may not use this file except in compliance with the License. */
  11 | /* You may obtain a copy of the License at */
  12 | 
  13 | /*     http://www.apache.org/licenses/LICENSE-2.0 */
  14 | 
  15 | /* Unless required by applicable law or agreed to in writing, software */
  16 | /* distributed under the License is distributed on an "AS IS" BASIS, */
  17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
  18 | /* See the License for the specific language governing permissions and */
  19 | /* limitations under the License. */
  20 | 
  21 | #include <stdlib.h>
  22 | #include <stdio.h>
  23 | #include <sys/time.h>
  24 | #include <time.h>
  25 | #include <math.h>
  26 | #include <string.h>
  27 | #include <limits.h>
  28 | #include <mpi.h>
  29 | 
  30 | #include "level1.h"
  31 | #include "utils.h"
  32 | #include "matrix_utils.h"
  33 | 
  34 | /*
  35 |  * Vector dot product, integers
  36 |  *
  37 |  * result = result + v1_i * v2_i
  38 |  *
  39 |  * Input: size of the vectors (in number of elements)
  40 |  * Output: dot product
  41 |  *
  42 |  */
  43 | int int_dot_product(unsigned int size) {
  44 | 
  45 |     int world_size, world_rank;
  46 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  47 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  48 | 
  49 | 
  50 |     int i;
  51 | 
  52 |     /*
  53 |      * Compute size of block each rank will work on
  54 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
  55 |      * in the MPI case as in the serial case.
  56 |      */
  57 |     int local_size = 0;
  58 |     if (world_rank != 0) {
  59 |         local_size = size / world_size;
  60 |     } else if (world_rank == 0) {
  61 |         local_size = (size / world_size) + (size % world_size);
  62 |     } else {
  63 |         printf("Some error occured in size calculation\n");
  64 |     }
  65 | 
  66 | 
  67 |     /* create two vectors */
  68 |     int *v1 = (int *) malloc(local_size * sizeof (int));
  69 |     int *v2 = (int *) malloc(local_size * sizeof (int));
  70 | 
  71 |     /* result variable */
  72 |     unsigned int result = 0;
  73 | 
  74 |     if (v1 == NULL || v2 == NULL) {
  75 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
  76 |         return 0;
  77 |     }
  78 | 
  79 |     srand((int) time(NULL));
  80 | 
  81 |     struct timespec start, end;
  82 |     unsigned int global_result = 0;
  83 | 
  84 |     /* fill vectors with random integer values */
  85 |     for (i = 0; i < local_size; i++) {
  86 |         v1[i] = (int) rand() / (int) (RAND_MAX / 10);
  87 |         v2[i] = (int) rand() / (int) (RAND_MAX / 10);
  88 |     }
  89 |     MPI_Barrier(MPI_COMM_WORLD);
  90 |     clock_gettime(CLOCK, &start);
  91 | 
  92 | 
  93 | 
  94 |     /* perform dot product */
  95 | 
  96 |     for (i = 0; i < local_size; i++) {
  97 |         result = result + v1[i] * v2[i];
  98 |     }
  99 | 
 100 |     MPI_Reduce(&result, &global_result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 101 | 
 102 |     clock_gettime(CLOCK, &end);
 103 | 
 104 |     /* print result so compiler does not throw it away */
 105 |     if (world_rank == 0) {
 106 |         printf("Dot product result: %d\n", result);
 107 |         elapsed_time_hr(start, end, "Integer dot product.");
 108 |     }
 109 |     free(v1);
 110 |     free(v2);
 111 | 
 112 |     return 0;
 113 | }
 114 | 
 115 | /*
 116 |  * Vector dot product, floats
 117 |  *
 118 |  * result = result + v1_i * v2_i
 119 |  *
 120 |  * Input: size of the vectors (in number of elements)
 121 |  * Output: dot product
 122 |  *
 123 |  */
 124 | int float_dot_product(unsigned int size) {
 125 | 
 126 |     int world_size, world_rank;
 127 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 128 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 129 | 
 130 |     int i;
 131 | 
 132 |     /*
 133 |      * Compute size of block each rank will work on
 134 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 135 |      * in the MPI case as in the serial case.
 136 |      */
 137 |     int local_size = 0;
 138 |     if (world_rank != 0) {
 139 |         local_size = size / world_size;
 140 |     } else if (world_rank == 0) {
 141 |         local_size = (size / world_size) + (size % world_size);
 142 |     } else {
 143 |         printf("Some error occured in size calculation\n");
 144 |     }
 145 | 
 146 | 
 147 |     /* create two vectors */
 148 |     float *v1 = (float *) malloc(local_size * sizeof (float));
 149 |     float *v2 = (float *) malloc(local_size * sizeof (float));
 150 | 
 151 |     /* result variable */
 152 |     float result = 0;
 153 | 
 154 |     if (v1 == NULL || v2 == NULL) {
 155 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
 156 |         return 0;
 157 |     }
 158 | 
 159 |     srand((int) time(NULL));
 160 | 
 161 |     struct timespec start, end;
 162 |     float global_result = 0;
 163 | 
 164 |     /* fill vectors with random integer values */
 165 |     for (i = 0; i < local_size; i++) {
 166 |         v1[i] = (float) rand() / (float) (RAND_MAX / 10);
 167 |         v2[i] = (float) rand() / (float) (RAND_MAX / 10);
 168 |     }
 169 | 
 170 |     MPI_Barrier(MPI_COMM_WORLD);
 171 |     clock_gettime(CLOCK, &start);
 172 | 
 173 | 
 174 | 
 175 |     /* perform dot product */
 176 | 
 177 |     for (i = 0; i < local_size; i++) {
 178 |         result = result + v1[i] * v2[i];
 179 |     }
 180 | 
 181 |     MPI_Reduce(&result, &global_result, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
 182 | 
 183 |     clock_gettime(CLOCK, &end);
 184 | 
 185 |     /* print result so compiler does not throw it away */
 186 |     if (world_rank == 0) {
 187 |         printf("Dot product result: %f\n", result);
 188 |         elapsed_time_hr(start, end, "Float dot product.");
 189 |     }
 190 |     free(v1);
 191 |     free(v2);
 192 | 
 193 |     return 0;
 194 | 
 195 | }
 196 | 
 197 | /*
 198 |  * Vector dot product, doubles
 199 |  *
 200 |  * result = result + v1_i * v2_i
 201 |  *
 202 |  * Input: size of the vectors (in number of elements)
 203 |  * Output: dot product
 204 |  *
 205 |  */
 206 | int double_dot_product(unsigned int size) {
 207 | 
 208 |     int world_size, world_rank;
 209 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 210 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 211 | 
 212 |     int i;
 213 | 
 214 |     /*
 215 |      * Compute size of block each rank will work on
 216 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 217 |      * in the MPI case as in the serial case.
 218 |      */
 219 |     int local_size = 0;
 220 |     if (world_rank != 0) {
 221 |         local_size = size / world_size;
 222 |     } else if (world_rank == 0) {
 223 |         local_size = (size / world_size) + (size % world_size);
 224 |     } else {
 225 |         printf("Some error occured in size calculation\n");
 226 |     }
 227 | 
 228 | 
 229 |     /* create two vectors */
 230 |     double *v1 = (double *) malloc(local_size * sizeof (double));
 231 |     double *v2 = (double *) malloc(local_size * sizeof (double));
 232 | 
 233 |     /* result variable */
 234 |     double result = 0;
 235 | 
 236 |     if (v1 == NULL || v2 == NULL) {
 237 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
 238 |         return 0;
 239 |     }
 240 | 
 241 |     srand((int) time(NULL));
 242 | 
 243 |     struct timespec start, end;
 244 |     double global_result = 0;
 245 | 
 246 |     /* fill vectors with random integer values */
 247 |     for (i = 0; i < local_size; i++) {
 248 |         v1[i] = (double) rand() / (double) (RAND_MAX / 10);
 249 |         v2[i] = (double) rand() / (double) (RAND_MAX / 10);
 250 |     }
 251 | 
 252 |     MPI_Barrier(MPI_COMM_WORLD);
 253 |     clock_gettime(CLOCK, &start);
 254 | 
 255 | 
 256 | 
 257 |     /* perform dot product */
 258 | 
 259 |     for (i = 0; i < local_size; i++) {
 260 |         result = result + v1[i] * v2[i];
 261 |     }
 262 | 
 263 |     MPI_Reduce(&result, &global_result, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
 264 | 
 265 |     clock_gettime(CLOCK, &end);
 266 | 
 267 |     /* print result so compiler does not throw it away */
 268 |     if (world_rank == 0) {
 269 |         printf("Dot product result: %f\n", result);
 270 |         elapsed_time_hr(start, end, "Double dot product.");
 271 |     }
 272 |     free(v1);
 273 |     free(v2);
 274 | 
 275 |     return 0;
 276 | 
 277 | }
 278 | 
 279 | 
 280 | /* Vector scalar multiplication, integers    */
 281 | 
 282 | /* v_i = a * v1_i                     */
 283 | int int_scalar_mult(unsigned int size) {
 284 | 
 285 |     int world_size, world_rank;
 286 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 287 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 288 | 
 289 |     int i;
 290 | 
 291 |     /*
 292 |      * Compute size of block each rank will work on
 293 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 294 |      * in the MPI case as in the serial case.
 295 |      */
 296 |     int local_size = 0;
 297 |     int local_size_other = 0;
 298 |     int rcounts[world_size];
 299 |     int displs[world_size];
 300 | 
 301 |     if (world_rank != 0) {
 302 |         local_size = size / world_size;
 303 |     } else if (world_rank == 0) {
 304 |         local_size = (size / world_size) + (size % world_size);
 305 |         local_size_other = size / world_size; // Needed for gatherv
 306 |         rcounts[0] = local_size;
 307 |         displs[0] = 0;
 308 |         for (i = 1; i < world_size; i++) {
 309 |             rcounts[i] = local_size_other;
 310 |             displs[i] = i*local_size_other;
 311 |         }
 312 |     } else {
 313 |         printf("Some error occured in size calculation\n");
 314 |     }
 315 | 
 316 |     /* create vector and scalar */
 317 |     int *v = (int *) malloc(local_size * sizeof (int));
 318 |     unsigned int a = 0;
 319 |     int* rbuf = NULL;
 320 | 
 321 |     /* We only need this space allocated on PE 0 */
 322 |     if (world_rank == 0) {
 323 |         rbuf = (int *) malloc(size * sizeof (int));
 324 |         if (rbuf == NULL) {
 325 |             printf("Out Of Memory: could not allocate space for the array.\n");
 326 |             return 0;
 327 |         }
 328 |     }
 329 | 
 330 |     if (v == NULL) {
 331 |         printf("Out Of Memory: could not allocate space for the array.\n");
 332 |         return 0;
 333 |     }
 334 | 
 335 |     srand((int) time(NULL));
 336 | 
 337 |     struct timespec start, end;
 338 | 
 339 |     /* fill vector with random ints */
 340 |     for (i = 0; i < local_size; i++) {
 341 |         v[i] = (int) rand() / (int) (RAND_MAX / 10);
 342 |     }
 343 | 
 344 |     /* assign random int value */
 345 |     a = (int) rand() / (int) (RAND_MAX / 10);
 346 | 
 347 |     if (world_size > 1) {
 348 |         MPI_Bcast(&a, 1, MPI_INT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
 349 |     }
 350 | 
 351 |     MPI_Barrier(MPI_COMM_WORLD);
 352 | 
 353 |     if (world_rank == 0) {
 354 |         clock_gettime(CLOCK, &start);
 355 |     }
 356 | 
 357 | 
 358 |     /* perform scalar product */
 359 |     for (i = 0; i < local_size; i++) {
 360 |         v[i] = a * v[i];
 361 |     }
 362 | 
 363 |     /* Collect elements on PE 0 */
 364 |     if (world_size > 1) {
 365 |         MPI_Gatherv(v, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD);
 366 |     }
 367 | 
 368 |     if (world_rank == 0) {
 369 |         clock_gettime(CLOCK, &end);
 370 |     }
 371 | 
 372 |     /* print result so compiler does not throw it away */
 373 |     if (a == 999999) {
 374 |         printf("Scalar product result: %d\n", v[0]);
 375 |     }
 376 | 
 377 |     if (world_rank == 0) {
 378 |         elapsed_time_hr(start, end, "Int scalar multiplication.");
 379 |         free(rbuf);
 380 |     }
 381 |     free(v);
 382 | 
 383 | 
 384 |     return 0;
 385 | 
 386 | }
 387 | 
 388 | /* Vector scalar product, floats    */
 389 | 
 390 | /* v_i = a * v1_i                     */
 391 | int float_scalar_mult(unsigned int size) {
 392 | 
 393 |     int world_size, world_rank;
 394 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 395 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 396 | 
 397 |     int i;
 398 | 
 399 |     /*
 400 |      * Compute size of block each rank will work on
 401 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 402 |      * in the MPI case as in the serial case.
 403 |      */
 404 |     int local_size = 0;
 405 |     int local_size_other = 0;
 406 |     int rcounts[world_size];
 407 |     int displs[world_size];
 408 | 
 409 |     if (world_rank != 0) {
 410 |         local_size = size / world_size;
 411 |     } else if (world_rank == 0) {
 412 |         local_size = (size / world_size) + (size % world_size);
 413 |         local_size_other = size / world_size; // Needed for gatherv
 414 |         rcounts[0] = local_size;
 415 |         displs[0] = 0;
 416 |         for (i = 1; i < world_size; i++) {
 417 |             rcounts[i] = local_size_other;
 418 |             displs[i] = i*local_size_other;
 419 |         }
 420 |     } else {
 421 |         printf("Some error occured in size calculation\n");
 422 |     }
 423 | 
 424 |     /* create vector and scalar */
 425 |     float *v = (float *) malloc(local_size * sizeof (float));
 426 |     unsigned int a = 0;
 427 |     float* rbuf = NULL;
 428 | 
 429 |     /* We only need this space allocated on PE 0 */
 430 |     if (world_rank == 0) {
 431 |         rbuf = (float *) malloc(size * sizeof (float));
 432 |         if (rbuf == NULL) {
 433 |             printf("Out Of Memory: could not allocate space for the array.\n");
 434 |             return 0;
 435 |         }
 436 |     }
 437 | 
 438 |     if (v == NULL) {
 439 |         printf("Out Of Memory: could not allocate space for the array.\n");
 440 |         return 0;
 441 |     }
 442 | 
 443 |     srand((int) time(NULL));
 444 | 
 445 |     struct timespec start, end;
 446 | 
 447 |     /* fill vector with random floats */
 448 |     for (i = 0; i < local_size; i++) {
 449 |         v[i] = (float) rand() / (float) (RAND_MAX / 10);
 450 |     }
 451 | 
 452 |     /* assign random float value */
 453 |     a = (float) rand() / (float) (RAND_MAX / 10);
 454 | 
 455 |     if (world_size > 1) {
 456 |         MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
 457 |     }
 458 |     MPI_Barrier(MPI_COMM_WORLD);
 459 |     if (world_rank == 0) {
 460 |         clock_gettime(CLOCK, &start);
 461 |     }
 462 | 
 463 | 
 464 |     /* perform scalar product */
 465 |     for (i = 0; i < local_size; i++) {
 466 |         v[i] = a * v[i];
 467 |     }
 468 | 
 469 |     /* Collect elements on PE 0 */
 470 |     if (world_size > 1) {
 471 |         MPI_Gatherv(v, local_size, MPI_FLOAT, rbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
 472 |     }
 473 | 
 474 |     if (world_rank == 0) {
 475 |         clock_gettime(CLOCK, &end);
 476 |     }
 477 | 
 478 |     /* print result so compiler does not throw it away */
 479 |     if (a == 999999) {
 480 |         printf("Scalar product result: %f\n", v[0]);
 481 |     }
 482 | 
 483 |     if (world_rank == 0) {
 484 |         elapsed_time_hr(start, end, "Float scalar multiplication.");
 485 |         free(rbuf);
 486 |     }
 487 |     free(v);
 488 | 
 489 |     return 0;
 490 | 
 491 | }
 492 | 
 493 | /* Vector scalar product, doubles    */
 494 | 
 495 | /* v_i = a * v1_i                     */
 496 | int double_scalar_mult(unsigned int size) {
 497 | 
 498 | 
 499 |     int world_size, world_rank;
 500 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 501 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 502 | 
 503 |     int i;
 504 | 
 505 |     /*
 506 |      * Compute size of block each rank will work on
 507 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 508 |      * in the MPI case as in the serial case.
 509 |      */
 510 |     int local_size = 0;
 511 |     int local_size_other = 0;
 512 |     int rcounts[world_size];
 513 |     int displs[world_size];
 514 | 
 515 |     if (world_rank != 0) {
 516 |         local_size = size / world_size;
 517 |     } else if (world_rank == 0) {
 518 |         local_size = (size / world_size) + (size % world_size);
 519 |         local_size_other = size / world_size; // Needed for gatherv
 520 |         rcounts[0] = local_size;
 521 |         displs[0] = 0;
 522 |         for (i = 1; i < world_size; i++) {
 523 |             rcounts[i] = local_size_other;
 524 |             displs[i] = i*local_size_other;
 525 |         }
 526 |     } else {
 527 |         printf("Some error occured in size calculation\n");
 528 |     }
 529 | 
 530 |     /* create vector and scalar */
 531 |     double *v = (double *) malloc(local_size * sizeof (double));
 532 |     unsigned int a = 0;
 533 |     double* rbuf = NULL;
 534 | 
 535 |     /* We only need this space allocated on PE 0 */
 536 |     if (world_rank == 0) {
 537 |         rbuf = (double *) malloc(size * sizeof (double));
 538 |         if (rbuf == NULL) {
 539 |             printf("Out Of Memory: could not allocate space for the array.\n");
 540 |             return 0;
 541 |         }
 542 |     }
 543 | 
 544 |     if (v == NULL) {
 545 |         printf("Out Of Memory: could not allocate space for the array.\n");
 546 |         return 0;
 547 |     }
 548 | 
 549 |     srand((int) time(NULL));
 550 | 
 551 |     struct timespec start, end;
 552 | 
 553 |     /* fill vector with random doubles */
 554 |     for (i = 0; i < local_size; i++) {
 555 |         v[i] = (double) rand() / (double) (RAND_MAX / 10);
 556 |     }
 557 | 
 558 |     /* assign random double value */
 559 |     a = (double) rand() / (double) (RAND_MAX / 10);
 560 |     if (world_size > 1) {
 561 |         MPI_Bcast(&a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
 562 |     }
 563 |     MPI_Barrier(MPI_COMM_WORLD);
 564 |     if (world_rank == 0) {
 565 |         clock_gettime(CLOCK, &start);
 566 |     }
 567 | 
 568 | 
 569 |     /* perform scalar product */
 570 |     for (i = 0; i < local_size; i++) {
 571 |         v[i] = a * v[i];
 572 |     }
 573 | 
 574 |     /* Collect elements on PE 0 */
 575 |     if (world_size > 1) {
 576 |         MPI_Gatherv(v, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 577 |     }
 578 | 
 579 |     if (world_rank == 0) {
 580 |         clock_gettime(CLOCK, &end);
 581 |     }
 582 | 
 583 |     /* print result so compiler does not throw it away */
 584 |     if (a == 999999) {
 585 |         printf("Scalar product result: %lf\n", v[0]);
 586 |     }
 587 | 
 588 |     if (world_rank == 0) {
 589 |         elapsed_time_hr(start, end, "Double scalar multiplication.");
 590 |         free(rbuf);
 591 |     }
 592 |     free(v);
 593 | 
 594 | 
 595 |     return 0;
 596 | 
 597 | }
 598 | 
 599 | int double_norm(unsigned int size) {
 600 | 
 601 |     int world_size, world_rank;
 602 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 603 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 604 | 
 605 |     int i;
 606 | 
 607 |     /*
 608 |      * Compute size of block each rank will work on
 609 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 610 |      * in the MPI case as in the serial case.
 611 |      */
 612 |     int local_size = 0;
 613 |     if (world_rank != 0) {
 614 |         local_size = size / world_size;
 615 |     } else if (world_rank == 0) {
 616 |         local_size = (size / world_size) + (size % world_size);
 617 |     } else {
 618 |         printf("Some error occured in size calculation\n");
 619 |     }
 620 | 
 621 | 
 622 |     double *v = (double *) malloc(local_size * sizeof (double));
 623 |     double sum = 0.0, norm = 0.0;
 624 | 
 625 |     if (v == NULL) {
 626 |         printf("Out Of Memory: could not allocate space for the array.\n");
 627 |         return 0;
 628 |     }
 629 | 
 630 |     srand((int) time(NULL));
 631 | 
 632 |     struct timespec start, end;
 633 | 
 634 |     /* fill vector with random doubles */
 635 |     for (i = 0; i < local_size; i++) {
 636 |         v[i] = (double) rand() / (double) (RAND_MAX / 10.0);
 637 |     }
 638 |     MPI_Barrier(MPI_COMM_WORLD);
 639 |     if (world_rank == 0) {
 640 |         clock_gettime(CLOCK, &start);
 641 |     }
 642 | 
 643 |     for (i = 0; i < local_size; i++) {
 644 |         sum = sum + (v[i] * v[i]);
 645 |     }
 646 | 
 647 |     /* REDUCE */
 648 |     double global_sum = 0.0;
 649 |     MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
 650 |     if (world_rank == 0) {
 651 |         norm = sqrt(sum);
 652 |         clock_gettime(CLOCK, &end);
 653 |         elapsed_time_hr(start, end, "Double vector norm.");
 654 |     }
 655 |     /* print result so compiler does not throw it away */
 656 | 
 657 |     if (v[0] == 99999) {
 658 |         printf("Norm = %f\n", norm);
 659 |     }
 660 | 
 661 |     free(v);
 662 |     return 0;
 663 | }
 664 | 
 665 | 
 666 | 
 667 | /* compute the Euclidean norm of a float vector      */
 668 | /* !!!! naive implementation -- find algorithm that  */
 669 | 
 670 | /* !!!! will avoid over/underflow for large vectors  */
 671 | int float_norm(unsigned int size) {
 672 | 
 673 |     int world_size, world_rank;
 674 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 675 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 676 | 
 677 |     int i;
 678 | 
 679 |     /*
 680 |      * Compute size of block each rank will work on
 681 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 682 |      * in the MPI case as in the serial case.
 683 |      */
 684 |     int local_size = 0;
 685 |     if (world_rank != 0) {
 686 |         local_size = size / world_size;
 687 |     } else if (world_rank == 0) {
 688 |         local_size = (size / world_size) + (size % world_size);
 689 |     } else {
 690 |         printf("Some error occured in size calculation\n");
 691 |     }
 692 | 
 693 | 
 694 |     float *v = (float *) malloc(local_size * sizeof (float));
 695 |     float sum = 0.0, norm = 0.0;
 696 | 
 697 |     if (v == NULL) {
 698 |         printf("Out Of Memory: could not allocate space for the array.\n");
 699 |         return 0;
 700 |     }
 701 | 
 702 |     srand((int) time(NULL));
 703 | 
 704 |     struct timespec start, end;
 705 | 
 706 |     /* fill vector with random floats */
 707 |     for (i = 0; i < local_size; i++) {
 708 |         v[i] = (float) rand() / (float) (RAND_MAX / 10.0);
 709 |     }
 710 |     MPI_Barrier(MPI_COMM_WORLD);
 711 |     if (world_rank == 0) {
 712 |         clock_gettime(CLOCK, &start);
 713 |     }
 714 | 
 715 |     for (i = 0; i < local_size; i++) {
 716 |         sum = sum + (v[i] * v[i]);
 717 |     }
 718 | 
 719 |     /* REDUCE */
 720 |     float global_sum = 0.0;
 721 |     MPI_Reduce(&sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
 722 |     if (world_rank == 0) {
 723 |         norm = sqrt(sum);
 724 |         clock_gettime(CLOCK, &end);
 725 |         elapsed_time_hr(start, end, "Float vector norm.");
 726 |     }
 727 |     /* print result so compiler does not throw it away */
 728 | 
 729 |     if (v[0] == 99999) {
 730 |         printf("Norm = %f\n", norm);
 731 |     }
 732 | 
 733 |     free(v);
 734 |     return 0;
 735 | }
 736 | 
 737 | int int_norm(unsigned int size) {
 738 | 
 739 |     int world_size, world_rank;
 740 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 741 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 742 | 
 743 |     int i;
 744 | 
 745 |     /*
 746 |      * Compute size of block each rank will work on
 747 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 748 |      * in the MPI case as in the serial case.
 749 |      */
 750 |     int local_size = 0;
 751 |     if (world_rank != 0) {
 752 |         local_size = size / world_size;
 753 |     } else if (world_rank == 0) {
 754 |         local_size = (size / world_size) + (size % world_size);
 755 |     } else {
 756 |         printf("Some error occured in size calculation\n");
 757 |     }
 758 | 
 759 | 
 760 |     int *v = (int *) malloc(local_size * sizeof (int));
 761 |     int sum = 0.0, norm = 0.0;
 762 | 
 763 |     if (v == NULL) {
 764 |         printf("Out Of Memory: could not allocate space for the array.\n");
 765 |         return 0;
 766 |     }
 767 | 
 768 |     srand((int) time(NULL));
 769 | 
 770 |     struct timespec start, end;
 771 | 
 772 |     /* fill vector with random ints */
 773 |     for (i = 0; i < local_size; i++) {
 774 |         v[i] = (int) rand() / (int) (RAND_MAX / 10.0);
 775 |     }
 776 |     MPI_Barrier(MPI_COMM_WORLD);
 777 |     if (world_rank == 0) {
 778 |         clock_gettime(CLOCK, &start);
 779 |     }
 780 | 
 781 |     for (i = 0; i < local_size; i++) {
 782 |         sum = sum + (v[i] * v[i]);
 783 |     }
 784 | 
 785 |     /* REDUCE */
 786 |     int global_sum = 0.0;
 787 |     MPI_Reduce(&sum, &global_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 788 |     if (world_rank == 0) {
 789 |         norm = sqrt(sum);
 790 |         clock_gettime(CLOCK, &end);
 791 |         elapsed_time_hr(start, end, "Int vector norm.");
 792 |     }
 793 |     /* print result so compiler does not throw it away */
 794 | 
 795 |     if (v[0] == 99999) {
 796 |         printf("Norm = %d\n", norm);
 797 |     }
 798 | 
 799 |     free(v);
 800 |     return 0;
 801 | }
 802 | 
 803 | /*
 804 |  *
 805 |  * Compute vector-scalar product
 806 |  * AXPY, integers
 807 |  *
 808 |  * y = a * x + y
 809 |  *
 810 |  * Naive implementation
 811 |  *
 812 |  */
 813 | int int_axpy(unsigned int size) {
 814 | 
 815 | 
 816 |     int world_size, world_rank;
 817 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 818 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 819 | 
 820 |     int i;
 821 | 
 822 |     /*
 823 |      * Compute size of block each rank will work on
 824 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 825 |      * in the MPI case as in the serial case.
 826 |      */
 827 |     int local_size = 0;
 828 |     int local_size_other = 0;
 829 |     int rcounts[world_size];
 830 |     int displs[world_size];
 831 | 
 832 |     if (world_rank != 0) {
 833 |         local_size = size / world_size;
 834 |     } else if (world_rank == 0) {
 835 |         local_size = (size / world_size) + (size % world_size);
 836 |         local_size_other = size / world_size; // Needed for gatherv
 837 |         rcounts[0] = local_size;
 838 |         displs[0] = 0;
 839 |         for (i = 1; i < world_size; i++) {
 840 |             rcounts[i] = local_size_other;
 841 |             displs[i] = i*local_size_other;
 842 |         }
 843 |     } else {
 844 |         printf("Some error occured in size calculation\n");
 845 |     }
 846 | 
 847 | 
 848 |     int a;
 849 |     int *x = (int *) malloc(local_size * sizeof (int));
 850 |     int *y = (int *) malloc(local_size * sizeof (int));
 851 | 
 852 |     int* rbuf = NULL;
 853 | 
 854 |     /* We only need this space allocated on PE 0 */
 855 |     if (world_rank == 0) {
 856 |         rbuf = (int *) malloc(size * sizeof (int));
 857 |         if (rbuf == NULL) {
 858 |             printf("Out Of Memory: could not allocate space for the array.\n");
 859 |             return 0;
 860 |         }
 861 |     }
 862 | 
 863 |     if (x == NULL || y == NULL) {
 864 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
 865 |         return 0;
 866 |     }
 867 | 
 868 |     srand((int) time(NULL));
 869 | 
 870 |     a = (int) rand() / (int) (RAND_MAX / 10);
 871 | 
 872 |     /* fill x and y vectors with random ints */
 873 | 
 874 |     for (i = 0; i < local_size; i++) {
 875 |         x[i] = (int) rand() / (int) (RAND_MAX / 10);
 876 |         y[i] = (int) rand() / (int) (RAND_MAX / 10);
 877 |     }
 878 | 
 879 |     struct timespec start, end;
 880 |     if (world_size > 1) {
 881 |         MPI_Bcast(&a, 1, MPI_INT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
 882 |     }
 883 |     MPI_Barrier(MPI_COMM_WORLD);
 884 |     if (world_rank == 0) {
 885 |         clock_gettime(CLOCK, &start);
 886 |     }
 887 | 
 888 |     for (i = 0; i < local_size; i++) {
 889 |         y[i] = a * x[i] + y[i];
 890 |     }
 891 |     if (world_size > 1) {
 892 |         MPI_Gatherv(y, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD);
 893 |     }
 894 | 
 895 |     if (world_rank == 0) {
 896 |         clock_gettime(CLOCK, &end);
 897 |     }
 898 | 
 899 |     /* print result so compiler does not throw it away */
 900 |     if (a == 999999) {
 901 |         printf("Scalar product result: %d\n", rbuf[0]);
 902 |     }
 903 | 
 904 |     if (world_rank == 0) {
 905 |         elapsed_time_hr(start, end, "Int AXPY.");
 906 |         free(rbuf);
 907 |     }
 908 | 
 909 | 
 910 |     free(x);
 911 |     free(y);
 912 |     return 0;
 913 | }
 914 | 
 915 | /*
 916 |  *
 917 |  * Compute vector-scalar product
 918 |  * AXPY, floats
 919 |  *
 920 |  * y = a * x + y
 921 |  *
 922 |  * Naive implementation
 923 |  *
 924 |  */
 925 | int float_axpy(unsigned int size) {
 926 | 
 927 |     int world_size, world_rank;
 928 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 929 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 930 | 
 931 |     int i;
 932 | 
 933 |     /*
 934 |      * Compute size of block each rank will work on
 935 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
 936 |      * in the MPI case as in the serial case.
 937 |      */
 938 |     int local_size = 0;
 939 |     int local_size_other = 0;
 940 |     int rcounts[world_size];
 941 |     int displs[world_size];
 942 | 
 943 |     if (world_rank != 0) {
 944 |         local_size = size / world_size;
 945 |     } else if (world_rank == 0) {
 946 |         local_size = (size / world_size) + (size % world_size);
 947 |         local_size_other = size / world_size; // Needed for gatherv
 948 |         rcounts[0] = local_size;
 949 |         displs[0] = 0;
 950 |         for (i = 1; i < world_size; i++) {
 951 |             rcounts[i] = local_size_other;
 952 |             displs[i] = i*local_size_other;
 953 |         }
 954 |     } else {
 955 |         printf("Some error occured in size calculation\n");
 956 |     }
 957 | 
 958 | 
 959 |     float a;
 960 |     float *x = (float *) malloc(local_size * sizeof (float));
 961 |     float *y = (float *) malloc(local_size * sizeof (float));
 962 | 
 963 |     float* rbuf = NULL;
 964 | 
 965 |     /* We only need this space allocated on PE 0 */
 966 |     if (world_rank == 0) {
 967 |         rbuf = (float *) malloc(size * sizeof (float));
 968 |         if (rbuf == NULL) {
 969 |             printf("Out Of Memory: could not allocate space for the array.\n");
 970 |             return 0;
 971 |         }
 972 |     }
 973 | 
 974 |     if (x == NULL || y == NULL) {
 975 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
 976 |         return 0;
 977 |     }
 978 | 
 979 |     srand((int) time(NULL));
 980 | 
 981 |     a = (float) rand() / (float) (RAND_MAX / 10);
 982 | 
 983 |     /* fill x and y vectors with random ints */
 984 | 
 985 |     for (i = 0; i < local_size; i++) {
 986 |         x[i] = (float) rand() / (float) (RAND_MAX / 10);
 987 |         y[i] = (float) rand() / (float) (RAND_MAX / 10);
 988 |     }
 989 | 
 990 |     struct timespec start, end;
 991 |     if (world_size > 1) {
 992 |         MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
 993 |     }
 994 |     MPI_Barrier(MPI_COMM_WORLD);
 995 |     if (world_rank == 0) {
 996 |         clock_gettime(CLOCK, &start);
 997 |     }
 998 | 
 999 |     for (i = 0; i < local_size; i++) {
1000 |         y[i] = a * x[i] + y[i];
1001 |     }
1002 |     if (world_size > 1) {
1003 |         MPI_Gatherv(y, local_size, MPI_FLOAT, rbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
1004 |     }
1005 |     if (world_rank == 0) {
1006 |         clock_gettime(CLOCK, &end);
1007 |     }
1008 | 
1009 |     /* print result so compiler does not throw it away */
1010 |     if (a == 999999) {
1011 |         printf("Scalar product result: %f\n", rbuf[0]);
1012 |     }
1013 | 
1014 |     if (world_rank == 0) {
1015 |         elapsed_time_hr(start, end, "Float AXPY.");
1016 |         free(rbuf);
1017 |     }
1018 | 
1019 | 
1020 |     free(x);
1021 |     free(y);
1022 |     return 0;
1023 | }
1024 | 
1025 | /*
1026 |  *
1027 |  * Compute vector-scalar product
1028 |  * AXPY, doubles
1029 |  *
1030 |  * y = a * x + y
1031 |  *
1032 |  * Naive implementation
1033 |  *
1034 |  */
1035 | int double_axpy(unsigned int size) {
1036 |     int world_size, world_rank;
1037 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1038 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1039 | 
1040 |     int i;
1041 | 
1042 |     /*
1043 |      * Compute size of block each rank will work on
1044 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
1045 |      * in the MPI case as in the serial case.
1046 |      */
1047 |     int local_size = 0;
1048 |     int local_size_other = 0;
1049 |     int rcounts[world_size];
1050 |     int displs[world_size];
1051 | 
1052 |     if (world_rank != 0) {
1053 |         local_size = size / world_size;
1054 |     } else if (world_rank == 0) {
1055 |         local_size = (size / world_size) + (size % world_size);
1056 |         local_size_other = size / world_size; // Needed for gatherv
1057 |         rcounts[0] = local_size;
1058 |         displs[0] = 0;
1059 |         for (i = 1; i < world_size; i++) {
1060 |             rcounts[i] = local_size_other;
1061 |             displs[i] = i*local_size_other;
1062 |         }
1063 |     } else {
1064 |         printf("Some error occured in size calculation\n");
1065 |     }
1066 | 
1067 | 
1068 |     double a;
1069 |     double *x = (double *) malloc(local_size * sizeof (double));
1070 |     double *y = (double *) malloc(local_size * sizeof (double));
1071 | 
1072 |     double* rbuf = NULL;
1073 | 
1074 |     /* We only need this space allocated on PE 0 */
1075 |     if (world_rank == 0) {
1076 |         rbuf = (double *) malloc(size * sizeof (double));
1077 |         if (rbuf == NULL) {
1078 |             printf("Out Of Memory: could not allocate space for the array.\n");
1079 |             return 0;
1080 |         }
1081 |     }
1082 | 
1083 |     if (x == NULL || y == NULL) {
1084 |         printf("Out Of Memory: could not allocate space for the two arrays.\n");
1085 |         return 0;
1086 |     }
1087 | 
1088 |     srand((int) time(NULL));
1089 | 
1090 |     a = (double) rand() / (double) (RAND_MAX / 10);
1091 | 
1092 |     /* fill x and y vectors with random doubles */
1093 | 
1094 |     for (i = 0; i < local_size; i++) {
1095 |         x[i] = (double) rand() / (double) (RAND_MAX / 10);
1096 |         y[i] = (double) rand() / (double) (RAND_MAX / 10);
1097 |     }
1098 | 
1099 |     struct timespec start, end;
1100 |     if (world_size > 1) {
1101 |         MPI_Bcast(&a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */
1102 |     }
1103 |     if (world_rank == 0) {
1104 |         clock_gettime(CLOCK, &start);
1105 |     }
1106 | 
1107 |     for (i = 0; i < local_size; i++) {
1108 |         y[i] = a * x[i] + y[i];
1109 |     }
1110 |     if (world_size > 1) {
1111 |         MPI_Gatherv(y, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1112 |     }
1113 |     MPI_Barrier(MPI_COMM_WORLD);
1114 |     if (world_rank == 0) {
1115 |         clock_gettime(CLOCK, &end);
1116 |     }
1117 | 
1118 |     /* print result so compiler does not throw it away */
1119 |     if (a == 999999) {
1120 |         printf("Scalar product result: %lf\n", rbuf[0]);
1121 |     }
1122 | 
1123 |     if (world_rank == 0) {
1124 |         elapsed_time_hr(start, end, "Double AXPY.");
1125 |         free(rbuf);
1126 |     }
1127 | 
1128 | 
1129 |     free(x);
1130 |     free(y);
1131 |     return 0;
1132 | }
1133 | 
1134 | /*
1135 |  * Dense Matrix-Vector product, integers
1136 |  *
1137 |  * y = A * x
1138 |  * where A is a square matrix
1139 |  *
1140 |  * Input:  number of elements in vectors and of rows/cols
1141 |  *         in matrix specified as number of ints
1142 |  *
1143 |  */
1144 | int int_dmatvec_product(unsigned int size) {
1145 | 
1146 |     int world_size, world_rank;
1147 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1148 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1149 | 
1150 |     /*
1151 |      * Compute size of block each rank will work on
1152 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
1153 |      * in the MPI case as in the serial case.
1154 |      */
1155 |     int local_size = 0;
1156 |     int local_size_other = 0;
1157 |     int rcounts[world_size];
1158 |     int displs[world_size];
1159 |     int i, j;
1160 | 
1161 |     if (world_rank != 0) {
1162 |         local_size = size / world_size;
1163 |     } else if (world_rank == 0) {
1164 |         local_size = (size / world_size) + (size % world_size);
1165 |         local_size_other = size / world_size; // Needed for gatherv
1166 |         rcounts[0] = local_size;
1167 |         displs[0] = 0;
1168 |         for (i = 1; i < world_size; i++) {
1169 |             rcounts[i] = local_size_other;
1170 |             displs[i] = i*local_size_other;
1171 |         }
1172 |     } else {
1173 |         printf("Some error occured in size calculation\n");
1174 |     }
1175 | 
1176 | 
1177 | 
1178 |     int r1 = 0;
1179 |     int r2 = 0;
1180 | 
1181 |     /* create two vectors */
1182 |     int *x = (int *) malloc(size * sizeof (int));
1183 |     int *y = (int *) calloc(local_size, sizeof (int));
1184 | 
1185 |     /* create matrix */
1186 |     int **A;
1187 |     A = (int **) malloc(local_size * sizeof (int *));
1188 |     for (i = 0; i < local_size; i++) {
1189 |         A[i] = (int *) malloc(size * sizeof (int));
1190 |     }
1191 | 
1192 |     if (x == NULL || y == NULL || A == NULL) {
1193 |         printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
1194 |         return 0;
1195 |     }
1196 | 
1197 |     int* rbuf = NULL;
1198 | 
1199 |     /* We only need this space allocated on PE 0 */
1200 |     if (world_rank == 0) {
1201 |         rbuf = (int *) malloc(size * sizeof (int));
1202 |         if (rbuf == NULL) {
1203 |             printf("Out Of Memory: could not allocate space for the array.\n");
1204 |             return 0;
1205 |         }
1206 |     }
1207 | 
1208 |     srand((int) time(NULL));
1209 | 
1210 |     struct timespec start, end;
1211 | 
1212 |     if (world_rank == 0) {
1213 |         r1 = (int) rand() / (int) (RAND_MAX / 10);
1214 |         r2 = (int) rand() / (int) (RAND_MAX / 10);
1215 |     }
1216 | 
1217 |     /* Synchronise the "random" values from 0 to all PEs */
1218 |     if (world_size > 1) {
1219 |         MPI_Bcast(&r1, 1, MPI_INT, 0, MPI_COMM_WORLD);
1220 |         MPI_Bcast(&r2, 1, MPI_INT, 0, MPI_COMM_WORLD);
1221 |     }
1222 | 
1223 |     /* fill matrix A with "random" integer values */
1224 |     for (i = 0; i < local_size; i++) {
1225 |         for (j = 0; j < size; j++) {
1226 |             A[i][j] = r2;
1227 |         }
1228 |     }
1229 |     /* fill vector x with "random" integer values */
1230 |     for (i = 0; i < size; i++) {
1231 |         x[i] = r1;
1232 |     }
1233 | 
1234 |     clock_gettime(CLOCK, &start);
1235 | 
1236 |     /* perform matrix-vector product */
1237 |     for (i = 0; i < local_size; i++) {
1238 |         for (j = 0; j < size; j++) {
1239 |             y[i] = y[i] + A[i][j] * x[j];
1240 |         }
1241 |     }
1242 |     if (world_size > 1) {
1243 |         MPI_Gatherv(y, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD);
1244 |     }
1245 |     MPI_Barrier(MPI_COMM_WORLD);
1246 |     if (world_rank == 0) {
1247 |         clock_gettime(CLOCK, &end);
1248 |         elapsed_time_hr(start, end, "Int Dense Matrix-Vector product.");
1249 |         free(rbuf);
1250 |     }
1251 | 
1252 |     /* print result so compiler does not throw it away */
1253 |     if (r1 == 99999) {
1254 |         printf("Result vector y[0] = %d\n", y[0]);
1255 |     }
1256 | 
1257 |     free(x);
1258 |     free(y);
1259 |     for (i = 0; i < local_size; i++) free(A[i]);
1260 |     free(A);
1261 | 
1262 |     return 0;
1263 | 
1264 | }
1265 | 
1266 | /*
1267 |  * Dense Matrix-Vector product, floats
1268 |  *
1269 |  * y = A * x
1270 |  * where A is a square matrix
1271 |  *
1272 |  * Input:  number of elements in vectors and of rows/cols
1273 |  *         in matrix specified as number of floats
1274 |  *
1275 |  */
1276 | int float_dmatvec_product(unsigned int size) {
1277 | 
1278 |     int world_size, world_rank;
1279 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1280 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1281 | 
1282 |     /*
1283 |      * Compute size of block each rank will work on
1284 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
1285 |      * in the MPI case as in the serial case.
1286 |      */
1287 |     int local_size = 0;
1288 |     int local_size_other = 0;
1289 |     int rcounts[world_size];
1290 |     int displs[world_size];
1291 |     int i, j;
1292 | 
1293 |     if (world_rank != 0) {
1294 |         local_size = size / world_size;
1295 |     } else if (world_rank == 0) {
1296 |         local_size = (size / world_size) + (size % world_size);
1297 |         local_size_other = size / world_size; // Needed for gatherv
1298 |         rcounts[0] = local_size;
1299 |         displs[0] = 0;
1300 |         for (i = 1; i < world_size; i++) {
1301 |             rcounts[i] = local_size_other;
1302 |             displs[i] = i*local_size_other;
1303 |         }
1304 |     } else {
1305 |         printf("Some error occured in size calculation\n");
1306 |     }
1307 | 
1308 | 
1309 | 
1310 |     float r1 = 0;
1311 |     float r2 = 0;
1312 | 
1313 |     /* create two vectors */
1314 |     float *x = (float *) malloc(size * sizeof (float));
1315 |     float *y = (float *) calloc(local_size, sizeof (float));
1316 | 
1317 |     /* create matrix */
1318 |     float **A;
1319 |     A = (float **) malloc(local_size * sizeof (float *));
1320 |     for (i = 0; i < local_size; i++) {
1321 |         A[i] = (float *) malloc(size * sizeof (float));
1322 |     }
1323 | 
1324 |     if (x == NULL || y == NULL || A == NULL) {
1325 |         printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
1326 |         return 0;
1327 |     }
1328 | 
1329 |     float* rbuf = NULL;
1330 | 
1331 |     /* We only need this space allocated on PE 0 */
1332 |     if (world_rank == 0) {
1333 |         rbuf = (float *) malloc(size * sizeof (float));
1334 |         if (rbuf == NULL) {
1335 |             printf("Out Of Memory: could not allocate space for the array.\n");
1336 |             return 0;
1337 |         }
1338 |     }
1339 | 
1340 |     srand((int) time(NULL));
1341 | 
1342 |     struct timespec start, end;
1343 | 
1344 |     if (world_rank == 0) {
1345 |         r1 = (float) rand() / (float) (RAND_MAX / 10);
1346 |         r2 = (float) rand() / (float) (RAND_MAX / 10);
1347 |     }
1348 | 
1349 |     /* Synchronise the "random" values from 0 to all PEs */
1350 |     if (world_size > 1) {
1351 |         MPI_Bcast(&r1, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
1352 |         MPI_Bcast(&r2, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
1353 |     }
1354 |     /* fill matrix A with "random" integer values */
1355 |     for (i = 0; i < local_size; i++) {
1356 |         for (j = 0; j < size; j++) {
1357 |             A[i][j] = r2;
1358 |         }
1359 |     }
1360 |     /* fill vector x with "random" integer values */
1361 |     for (i = 0; i < size; i++) {
1362 |         x[i] = r1;
1363 |     }
1364 |     MPI_Barrier(MPI_COMM_WORLD);
1365 |     clock_gettime(CLOCK, &start);
1366 | 
1367 |     /* perform matrix-vector product */
1368 |     for (i = 0; i < local_size; i++) {
1369 |         for (j = 0; j < size; j++) {
1370 |             y[i] = y[i] + A[i][j] * x[j];
1371 |         }
1372 |     }
1373 |     if (world_size > 1) {
1374 |         MPI_Gatherv(y, local_size, MPI_FLOAT, rbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
1375 |     }
1376 |     if (world_rank == 0) {
1377 |         clock_gettime(CLOCK, &end);
1378 |         elapsed_time_hr(start, end, "Float Dense Matrix-Vector product.");
1379 |         free(rbuf);
1380 |     }
1381 | 
1382 |     /* print result so compiler does not throw it away */
1383 |     if (r1 == 99999) {
1384 |         printf("Result vector y[0] = %f\n", y[0]);
1385 |     }
1386 | 
1387 |     free(x);
1388 |     free(y);
1389 |     for (i = 0; i < local_size; i++) free(A[i]);
1390 |     free(A);
1391 |     return 0;
1392 | 
1393 | }
1394 | 
1395 | /*
1396 |  * Dense Matrix-Vector product, doubles
1397 |  *
1398 |  * y = A * x
1399 |  * where A is a square matrix
1400 |  *
1401 |  * Input:  number of elements in vectors and of rows/cols
1402 |  *         in matrix specified as number of doubles
1403 |  *
1404 |  */
1405 | int double_dmatvec_product(unsigned int size) {
1406 | 
1407 |     int world_size, world_rank;
1408 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1409 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1410 | 
1411 |     /*
1412 |      * Compute size of block each rank will work on
1413 |      * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n)
1414 |      * in the MPI case as in the serial case.
1415 |      */
1416 |     int local_size = 0;
1417 |     int local_size_other = 0;
1418 |     int rcounts[world_size];
1419 |     int displs[world_size];
1420 |     int i, j;
1421 | 
1422 |     if (world_rank != 0) {
1423 |         local_size = size / world_size;
1424 |     } else if (world_rank == 0) {
1425 |         local_size = (size / world_size) + (size % world_size);
1426 |         local_size_other = size / world_size; // Needed for gatherv
1427 |         rcounts[0] = local_size;
1428 |         displs[0] = 0;
1429 |         for (i = 1; i < world_size; i++) {
1430 |             rcounts[i] = local_size_other;
1431 |             displs[i] = i*local_size_other;
1432 |         }
1433 |     } else {
1434 |         printf("Some error occured in size calculation\n");
1435 |     }
1436 | 
1437 | 
1438 | 
1439 |     double r1 = 0;
1440 |     double r2 = 0;
1441 | 
1442 |     /* create two vectors */
1443 |     double *x = (double *) malloc(size * sizeof (double));
1444 |     double *y = (double *) calloc(local_size, sizeof (double));
1445 | 
1446 |     /* create matrix */
1447 |     double **A;
1448 |     A = (double **) malloc(local_size * sizeof (double *));
1449 |     for (i = 0; i < local_size; i++) {
1450 |         A[i] = (double *) malloc(size * sizeof (double));
1451 |         if (A[i] == NULL) {
1452 |             printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
1453 |             return 0;
1454 |         }
1455 |     }
1456 | 
1457 |     if (x == NULL || y == NULL || A == NULL) {
1458 |         printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
1459 |         return 0;
1460 |     }
1461 | 
1462 |     double* rbuf = NULL;
1463 | 
1464 |     /* We only need this space allocated on PE 0 */
1465 |     if (world_rank == 0) {
1466 |         rbuf = (double *) malloc(size * sizeof (double));
1467 |         if (rbuf == NULL) {
1468 |             printf("Out Of Memory: could not allocate space for the array.\n");
1469 |             return 0;
1470 |         }
1471 |     }
1472 | 
1473 |     srand((int) time(NULL));
1474 | 
1475 |     struct timespec start, end;
1476 | 
1477 |     if (world_rank == 0) {
1478 |         r1 = (double) rand() / (double) (RAND_MAX / 10);
1479 |         r2 = (double) rand() / (double) (RAND_MAX / 10);
1480 |     }
1481 | 
1482 |     /* Synchronise the "random" values from 0 to all PEs */
1483 |     if (world_size > 1) {
1484 |         MPI_Bcast(&r1, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1485 |         MPI_Bcast(&r2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1486 |     }
1487 |     /* fill matrix A with "random" integer values */
1488 |     for (i = 0; i < local_size; i++) {
1489 |         for (j = 0; j < size; j++) {
1490 |             A[i][j] = r2;
1491 |         }
1492 |     }
1493 |     /* fill vector x with "random" integer values */
1494 |     for (i = 0; i < size; i++) {
1495 |         x[i] = r1;
1496 |     }
1497 |     MPI_Barrier(MPI_COMM_WORLD);
1498 |     clock_gettime(CLOCK, &start);
1499 |     /* perform matrix-vector product */
1500 |     for (i = 0; i < local_size; i++) {
1501 |         for (j = 0; j < size; j++) {
1502 |             y[i] = y[i] + A[i][j] * x[j];
1503 |         }
1504 |     }
1505 |     if (world_size > 1) {
1506 |         MPI_Gatherv(y, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1507 |     }
1508 | 
1509 |     if (world_rank == 0) {
1510 |         clock_gettime(CLOCK, &end);
1511 |         elapsed_time_hr(start, end, "Double Dense Matrix-Vector product.");
1512 |         free(rbuf);
1513 |     }
1514 | 
1515 |     /* print result so compiler does not throw it away */
1516 |     if (r1 == 99999) {
1517 |         printf("Result vector y[0] = %lf\n", y[0]);
1518 |     }
1519 | 
1520 |     free(x);
1521 |     free(y);
1522 |     for (i = 0; i < local_size; i++) free(A[i]);
1523 |     free(A);
1524 | 
1525 |     return 0;
1526 | 
1527 | }
1528 | 
1529 | int double_spmatvec_product(unsigned long r) {
1530 | 
1531 |     int world_size, world_rank;
1532 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1533 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1534 |     MPI_Status status;
1535 | 
1536 | 
1537 |     struct timespec start, end;
1538 | 
1539 |     char *filename = "matrix_in.csr";
1540 | 
1541 |     int i, rep;
1542 |     char* retval = NULL;
1543 | 
1544 |     if (r == ULONG_MAX) r = 10000;
1545 | 
1546 |     if (world_rank == 0) {
1547 |         int m, n, nz;
1548 | 
1549 |         FILE *f;
1550 |         char line[64];
1551 | 
1552 |         if ((f = fopen(filename, "r")) == NULL) {
1553 |             printf("can't open file <%s> \n", filename);
1554 |             exit(1);
1555 |         }
1556 | 
1557 |         retval = fgets(line, sizeof (line), f);
1558 |         if (retval != NULL) {
1559 |             sscanf(line, "%d %d %d", &nz, &n, &m);
1560 |         } else {
1561 |             printf("Error in reading line from file. Exiting.\n");
1562 |             return 1;
1563 |         }
1564 | 
1565 |         printf("Number of elements of values and col_idx: %d; number of values in row_idx: %d\n", nz, m);
1566 | 
1567 |         double* values = malloc(nz * sizeof (double));
1568 |         /* int values_len = nz; */
1569 | 
1570 |         // fill values
1571 |         for (i = 0; i < nz; i++) {
1572 | 
1573 |             retval = fgets(line, sizeof (line), f);
1574 |             if (retval != NULL) {
1575 |                 sscanf(line, "%lf", &values[i]);
1576 |             } else {
1577 |                 printf("Error in reading line from file. Exiting.\n");
1578 |                 return 1;
1579 |             }
1580 | 
1581 | 
1582 |         }
1583 | 
1584 |         int* col_idx = malloc(nz * sizeof (int));
1585 |         /* int col_idx_len = values_len; */
1586 | 
1587 |         // fill col_idx
1588 |         for (i = 0; i < nz; i++) {
1589 |             retval = fgets(line, sizeof (line), f);
1590 |             if (retval != NULL) {
1591 |                 sscanf(line, "%d", &col_idx[i]);
1592 |             } else {
1593 |                 printf("Error in reading line from file. Exiting.\n");
1594 |                 return 1;
1595 |             }
1596 | 
1597 |         }
1598 | 
1599 |         int* row_idx = malloc((m + 1) * sizeof (int));
1600 |         int row_idx_len = m + 1;
1601 |         // fill row_idx
1602 |         for (i = 0; i < m; i++) {
1603 |             retval = fgets(line, sizeof (line), f);
1604 |             if (retval != NULL) {
1605 |                 sscanf(line, "%d", &row_idx[i]);
1606 |             } else {
1607 |                 printf("Error in reading line from file. Exiting.\n");
1608 |                 return 1;
1609 |             }
1610 |         }
1611 |         row_idx[m] = nz;
1612 | 
1613 | 
1614 |         int x_len = m - 1;
1615 |         double* x = (double*) malloc(sizeof (double)*x_len);
1616 |         for (i = 0; i < x_len; i++) {
1617 |             x[i] = i + 1;
1618 |         }
1619 | 
1620 |         fclose(f);
1621 | 
1622 | 
1623 | 
1624 |         double* b = (double*) malloc(sizeof (double)*x_len);
1625 |         memset(b, 0, sizeof (*b));
1626 | 
1627 | 
1628 |         /*
1629 |          * Compute how many members of row_idx to go each rank
1630 |          * The last rank (world_size-1) gets the overflow
1631 |          */
1632 |         int local_row_idx_len_array[world_size];
1633 |         for (i = 0; i < world_size; i++) {
1634 |             local_row_idx_len_array[i] = (row_idx_len - 1) / world_size;
1635 |         }
1636 |         local_row_idx_len_array[world_size - 1] += (row_idx_len - 1) % world_size;
1637 | 
1638 |         /*
1639 |          * Send the number of items from row_idx they will receive to each rank
1640 |          * For this rank (rank 0), perform a local copy rather than via MPI
1641 |          */
1642 |         int remote_row_offset = 0;
1643 |         for (i = 1; i < world_size; i++) {
1644 |             MPI_Send(&local_row_idx_len_array[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1645 |         }
1646 |         int local_row_idx_len = local_row_idx_len_array[0];
1647 | 
1648 |         /*
1649 |          * Send each rank the number of row_idx members it is due.
1650 |          * Send each rank the index in row_idx that it's first element came from,
1651 |          * this is important is placing the results in the correct place in the b
1652 |          * (result) vector.
1653 |          * Send each rank the first index that rank+1 should receive to use as a termination condition
1654 |          * For this rank (rank 0), perform a local copy rather than via MPI
1655 |          */
1656 |         for (i = 1; i < world_size; i++) {
1657 |             MPI_Send(&row_idx[i * local_row_idx_len], local_row_idx_len_array[i], MPI_INT, i, 0, MPI_COMM_WORLD);
1658 |             remote_row_offset = i*local_row_idx_len;
1659 |             MPI_Send(&remote_row_offset, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1660 |             MPI_Send(&row_idx[(i + 1) * local_row_idx_len], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1661 |         }
1662 | 
1663 | 
1664 | 
1665 |         int* local_row_idx = (int*) malloc(sizeof (int)*(local_row_idx_len + 1));
1666 |         memcpy(&local_row_idx[0], &row_idx[0], sizeof (int)*local_row_idx_len_array[0]);
1667 |         memcpy(&local_row_idx[local_row_idx_len], &row_idx[local_row_idx_len], sizeof (int));
1668 | 
1669 |         /*
1670 |          * Compute the number of values (members of values) for each rank
1671 |          */
1672 |         int vals_per_rank[world_size];
1673 |         for (i = 0; i < world_size; i++) {
1674 |             vals_per_rank[i] = row_idx[(i + 1) * local_row_idx_len] - row_idx[i * local_row_idx_len];
1675 |         }
1676 | 
1677 |         /*
1678 |          * Send to each rank the number of values they will receive
1679 |          * Send to each rank the values
1680 |          * Send to each rank the column indices (col_idx) which will be equal
1681 |          * to the number of values.
1682 |          * For this rank (rank 0), perform a local copy rather than via MPI
1683 |          */
1684 |         int counter = 0;
1685 |         counter = vals_per_rank[0];
1686 |         for (i = 1; i < world_size; i++) {
1687 |             MPI_Send(&vals_per_rank[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1688 |             MPI_Send(&values[counter], vals_per_rank[i], MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
1689 |             MPI_Send(&col_idx[counter], vals_per_rank[i], MPI_INT, i, 0, MPI_COMM_WORLD);
1690 |             counter += vals_per_rank[i];
1691 |         }
1692 |         double local_vals[vals_per_rank[0]];
1693 |         memcpy(&local_vals[0], &values[0], sizeof (double)*vals_per_rank[0]);
1694 |         int local_col_idx[vals_per_rank[0]];
1695 |         memcpy(&local_col_idx[0], &col_idx[0], sizeof (int)*vals_per_rank[0]);
1696 | 
1697 | 
1698 |         /*
1699 |          * Broadcast the length of the vector x and vector x to each rank
1700 |          */
1701 |         MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD);
1702 |         MPI_Bcast(&x[0], x_len, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1703 | 
1704 |         int ii, jj;
1705 |         int local_row_adjust = 0;
1706 | 
1707 |         /* Main algorithm */
1708 |         MPI_Barrier(MPI_COMM_WORLD);
1709 |         clock_gettime(CLOCK, &start);
1710 |         for (rep = 0; rep < r; rep++) {
1711 |             for (ii = 0; ii < local_row_idx_len - 1; ii++) {
1712 |                 for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) {
1713 |                     b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]);
1714 |                 }
1715 |             }
1716 |         }
1717 | 
1718 |         /*
1719 |          * Reduce the b vector on all ranks to bb vector on this rank (rank 0).
1720 |          */
1721 |         double* bb = (double*) malloc(sizeof (double)*x_len);
1722 |         MPI_Reduce(b, bb, x_len, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
1723 |         clock_gettime(CLOCK, &end);
1724 | 
1725 |         elapsed_time_hr(start, end, "Sparse DMVs.");
1726 | 
1727 | 
1728 |         free(b);
1729 |         free(x);
1730 |         free(bb);
1731 |         free(local_row_idx);
1732 |         free(row_idx);
1733 |         free(values);
1734 |         free(col_idx);
1735 | 
1736 |         /* Print result for checking */
1737 |         /* printf("bb "); */
1738 |         /* for(i=0;i<x_len;i++){ */
1739 |         /*   printf("%lf ", bb[i]); */
1740 |         /* } */
1741 |         /* printf("\n"); */
1742 | 
1743 | 
1744 |     } else {
1745 | 
1746 |         int local_row_idx_len = 0;
1747 |         MPI_Recv(&local_row_idx_len, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1748 |         local_row_idx_len++;
1749 |         int local_row_idx[local_row_idx_len];
1750 |         memset(&local_row_idx[0], 0, local_row_idx_len * sizeof (int));
1751 |         MPI_Recv(&local_row_idx[0], local_row_idx_len, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1752 |         int local_row_adjust = 0;
1753 | 
1754 |         MPI_Recv(&local_row_adjust, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1755 |         MPI_Recv(&local_row_idx[local_row_idx_len - 1], 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1756 |         int lr_adjust = local_row_idx[0];
1757 |         for (i = 0; i < local_row_idx_len; i++) {
1758 |             local_row_idx[i] = local_row_idx[i] - lr_adjust;
1759 |         }
1760 | 
1761 |         int local_vals_len = 0;
1762 |         MPI_Recv(&local_vals_len, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1763 | 
1764 |         double* local_vals = (double*) malloc(sizeof (double)*local_vals_len);
1765 |         MPI_Recv(&local_vals[0], local_vals_len, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
1766 | 
1767 |         int* local_col_idx = (int*) malloc(sizeof (int)*local_vals_len);
1768 |         MPI_Recv(&local_col_idx[0], local_vals_len, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
1769 | 
1770 |         int x_len; // Length of the vector in SpMV
1771 |         MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD);
1772 | 
1773 |         double* x = (double*) malloc(sizeof (double)*x_len);
1774 |         double* b = (double*) malloc(sizeof (double)*x_len);
1775 |         MPI_Bcast(&x[0], x_len, MPI_DOUBLE, 0, MPI_COMM_WORLD);
1776 |         memset(b, 0, sizeof (*b));
1777 |         int ii, jj;
1778 | 
1779 |         /* Main processing loop */
1780 |         MPI_Barrier(MPI_COMM_WORLD);
1781 |         for (rep = 0; rep < r; rep++) {
1782 |             for (ii = 0; ii < local_row_idx_len - 1; ii++) {
1783 |                 for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) {
1784 |                     b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]);
1785 |                 }
1786 |             }
1787 |         }
1788 |         MPI_Reduce(b, NULL, x_len, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
1789 | 
1790 |         free(x);
1791 |         free(b);
1792 |         free(local_vals);
1793 |         free(local_col_idx);
1794 | 
1795 | 
1796 |     }
1797 | 
1798 |     MPI_Barrier(MPI_COMM_WORLD);
1799 | 
1800 |     return 0;
1801 | }
1802 | 
1803 | int float_spmatvec_product(unsigned long r) {
1804 | 
1805 |     int world_size, world_rank;
1806 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
1807 |     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
1808 |     MPI_Status status;
1809 | 
1810 | 
1811 |     struct timespec start, end;
1812 | 
1813 |     char *filename = "matrix_in.csr";
1814 | 
1815 |     int i, rep;
1816 |     char* retval = NULL;
1817 | 
1818 |     if (r == ULONG_MAX) r = 10000;
1819 | 
1820 |     if (world_rank == 0) {
1821 |         int m, n, nz;
1822 | 
1823 |         FILE *f;
1824 |         char line[64];
1825 | 
1826 |         if ((f = fopen(filename, "r")) == NULL) {
1827 |             printf("can't open file <%s> \n", filename);
1828 |             exit(1);
1829 |         }
1830 | 
1831 |         retval = fgets(line, sizeof (line), f);
1832 |         if (retval != NULL) {
1833 |             sscanf(line, "%d %d %d", &nz, &n, &m);
1834 |         } else {
1835 |             printf("Error in reading line from file. Exiting.\n");
1836 |             return 1;
1837 |         }
1838 | 
1839 |         printf("Number of elements of values and col_idx: %d; number of values in row_idx: %d\n", nz, m);
1840 | 
1841 |         /*
1842 |          * Allocate memory for values on proc0
1843 |          * We read everything in on this proc, then distribute.
1844 |          */
1845 |         float* values = malloc(nz * sizeof (float));
1846 |         if (values = NULL){
1847 |             printf("Error allocating memory for values.\n");
1848 |         }
1849 | 
1850 |         /* int values_len = nz; */
1851 | 
1852 |         // fill values
1853 |         for (i = 0; i < nz; i++) {
1854 | 
1855 |             retval = fgets(line, sizeof (line), f);
1856 |             if (retval != NULL) {
1857 |                 sscanf(line, "%f", &values[i]); // I think this does automatic down conversion if it's a double in the input file.
1858 |             } else {
1859 |                 printf("Error in reading line from file. Exiting.\n");
1860 |                 return 1;
1861 |             }
1862 | 
1863 | 
1864 |         }
1865 | 
1866 |         int* col_idx = malloc(nz * sizeof (int));
1867 |         if (col_idx == NULL){
1868 |             printf("Error allocating memory for col_idx.\n");
1869 |         }
1870 |         /* int col_idx_len = values_len; */
1871 | 
1872 |         /* fill col_idx */
1873 |         for (i = 0; i < nz; i++) {
1874 |             retval = fgets(line, sizeof (line), f);
1875 |             if (retval != NULL) {
1876 |                 sscanf(line, "%d", &col_idx[i]);
1877 |             } else {
1878 |                 printf("Error in reading line from file. Exiting.\n");
1879 |                 return 1;
1880 |             }
1881 | 
1882 |         }
1883 | 
1884 |         int* row_idx = malloc((m + 1) * sizeof (int));
1885 |         int row_idx_len = m + 1;
1886 |         // fill row_idx
1887 |         for (i = 0; i < m; i++) {
1888 |             retval = fgets(line, sizeof (line), f);
1889 |             if (retval != NULL) {
1890 |                 sscanf(line, "%d", &row_idx[i]);
1891 |             } else {
1892 |                 printf("Error in reading line from file. Exiting.\n");
1893 |                 return 1;
1894 |             }
1895 |         }
1896 |         row_idx[m] = nz;
1897 | 
1898 | 
1899 |         int x_len = m - 1;
1900 |         float* x = (float*) malloc(sizeof (float)*x_len);
1901 |         if (x == NULL) {
1902 |             printf("Error alloc x\n");
1903 |         }
1904 | 
1905 |         for (i = 0; i < x_len; i++) {
1906 |             x[i] = i + 1;
1907 |         }
1908 | 
1909 |         fclose(f);
1910 | 
1911 | 
1912 | 
1913 |         float* b = (float*) malloc(sizeof (float)*x_len);
1914 |         if (b == NULL) {
1915 |             printf("Error alloc b\n");
1916 |         }
1917 |         memset(b, 0, sizeof (*b));
1918 | 
1919 | 
1920 |         /*
1921 |          * Compute how many members of row_idx to go each rank
1922 |          * The last rank (world_size-1) gets the overflow
1923 |          */
1924 |         int local_row_idx_len_array[world_size];
1925 |         for (i = 0; i < world_size; i++) {
1926 |             local_row_idx_len_array[i] = (row_idx_len - 1) / world_size;
1927 |         }
1928 |         local_row_idx_len_array[world_size - 1] += (row_idx_len - 1) % world_size;
1929 | 
1930 |         /*
1931 |          * Send the number of items from row_idx they will receive to each rank
1932 |          * For this rank (rank 0), perform a local copy rather than via MPI
1933 |          */
1934 |         int remote_row_offset = 0;
1935 |         for (i = 1; i < world_size; i++) {
1936 |             MPI_Send(&local_row_idx_len_array[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1937 |         }
1938 |         int local_row_idx_len = local_row_idx_len_array[0];
1939 | 
1940 |         /*
1941 |          * Send each rank the number of row_idx members it is due.
1942 |          * Send each rank the index in row_idx that it's first element came from,
1943 |          * this is important is placing the results in the correct place in the b
1944 |          * (result) vector.
1945 |          * Send each rank the first index that rank+1 should receive to use as a termination condition
1946 |          * For this rank (rank 0), perform a local copy rather than via MPI
1947 |          */
1948 |         for (i = 1; i < world_size; i++) {
1949 |             MPI_Send(&row_idx[i * local_row_idx_len], local_row_idx_len_array[i], MPI_INT, i, 0, MPI_COMM_WORLD);
1950 |             remote_row_offset = i*local_row_idx_len;
1951 |             MPI_Send(&remote_row_offset, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1952 |             MPI_Send(&row_idx[(i + 1) * local_row_idx_len], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1953 |         }
1954 | 
1955 | 
1956 | 
1957 |         int* local_row_idx = (int*) malloc(sizeof (int)*(local_row_idx_len + 1));
1958 |         if (local_row_idx == NULL) {
1959 |             printf("Error alloc local_row_idx\n");
1960 |         }
1961 | 
1962 |         memcpy(&local_row_idx[0], &row_idx[0], sizeof (int)*local_row_idx_len_array[0]);
1963 |         memcpy(&local_row_idx[local_row_idx_len], &row_idx[local_row_idx_len], sizeof (int));
1964 |         
1965 |         /*
1966 |          * Compute the number of values (members of values) for each rank
1967 |          */
1968 |         int vals_per_rank[world_size];
1969 |         for (i = 0; i < world_size; i++) {
1970 |             vals_per_rank[i] = row_idx[(i + 1) * local_row_idx_len] - row_idx[i * local_row_idx_len];
1971 |         }
1972 |         
1973 |         /*
1974 |          * Send to each rank the number of values they will receive
1975 |          * Send to each rank the values
1976 |          * Send to each rank the column indices (col_idx) which will be equal
1977 |          * to the number of values.
1978 |          * For this rank (rank 0), perform a local copy rather than via MPI
1979 |          */
1980 |         int counter = 0;
1981 |         counter = vals_per_rank[0];
1982 |         for (i = 1; i < world_size; i++) {
1983 |             MPI_Send(&vals_per_rank[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
1984 |             MPI_Send(&values[counter], vals_per_rank[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
1985 |             MPI_Send(&col_idx[counter], vals_per_rank[i], MPI_INT, i, 0, MPI_COMM_WORLD);
1986 |             counter += vals_per_rank[i];
1987 |         }
1988 |         
1989 |         /* float local_vals[vals_per_rank[0]]; */
1990 |         float *local_vals = (float*) malloc(sizeof (float) * vals_per_rank[0]);
1991 |         if (local_vals == NULL) {
1992 |             printf("Error alloc local_vals1\n");
1993 |         }
1994 |         
1995 |         memcpy(&local_vals[0], &values[0], sizeof (float)*vals_per_rank[0]);
1996 |         /* int local_col_idx[vals_per_rank[0]]; */
1997 |         int *local_col_idx = (int*) malloc(sizeof (int) * vals_per_rank[0]);
1998 |         if (local_vals == NULL) {
1999 |             printf("Error alloc local_col_idx1\n");
2000 |         }
2001 | 
2002 |         memcpy(&local_col_idx[0], &col_idx[0], sizeof (int)*vals_per_rank[0]);
2003 |         printf("A5.\n");
2004 | 
2005 |         /*
2006 |          * Broadcast the length of the vector x and vector x to each rank
2007 |          */
2008 |         MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD);
2009 |         MPI_Bcast(&x[0], x_len, MPI_FLOAT, 0, MPI_COMM_WORLD);
2010 | 
2011 |         int ii, jj;
2012 |         int local_row_adjust = 0;
2013 |         
2014 |         /* Main algorithm */
2015 |         MPI_Barrier(MPI_COMM_WORLD);
2016 |         clock_gettime(CLOCK, &start);
2017 |         for (rep = 0; rep < r; rep++) {
2018 |             for (ii = 0; ii < local_row_idx_len - 1; ii++) {
2019 |                 for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) {
2020 |                     b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]);
2021 |                 }
2022 |             }
2023 |         }
2024 | 
2025 |         /*
2026 |          * Reduce the b vector on all ranks to bb vector on this rank (rank 0).
2027 |          */
2028 |         float* bb = (float*) malloc(sizeof (float)*x_len);
2029 |         if (bb == NULL) {
2030 |             printf("Error alloc bb\n");
2031 |         }
2032 | 
2033 |         MPI_Reduce(b, bb, x_len, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
2034 |         clock_gettime(CLOCK, &end);
2035 | 
2036 |         elapsed_time_hr(start, end, "Sparse DMVs.");
2037 | 
2038 | 
2039 |         free(b);
2040 |         free(x);
2041 |         free(bb);
2042 |         free(local_row_idx);
2043 |         free(row_idx);
2044 |         free(values);
2045 |         free(col_idx);
2046 | 
2047 |         /* Print result for checking */
2048 |         /* printf("bb "); */
2049 |         /* for(i=0;i<x_len;i++){ */
2050 |         /*   printf("%lf ", bb[i]); */
2051 |         /* } */
2052 |         /* printf("\n"); */
2053 | 
2054 | 
2055 |     } else {
2056 | 
2057 |         int local_row_idx_len = 0;
2058 |         MPI_Recv(&local_row_idx_len, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2059 |         local_row_idx_len++;
2060 |         int local_row_idx[local_row_idx_len];
2061 |         memset(&local_row_idx[0], 0, local_row_idx_len * sizeof (int));
2062 |         MPI_Recv(&local_row_idx[0], local_row_idx_len, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2063 |         int local_row_adjust = 0;
2064 | 
2065 |         MPI_Recv(&local_row_adjust, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2066 |         MPI_Recv(&local_row_idx[local_row_idx_len - 1], 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2067 |         int lr_adjust = local_row_idx[0];
2068 |         for (i = 0; i < local_row_idx_len; i++) {
2069 |             local_row_idx[i] = local_row_idx[i] - lr_adjust;
2070 |         }
2071 | 
2072 |         int local_vals_len = 0;
2073 |         MPI_Recv(&local_vals_len, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2074 | 
2075 |         float* local_vals = (float*) malloc(sizeof (float)*local_vals_len);
2076 |         if (local_vals == NULL) {
2077 |             printf("error alloc local_vals2.\n");
2078 |         }
2079 |         MPI_Recv(&local_vals[0], local_vals_len, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
2080 | 
2081 |         int* local_col_idx = (int*) malloc(sizeof (int)*local_vals_len);
2082 |         if (local_col_idx == NULL) {
2083 |             printf("error alloc local_col_idx2.\n");
2084 |         }
2085 |         MPI_Recv(&local_col_idx[0], local_vals_len, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
2086 | 
2087 |         int x_len; // Length of the vector in SpMV
2088 |         MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD);
2089 | 
2090 |         float* x = (float*) malloc(sizeof (float)*x_len);
2091 |         if (x == NULL) {
2092 |             printf("error alloc x2.\n");
2093 |         }
2094 | 
2095 |         float* b = (float*) malloc(sizeof (float)*x_len);
2096 |         if (b == NULL) {
2097 |             printf("error alloc b2.\n");
2098 |         }
2099 |         MPI_Bcast(&x[0], x_len, MPI_FLOAT, 0, MPI_COMM_WORLD);
2100 |         memset(b, 0, sizeof (*b));
2101 |         int ii, jj;
2102 |         
2103 |         /* Main processing loop */
2104 |         MPI_Barrier(MPI_COMM_WORLD);
2105 |         for (rep = 0; rep < r; rep++) {
2106 |             for (ii = 0; ii < local_row_idx_len - 1; ii++) {
2107 |                 for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) {
2108 |                     b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]);
2109 |                 }
2110 |             }
2111 |         }
2112 |         MPI_Reduce(b, NULL, x_len, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
2113 | 
2114 |         free(x);
2115 |         free(b);
2116 |         free(local_vals);
2117 |         free(local_col_idx);
2118 | 
2119 | 
2120 |     }
2121 | 
2122 |     MPI_Barrier(MPI_COMM_WORLD);
2123 | 
2124 |     return 0;
2125 | }
2126 | 
2127 | 


--------------------------------------------------------------------------------