├── NOTICE ├── platform_inc ├── generic_gcc_opt.inc ├── generic_gcc_debug.inc ├── xc30_gcc_debug.inc ├── xc30_gcc_opt.inc └── generic_mpicc_opt.inc ├── matrix_utils.h ├── Makefile ├── utils.h ├── level1.h ├── README.md ├── main.c ├── fileparse.c ├── matrix_utils.c ├── utils.c ├── level1.c ├── LICENSE ├── cg.c └── blas_op.c /NOTICE: -------------------------------------------------------------------------------- 1 | Adept Kernel Benchmarks, MPI port 2 | Copyright 2015 The University of Edinburgh 3 | 4 | This product includes software developed at 5 | EPCC, The University of Edinburgh (http://www.epcc.ed.ac.uk/). 6 | -------------------------------------------------------------------------------- /platform_inc/generic_gcc_opt.inc: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | CC=mpicc 20 | CFLAGS+= -O3 21 | DMACROS += -------------------------------------------------------------------------------- /platform_inc/generic_gcc_debug.inc: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 
2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | CC=mpicc 20 | CFLAGS+= -g -O3 21 | DMACROS += -------------------------------------------------------------------------------- /platform_inc/xc30_gcc_debug.inc: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | CC=cc 20 | CFLAGS+= -g -O3 21 | DMACROS += -DNORAW -------------------------------------------------------------------------------- /platform_inc/xc30_gcc_opt.inc: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | CC=cc 20 | CFLAGS+= -O3 21 | DMACROS += -DNORAW -------------------------------------------------------------------------------- /platform_inc/generic_mpicc_opt.inc: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | CC = mpicc 20 | CFLAGS += -O3 21 | DMACROS += 22 | LDFLAGS += -------------------------------------------------------------------------------- /matrix_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. */ 20 | 21 | void get_matrix_size(char*, int*, int*, int*); 22 | void mm_to_csr(char*, int, int, int, int*, int*, double*); 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 The University of Edinburgh. 2 | # 3 | # This software was developed as part of the 4 | # EC FP7 funded project Adept (Project ID: 610490) 5 | # www.adept-project.eu 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Default to gcc but allong setting CC in the shell (or via cmd line) to another compiler 20 | ifndef $CC 21 | CC = mpicc 22 | endif 23 | 24 | #set OPT=debug to compile with -g -O0 25 | ifndef $OPT 26 | OPT = opt 27 | endif 28 | 29 | ifndef $ARCH 30 | ARCH=generic 31 | endif 32 | 33 | include platform_inc/${ARCH}_${CC}_${OPT}.inc 34 | 35 | SOURCES = main.c level1.c blas_op.c utils.c stencil.c fileparse.c matrix_utils.c cg.c 36 | LDFLAGS+= -lm 37 | EXE = kernel 38 | 39 | all: $(EXE) 40 | 41 | $(EXE): $(SOURCES) 42 | $(CC) $(CFLAGS) -o $(EXE) $(SOURCES) $(DMACROS) $(LDFLAGS) 43 | 44 | clean: 45 | rm -rf *~ *.o $(EXE) 46 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. */ 20 | 21 | #include 22 | volatile sig_atomic_t stop; 23 | 24 | 25 | #ifdef __MACH__ 26 | #include 27 | #include 28 | #define CLOCK 0 29 | 30 | void clock_gettime (void*, struct timespec *); 31 | 32 | #else 33 | 34 | #ifdef CLOCK_MONOTONIC_RAW 35 | #define CLOCK CLOCK_MONOTONIC_RAW 36 | #else 37 | #define CLOCK CLOCK_MONOTONIC 38 | #endif 39 | 40 | #endif 41 | 42 | 43 | double elapsed_time_hr(struct timespec, struct timespec, char *); 44 | void loop_timer(unsigned long); 45 | void loop_timer_nop(unsigned long); 46 | void warmup_loop(unsigned long); 47 | void interrupt_handler(int); 48 | void discrete_elapsed_hr(struct timespec*, struct timespec*, int*, char*); 49 | int sub_time_hr(struct timespec*, struct timespec*, struct timespec*); 50 | -------------------------------------------------------------------------------- /level1.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. 
*/ 20 | 21 | void bench_level1(char *, unsigned int, unsigned int, char *, char *, char *); 22 | 23 | int int_dot_product(unsigned int); 24 | int float_dot_product(unsigned int); 25 | int double_dot_product(unsigned int); 26 | 27 | int int_scalar_mult(unsigned int); 28 | int float_scalar_mult(unsigned int); 29 | int double_scalar_mult(unsigned int); 30 | 31 | int double_norm(unsigned int); 32 | int float_norm(unsigned int); 33 | int int_norm(unsigned int); 34 | 35 | int int_axpy(unsigned int); 36 | int float_axpy(unsigned int); 37 | int double_axpy(unsigned int); 38 | 39 | int int_dmatvec_product(unsigned int); 40 | int float_dmatvec_product(unsigned int); 41 | int double_dmatvec_product(unsigned int); 42 | 43 | int double_spmatvec_product(unsigned long); 44 | int float_spmatvec_product(unsigned long); 45 | 46 | void double_stencil27(unsigned int); 47 | void float_stencil27(unsigned int); 48 | void int_stencil27(unsigned int); 49 | 50 | void double_stencil19(unsigned int); 51 | void float_stencil19(unsigned int); 52 | void int_stencil19(unsigned int); 53 | 54 | void double_stencil9(unsigned int); 55 | void float_stencil9(unsigned int); 56 | void int_stencil9(unsigned int); 57 | 58 | void double_stencil5(unsigned int); 59 | void float_stencil5(unsigned int); 60 | void int_stencil5(unsigned int); 61 | 62 | void double_stencil27_overlapped(unsigned int); 63 | void float_stencil27_overlapped(unsigned int); 64 | void int_stencil27_overlapped(unsigned int); 65 | 66 | void double_stencil19_overlapped(unsigned int); 67 | void float_stencil19_overlapped(unsigned int); 68 | void int_stencil19_overlapped(unsigned int); 69 | 70 | void double_stencil9_overlapped(unsigned int); 71 | void float_stencil9_overlapped(unsigned int); 72 | void int_stencil9_overlapped(unsigned int); 73 | 74 | void double_stencil5_overlapped(unsigned int); 75 | void float_stencil5_overlapped(unsigned int); 76 | void int_stencil5_overlapped(unsigned int); 77 | 78 | void fileparse(unsigned int); 79 | 80 | 
int conjugate_gradient(unsigned int); 81 | int conjugate_gradient_mixed(unsigned int); 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 The University of Edinburgh. 2 | 3 | This software was developed as part of the 4 | EC FP7 funded project Adept (Project ID: 610490) 5 | http://www.adept-project.eu 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | 19 | 20 | # Adept Kernel Benchmarks - MPI 21 | 22 | This README describes the MPI parallel kernel benchmarks. They are implemented in C. 23 | 24 | ## Citation & Further Information 25 | If you would like to cite this work, please cite: 26 | Nick Johnson et al., "Adept Deliverable D2.3 - Updated Report on Adept Benchmarks", September 2015. 27 | available at http://www.adept-project.eu/images/Deliverables/Adept%20D2.3.pdf 28 | 29 | ## BLAS-type benchmarks 30 | 31 | In our BLAS-type benchmarks we implement a few of the most common linear algebra computations. 32 | 33 | #### AXPY 34 | This benchmark takes two vectors `x` and `y`, and the scalar `a`, and computes: 35 | ``` 36 | y = a * x + y 37 | ``` 38 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double). 
39 | 40 | #### Dot product 41 | The dot product benchmark multiplies two vectors x and y of length n and returns a scalar: 42 | ``` 43 | result = x_1 y_1 + x_2 y_2 + ... + x_n y_n 44 | ``` 45 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double). 46 | 47 | #### Scalar multiplication 48 | This benchmark scales the vector x by a fixed scalar a: 49 | ``` 50 | x = a * x 51 | ``` 52 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double). 53 | 54 | #### Euclidean norm 55 | This benchmark computes the Euclidean norm of vector x: 56 | ``` 57 | || x || = sqrt ( |x_1|^2 + |x_2|^2 + ... + |x_n|^2 ) 58 | ``` 59 | The user can choose the length (number of elements) of the vectors, as well as their data type (int, float or double). 60 | 61 | #### Dense matrix-vector multiplication 62 | This benchmark multiplies a square dense matrix A with a vector x to compute vector y: 63 | ``` 64 | y = A * x 65 | ``` 66 | Both A and x are randomly generated. The user can choose the size of the data structures (where size*size equals the number of elements in the matrix), as well as their data type (int, float or double). 67 | 68 | #### Sparse matrix-vector multiplication 69 | This benchmark multiplies a square sparse matrix A with a vector x to compute vector y: 70 | ``` 71 | y = A * x 72 | ``` 73 | A is represented in CSR format and read from an input file. The vector x is randomly generated. The size of the matrix is fixed by the input file (which the user can substitute for a different matrix). The user can choose the data type to be used (float or double). 74 | 75 | #### Sparse matrix-matrix multiplication 76 | This benchmark multiplies two square sparse matrices A and B to compute matrix C: 77 | ``` 78 | C = A * B 79 | ``` 80 | A and B are both represented in CSR format and read from an input file.
The size of the matrices is fixed by the input file (which the user can substitute for a different matrix). The user can choose the data type to be used (float or double). 81 | 82 | ## Stencil computation 83 | 84 | The stencil benchmarks compute values for each element in a 2D or 3D grid based on the values of their nearest neighbours. 85 | 86 | #### 2D grid: 5-point and 9-point Stencil 87 | On a 2D grid, the 5-point stencil computes the value of A[i][j] by taking the values from left, right, up and down from the current position, and scaling them with a constant. The 9-point stencil is similar, but also includes the diagonals. 88 | The user can choose the data type to be used in the grid (int, float or double). 89 | 90 | #### 3D grid: 19-point and 27-point Stencil 91 | The 19-point and 27-point stencils are analogous to the 5 and 9 point stencil, but they operate in a 3D space. 92 | The user can choose the data type to be used in the grid (int, float or double). 93 | 94 | ## File parsing 95 | The file parsing benchmark creates a file filled with sequences of random characters, as well as a fixed search phrase (here: "AdeptProject"). The benchmark then searches through the file and counts the occurrences of the search phrase. 96 | The user can determine the size of the file by passing the number of lines to be created (using size). -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License.
*/ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "level1.h" 27 | 28 | #include 29 | 30 | void usage(); 31 | 32 | int main(int argc, char **argv) { 33 | 34 | int c; 35 | int world_size, world_rank; 36 | MPI_Init(&argc, &argv); 37 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 38 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 39 | if (world_rank == 0) { 40 | printf("Running with %d MPI processes.\n", world_size); 41 | } 42 | 43 | char *bench = "blas_op"; 44 | unsigned int size = 200; 45 | unsigned long rep = ULONG_MAX; 46 | char *op = "dot_product"; 47 | char *dt = "int"; 48 | char *algo = "normal"; 49 | 50 | static struct option option_list[] ={ 51 | {"bench", required_argument, NULL, 'b'}, 52 | {"size", required_argument, NULL, 's'}, 53 | {"reps", required_argument, NULL, 'r'}, 54 | {"op", required_argument, NULL, 'o'}, 55 | {"dtype", required_argument, NULL, 'd'}, 56 | {"algo", required_argument, NULL, 'a'}, 57 | {"help", no_argument, NULL, 'h'}, 58 | {0, 0, 0, 0} 59 | }; 60 | 61 | while ((c = getopt_long(argc, argv, "b:s:r:o:d:a:h", option_list, NULL)) != -1) { 62 | switch (c) { 63 | case 'b': 64 | bench = optarg; 65 | if (world_rank == 0) { 66 | printf("Benchmark is %s.\n", bench); 67 | } 68 | break; 69 | case 's': 70 | size = atoi(optarg); 71 | if (world_rank == 0) { 72 | printf("Size is %d.\n", size); 73 | } 74 | break; 75 | case 'r': 76 | rep = atol(optarg); 77 | printf("Number of repetitions %lu.\n", rep); 78 | break; 79 | case 'o': 80 | op = optarg; 81 | if 
(world_rank == 0) { 82 | printf("Operation %s\n", op); 83 | } 84 | break; 85 | case 'd': 86 | dt = optarg; 87 | if (world_rank == 0) { 88 | printf("Data type is %s\n", dt); 89 | } 90 | break; 91 | case 'a': 92 | algo = optarg; 93 | if (world_rank == 0) { 94 | printf("Algorithm is %s\n", algo); 95 | } 96 | break; 97 | case 'h': 98 | if (world_rank == 0) { 99 | usage(); 100 | } 101 | return 0; 102 | default: 103 | if (world_rank == 0) { 104 | printf("Undefined.\n"); 105 | } 106 | return 0; 107 | } 108 | } 109 | 110 | bench_level1(bench, size, rep, op, dt, algo); 111 | MPI_Finalize(); 112 | return 0; 113 | 114 | } 115 | 116 | void usage() { 117 | printf("Usage for KERNEL benchmarks:\n\n"); 118 | printf("\t -b, --bench NAME \t name of the benchmark - possible values are blas_op, stencil, fileparse and cg.\n"); 119 | printf("\t -s, --size N \t\t vector length. Default is 200. For fileparse benchmark this is the number of rows.\n"); 120 | printf("\t -r, --reps N \t\t number of repetitions. Default value is ULONG_MAX.\n"); 121 | printf("\t -o, --op TYPE \t\t TYPE of operation.\n"); 122 | printf("\t\t\t\t --> for blas_op benchmark: \"dot_product\", \"scalar_mult\", \"dmatvec_product\", \"norm\", \"spmv\" and \"axpy\". Default is \"dot_product\".\n"); 123 | printf("\t\t\t\t --> for stencil benchmark: \"27\", \"19\", \"9\" and \"5\". Default is \"27\".\n"); 124 | printf("\t -d, --dtype DATATYPE \t DATATYPE to be used - possible values are int, long, float, double. Default is int.\n"); 125 | printf("\t -a, --algo ALGORITHM \t ALGORITHM to be used. 
Default is normal.\n" 126 | "\t\t\t\t --> for cg possible values are normal, mixed.\n" 127 | "\t\t\t\t --> for stencil possible values are normal, overlapped.\n"); 128 | printf("\t -h, --help \t\t Displays this help.\n"); 129 | printf("\n\n"); 130 | } 131 | -------------------------------------------------------------------------------- /fileparse.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. 
*/ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "utils.h" 30 | #include "level1.h" 31 | 32 | int create_line(char*, size_t, char*, unsigned int); 33 | int seek_match(char*, size_t, char*, unsigned int); 34 | 35 | void fileparse(unsigned int num_rows) { 36 | 37 | int world_size, world_rank; 38 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | 41 | 42 | char search_phrase[] = "AdeptProject"; 43 | size_t sp_len = strlen(search_phrase); 44 | 45 | unsigned int desired_line_len = 81; 46 | char line[desired_line_len]; 47 | 48 | srand(time(NULL)); // Set seed 49 | 50 | int i = 0; 51 | int r = 0; 52 | int m = 0; 53 | int mismatch = 0; 54 | int r_count = 0; 55 | int m_count = 0; 56 | struct timespec start, end; 57 | 58 | /* p_num_rows is the number of rows across all processes */ 59 | unsigned int p_num_rows; 60 | p_num_rows = (unsigned int) (num_rows * world_size); 61 | 62 | 63 | /* Generate (on the fly) the test file for the run */ 64 | /* Make this single threaded for ease */ 65 | if (world_rank == 0) { 66 | FILE* fp; 67 | fp = fopen("testfile", "w+"); 68 | 69 | for (i = 0; i < p_num_rows; i++) { 70 | r = create_line(search_phrase, sp_len, line, desired_line_len); 71 | m = seek_match(search_phrase, sp_len, line, desired_line_len); 72 | if (r != m) { 73 | mismatch++; 74 | } 75 | if (r == 0) { 76 | r_count++; 77 | } 78 | if (m == 0) { 79 | m_count++; 80 | } 81 | fprintf(fp, "%s\n", line); 82 | } 83 | fsync(fileno(fp)); 84 | fclose(fp); 85 | 86 | } 87 | 88 | m = 0; 89 | 90 | 91 | MPI_Info info; 92 | MPI_Info_create(&info); 93 | MPI_File fh; 94 | MPI_Status status; 95 | 96 | /* For holding the data from the file before parsing */ 97 | char *lb = (char*) malloc(sizeof (char)*num_rows * (desired_line_len + 1)); 98 | char *lbp = NULL; 99 | 100 | 101 | MPI_Barrier(MPI_COMM_WORLD); 102 | if (world_rank == 0) { 103 | clock_gettime(CLOCK, &start); 104 | } 
105 | m_count = 0; 106 | 107 | /* This part should use MPI-IO */ 108 | MPI_File_open(MPI_COMM_WORLD, "testfile", MPI_MODE_RDWR | MPI_MODE_CREATE, info, &fh); 109 | MPI_File_read_at(fh, world_rank * num_rows * (desired_line_len + 1), lb, num_rows * (desired_line_len + 1), MPI_CHAR, &status); 110 | for (i = 0; i < num_rows; i++) { 111 | lbp = &lb[i * (desired_line_len + 1)]; 112 | m = seek_match(search_phrase, sp_len, lbp, desired_line_len); 113 | if (m == 0) { 114 | m_count++; 115 | } 116 | } 117 | MPI_Barrier(MPI_COMM_WORLD); 118 | MPI_File_close(&fh); 119 | 120 | if (world_rank == 0) { 121 | clock_gettime(CLOCK, &end); 122 | elapsed_time_hr(start, end, "Fileparse"); 123 | } 124 | MPI_Barrier(MPI_COMM_WORLD); 125 | unlink("testfile"); // Use this to ensure the generated file is removed from the system upon finish 126 | 127 | } 128 | 129 | /* 130 | * Create a line of random characters 131 | * Line will be ll long and appears in l 132 | * Randomly, phrase contained in sp and of sp_len length will be added to l at a random position 133 | */ 134 | int create_line(char* sp, size_t sp_len, char* l, unsigned int ll) { 135 | 136 | 137 | int i = 0; 138 | int r = 0; 139 | int flag = 0; 140 | 141 | for (i = 0; i < ll; i++) { 142 | r = (rand() % 128); 143 | while (!isalnum(r)) { 144 | r = (rand() % 128); 145 | } 146 | l[i] = (char) r; 147 | } 148 | l[i + 1] = '\0'; 149 | 150 | r = rand() % 2; 151 | 152 | if (r == 0) { 153 | flag = 0; 154 | r = rand() % (ll - sp_len); 155 | for (i = 0; i < sp_len; i++) { 156 | l[r + i] = sp[i]; 157 | } 158 | } else { 159 | flag = 1; 160 | } 161 | 162 | return flag; 163 | } 164 | 165 | /* 166 | * Naive matching algorithm 167 | */ 168 | int seek_match(char* sp, size_t sp_len, char* l, unsigned int ll) { 169 | 170 | int i = 0; 171 | int flag = 1; 172 | for (i = 0; i < ll - sp_len; i++) { 173 | if (l[i] == sp[0]) { 174 | if (strncmp(&l[i], &sp[0], sp_len) == 0) { 175 | flag = 0; 176 | break; 177 | } 178 | } 179 | } 180 | 181 | return flag; 182 | } 
183 | -------------------------------------------------------------------------------- /matrix_utils.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. */ 20 | 21 | /* 22 | * Utility functions for sparse matrices. 23 | * 24 | * Currently focusses on reading in a sparse matrix file in 25 | * Matrix Market Format (http://math.nist.gov/MatrixMarket) 26 | * converting this to CSR. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | /* 35 | * 36 | * reads matrix market file header and get number of rows, 37 | * number of columns, and number of non-zero elements. 
/*
 * Readers for sparse matrices stored as Matrix Market coordinate files.
 */

/* Size of the buffer used to read one line of the input file. */
enum { MM_LINE_LEN = 256 };

/*
 * Read one line of f into buf (at most size - 1 characters). When the line
 * is longer than the buffer, the remainder is consumed and discarded so the
 * next call starts at the beginning of the following line.
 *
 * Returns 1 when a line was read, 0 at end of file or on a read error.
 */
static int mm_read_line(FILE *f, char *buf, int size) {
    size_t len = 0;
    int ch;

    if (fgets(buf, size, f) == NULL) {
        return 0;
    }
    while (buf[len] != '\0') {
        len++;
    }
    if (len == 0 || buf[len - 1] != '\n') {
        while ((ch = fgetc(f)) != EOF && ch != '\n') {
            /* drop the tail of an over-long line */
        }
    }
    return 1;
}

/*
 * Skip the first line of the file (the banner) and any following lines that
 * start with '%', leaving the next line (the size line) in buf.
 *
 * Returns 1 on success, 0 when the file ends first.
 */
static int mm_read_size_line(FILE *f, char *buf, int size) {
    if (!mm_read_line(f, buf, size)) {
        return 0;
    }
    do {
        if (!mm_read_line(f, buf, size)) {
            return 0;
        }
    } while (buf[0] == '%');
    return 1;
}

/*
 * Allocate a zeroed array of count elements (at least one, so a zero count
 * never yields NULL). Prints a message and exits when memory is exhausted.
 */
static void *mm_alloc(int count, size_t elem_size) {
    void *p = calloc(count > 0 ? (size_t) count : 1, elem_size);
    if (p == NULL) {
        printf("Out of memory.\n");
        exit(1);
    }
    return p;
}

/*
 * Read the size line of the Matrix Market file fn and store the number of
 * rows, columns and non-zero elements in *rows, *cols and *nonzeros. The
 * values are also printed to standard output.
 *
 * Prints a message and exits with status 1 when the file cannot be opened
 * or the size line cannot be read.
 */
void get_matrix_size(char *fn, int *rows, int *cols, int *nonzeros) {
    FILE *f;
    char header[MM_LINE_LEN];

    if ((f = fopen(fn, "r")) == NULL) {
        printf("can't open file <%s> \n", fn);
        exit(1);
    }

    if (!mm_read_size_line(f, header, (int) sizeof header) ||
        sscanf(header, "%d %d %d", rows, cols, nonzeros) != 3) {
        printf("Error reading file.\n");
        fclose(f);
        exit(1);
    }

    printf("Rows: %d, Columns: %d, Non-zeros: %d\n", *rows, *cols, *nonzeros);
    fclose(f);

}

/*
 * Convert a matrix in Matrix Market Format (COO) to CSR.
 *
 * fn            input file
 * m, n          number of rows and columns of the matrix
 * nz            number of entries in the file
 * row_idx       on return holds the m + 1 CSR row pointers; it is also used
 *               as scratch space for the nz row indices read from the file,
 *               so it must have room for max(nz, m + 1) ints
 * col_idx       on return holds the nz column indices (0-based), grouped by row
 * values        on return holds the nz values in the same order as col_idx
 *
 * Entries of one row keep the order in which they appear in the file. The
 * result is also written to "matrix_in.csr" in the current directory: a line
 * "nz nz m+1", then the values, the column indices and the row pointers,
 * one per line.
 *
 * Prints a message and exits with status 1 on any I/O or format error.
 */
void mm_to_csr(char *fn, int m, int n, int nz, int *row_idx, int *col_idx, double *values) {

    static const char out_name[] = "matrix_in.csr";
    const int base = 1; /* Matrix Market indices are 1-based */

    FILE *fin, *fout;
    char body[MM_LINE_LEN];
    int i;
    int count = 0;

    int *row_ptr, *next, *new_col_idx;
    double *new_values;

    if (m < 0 || n < 0 || nz < 0) {
        printf("Invalid matrix dimensions.\n");
        exit(1);
    }

    if ((fin = fopen(fn, "r")) == NULL) {
        printf("can't open input file <%s> \n", fn);
        exit(1);
    }

    if ((fout = fopen(out_name, "w")) == NULL) {
        printf("can't open output file <%s> \n", out_name);
        fclose(fin);
        exit(1);
    }

    /* discard the banner, any comment lines and the size line */
    if (!mm_read_size_line(fin, body, (int) sizeof body)) {
        printf("Error reading file.\n");
        exit(1);
    }

    /* walk through the file line by line */
    while (mm_read_line(fin, body, (int) sizeof body)) {
        int r, c;
        double v;
        int got = sscanf(body, "%d %d %lf", &r, &c, &v);

        if (got == EOF) {
            continue; /* blank line */
        }
        r -= base; /* adjust from 1-based to 0-based */
        c -= base;
        if (got != 3 || count >= nz || r < 0 || r >= m || c < 0 || c >= n) {
            printf("Error reading file.\n");
            exit(1);
        }
        row_idx[count] = r;
        col_idx[count] = c;
        values[count] = v;
        count++;
    }

    fclose(fin);

    if (count != nz) {
        printf("Error reading file.\n");
        exit(1);
    }

    row_ptr = mm_alloc(m + 1, sizeof *row_ptr);
    next = mm_alloc(m, sizeof *next);
    new_col_idx = mm_alloc(nz, sizeof *new_col_idx);
    new_values = mm_alloc(nz, sizeof *new_values);

    /* count the entries of each row, then turn the counts into offsets */
    for (i = 0; i < nz; i++) {
        row_ptr[row_idx[i] + 1]++;
    }
    for (i = 0; i < m; i++) {
        row_ptr[i + 1] += row_ptr[i];
    }

    /* place every entry into the next free slot of its row */
    for (i = 0; i < m; i++) {
        next[i] = row_ptr[i];
    }
    for (i = 0; i < nz; i++) {
        int dst = next[row_idx[i]]++;
        new_col_idx[dst] = col_idx[i];
        new_values[dst] = values[i];
    }

    fprintf(fout, "%d %d %d\n", nz, nz, m + 1);

    /* copy the reordered values into the caller's array */
    for (i = 0; i < nz; i++) {
        values[i] = new_values[i];
        fprintf(fout, "%f\n", new_values[i]);
    }

    /* copy the reordered column indices into the caller's array */
    for (i = 0; i < nz; i++) {
        col_idx[i] = new_col_idx[i];
        fprintf(fout, "%d\n", new_col_idx[i]);
    }

    /* copy the row pointers into the caller's array */
    for (i = 0; i <= m; i++) {
        row_idx[i] = row_ptr[i];
        fprintf(fout, "%d\n", row_ptr[i]);
    }

    /* free memory for the temporary arrays */
    free(row_ptr);
    free(next);
    free(new_col_idx);
    free(new_values);

    fclose(fout);

}
/* -------------------------------------------------------------------------------- /utils.c: */
-------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "utils.h" 28 | 29 | #ifdef __MACH__ 30 | void clock_gettime (void* clk, struct timespec *ts){ 31 | clock_serv_t cclock; 32 | mach_timespec_t mts; 33 | host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); 34 | clock_get_time(cclock, &mts); 35 | mach_port_deallocate(mach_task_self(), cclock); 36 | ts->tv_sec = mts.tv_sec; 37 | ts->tv_nsec = mts.tv_nsec; 38 | } 39 | #endif 40 | 41 | 42 | double elapsed_time_hr(struct timespec t1, struct timespec t2, char * title){ 43 | 44 | struct timespec elapsed; 45 | sub_time_hr(&elapsed, &t1, &t2); 46 | double elapsed_duration = 0; 47 | double elapsed_start = 0; 48 | double elapsed_end = 0; 49 | 50 | /* This could potentially lead to loss of precision dependant on the rounding in conversion to double */ 51 | elapsed_duration = elapsed.tv_sec + ((double)elapsed.tv_nsec/1000000000); 52 | elapsed_start = t1.tv_sec + ((double)t1.tv_nsec/1000000000); 53 | elapsed_end = t2.tv_sec + ((double)t2.tv_nsec/1000000000); 54 | 55 | 56 | 
printf("\n--- %s\n", title); 57 | printf("--- Timings ------------------------------------------------------------------------\n"); 58 | printf("|\n"); 59 | printf("| Start: %.9lf ", elapsed_start); 60 | printf("End: %.9lf ", elapsed_end); 61 | printf("Duration: %.9lf s\n", elapsed_duration); 62 | printf("|\n"); 63 | printf("------------------------------------------------------------------------------------\n"); 64 | 65 | return 1.0; // Compatibility 66 | } 67 | 68 | void loop_timer(unsigned long limit){ 69 | 70 | struct timespec t1, t2; 71 | int index; 72 | 73 | clock_gettime(CLOCK, &t1); 74 | for(index=0; indextv_nsec-start->tv_nsec)<0) { 172 | result->tv_sec = end->tv_sec-start->tv_sec-1; 173 | result->tv_nsec = 1000000000+end->tv_nsec-start->tv_nsec; 174 | } else { 175 | result->tv_sec = end->tv_sec-start->tv_sec; 176 | result->tv_nsec = end->tv_nsec-start->tv_nsec; 177 | } 178 | 179 | return end->tv_sec < start->tv_sec; 180 | } 181 | -------------------------------------------------------------------------------- /level1.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. 
*/ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "level1.h" 27 | 28 | 29 | /* Level 1 benchmark driver - calls appropriate function */ 30 | /* based on command line arguments. */ 31 | void bench_level1(char *b, unsigned int s, unsigned int r, char *o, char *dt, char *algo){ 32 | 33 | int world_rank; 34 | MPI_Comm_rank(MPI_COMM_WORLD,&world_rank); 35 | 36 | 37 | /* BLAS operations */ 38 | if(strcmp(b, "blas_op") == 0){ 39 | 40 | if(strcmp(o, "dot_product") == 0){ 41 | 42 | if(strcmp(dt, "int") == 0) int_dot_product(s); 43 | else if(strcmp(dt, "float") == 0) float_dot_product(s); 44 | else if(strcmp(dt, "double") == 0) double_dot_product(s); 45 | else if (world_rank==0){ 46 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 47 | } 48 | 49 | } 50 | 51 | else if(strcmp(o, "scalar_product") == 0){ 52 | 53 | if(strcmp(dt, "int") == 0) int_scalar_mult(s); 54 | else if(strcmp(dt, "float") == 0) float_scalar_mult(s); 55 | else if(strcmp(dt, "double") == 0) double_scalar_mult(s); 56 | else if (world_rank==0){ 57 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 58 | } 59 | 60 | } 61 | 62 | else if(strcmp(o, "norm") == 0){ 63 | 64 | if(strcmp(dt, "int") == 0) int_norm(s); 65 | else if(strcmp(dt, "float") == 0) float_norm(s); 66 | else if(strcmp(dt, "double") == 0) double_norm(s); 67 | else if (world_rank==0){ 68 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 69 | } 70 | 71 | } 72 | 73 | else if(strcmp(o, "axpy") == 0){ 74 | 75 | if(strcmp(dt, "int") == 0) int_axpy(s); 76 | else if(strcmp(dt, "float") == 0) float_axpy(s); 77 | else if(strcmp(dt, "double") == 0) double_axpy(s); 78 | else if (world_rank==0){ 79 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 80 | } 81 | 82 | } 83 | 84 | else if(strcmp(o, "dmatvec_product") == 0){ 85 | 86 | if(strcmp(dt, "int") == 0) int_dmatvec_product(s); 87 | else if(strcmp(dt, "float") == 0) float_dmatvec_product(s); 
88 | else if(strcmp(dt, "double") == 0) double_dmatvec_product(s); 89 | else if (world_rank==0){ 90 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 91 | } 92 | 93 | } 94 | 95 | else if(strcmp(o, "spmv") == 0){ 96 | 97 | if(strcmp(dt, "int") == 0) int_dmatvec_product(r); 98 | else if(strcmp(dt, "float") == 0) float_spmatvec_product(r); 99 | else if(strcmp(dt, "double") == 0) double_spmatvec_product(r); 100 | else if (world_rank==0){ 101 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 102 | } 103 | 104 | } 105 | 106 | } 107 | 108 | /* Stencil codes */ 109 | else if (strcmp(b, "stencil") == 0){ 110 | 111 | if (strcmp(algo, "normal") == 0){ 112 | /* o is set to "dot_product" by default. Use this to check for a default */ 113 | if( strcmp(o, "27") == 0 || strcmp(o, "dot_product") == 0){ 114 | if(strcmp(dt, "double") == 0) double_stencil27(s); 115 | else if (strcmp(dt, "float") == 0) float_stencil27(s); 116 | else if (strcmp(dt, "int") == 0) int_stencil27(s); 117 | else if (world_rank==0){ 118 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 119 | } 120 | } 121 | 122 | else if(strcmp(o, "19") == 0){ 123 | if(strcmp(dt, "double") == 0) double_stencil19(s); 124 | else if (strcmp(dt, "float") == 0) float_stencil19(s); 125 | else if (strcmp(dt, "int") == 0) int_stencil19(s); 126 | else if (world_rank==0){ 127 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 128 | } 129 | } 130 | 131 | 132 | else if(strcmp(o, "9") == 0){ 133 | if(strcmp(dt, "double") == 0) double_stencil9(s); 134 | else if (strcmp(dt, "float") == 0) float_stencil9(s); 135 | else if (strcmp(dt, "int") == 0) int_stencil9(s); 136 | else if (world_rank==0){ 137 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 138 | } 139 | } 140 | 141 | 142 | else if(strcmp(o, "5") == 0){ 143 | if(strcmp(dt, "double") == 0) double_stencil5(s); 144 | else if (strcmp(dt, "float") == 0) float_stencil5(s); 145 | else 
if (strcmp(dt, "int") == 0) int_stencil5(s); 146 | else if (world_rank==0){ 147 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 148 | } 149 | } 150 | 151 | 152 | else if (world_rank==0){ 153 | fprintf(stderr, "ERROR: check you are using a valid operation type...\n"); 154 | } 155 | } 156 | else if (strcmp(algo, "overlapped") == 0) { 157 | /* o is set to "dot_product" by default. Use this to check for a default */ 158 | if( strcmp(o, "27") == 0 || strcmp(o, "dot_product") == 0){ 159 | if(strcmp(dt, "double") == 0) double_stencil27_overlapped(s); 160 | else if (strcmp(dt, "float") == 0) float_stencil27_overlapped(s); 161 | else if (strcmp(dt, "int") == 0) int_stencil27_overlapped(s); 162 | else if (world_rank==0){ 163 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 164 | } 165 | } 166 | 167 | else if(strcmp(o, "19") == 0){ 168 | if(strcmp(dt, "double") == 0) double_stencil19_overlapped(s); 169 | else if (strcmp(dt, "float") == 0) float_stencil19_overlapped(s); 170 | else if (strcmp(dt, "int") == 0) int_stencil19_overlapped(s); 171 | else if (world_rank==0){ 172 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 173 | } 174 | } 175 | 176 | 177 | else if(strcmp(o, "9") == 0){ 178 | if(strcmp(dt, "double") == 0) double_stencil9_overlapped(s); 179 | else if (strcmp(dt, "float") == 0) float_stencil9_overlapped(s); 180 | else if (strcmp(dt, "int") == 0) int_stencil9_overlapped(s); 181 | else if (world_rank==0){ 182 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 183 | } 184 | } 185 | 186 | 187 | else if(strcmp(o, "5") == 0){ 188 | if(strcmp(dt, "double") == 0) double_stencil5_overlapped(s); 189 | else if (strcmp(dt, "float") == 0) float_stencil5_overlapped(s); 190 | else if (strcmp(dt, "int") == 0) int_stencil5_overlapped(s); 191 | else if (world_rank==0){ 192 | fprintf(stderr, "ERROR: check you are using a valid data type...\n"); 193 | } 194 | } 195 | 196 | 197 | else if 
(world_rank==0){ 198 | fprintf(stderr, "ERROR: check you are using a valid operation type...\n"); 199 | } 200 | 201 | } 202 | else if (world_rank == 0) { 203 | fprintf(stderr, "ERROR: check you are using a valid algorithm.\n"); 204 | } 205 | } 206 | 207 | else if (strcmp(b, "fileparse") == 0){ 208 | 209 | if(strcmp(o, "dot_product") == 0){ 210 | fileparse(s); 211 | } 212 | 213 | else if (world_rank==0){ 214 | fprintf(stderr, "ERROR: check you are using a valid operation type...\n"); 215 | } 216 | 217 | } 218 | else if (strcmp(b, "cg") == 0) { 219 | if (strcmp(algo, "mixed") == 0) { 220 | conjugate_gradient_mixed(s); 221 | } 222 | else if (strcmp(algo, "normal") == 0) { 223 | conjugate_gradient(s); 224 | } 225 | else fprintf(stderr, "ERROR: check you are using a valid algorithm...\n"); 226 | } 227 | 228 | else if (world_rank==0){ 229 | fprintf(stderr, "ERROR: check you are using a valid benchmark...\n"); 230 | } 231 | 232 | 233 | } 234 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /cg.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. 
*/ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "utils.h" 30 | 31 | #define PCG_TOLERANCE 1e-3 32 | #define PCG_MAX_ITER 1000 33 | #define PCG_FLOAT_TOLERANCE 1e-2 34 | 35 | /* Conjugate gradient benchmark */ 36 | 37 | 38 | /* struct for CSR matrix type */ 39 | typedef struct 40 | { 41 | int nrow; 42 | int ncol; 43 | int nzmax; 44 | int *colIndex; 45 | int *rowStart; 46 | double *values; 47 | } CSRmatrix; 48 | 49 | typedef struct 50 | { 51 | int nrow; 52 | int ncol; 53 | int nzmax; 54 | int *colIndex; 55 | int *rowStart; 56 | float *values; 57 | } CSRmatrixF; 58 | 59 | /* 60 | * Sparse matrix and vector utility functions 61 | */ 62 | static void CSR_matrix_vector_mult(CSRmatrix *A, double *x, double *b) 63 | { 64 | int i, j; 65 | for (i = 0; i < A->nrow; i++) { 66 | double sum = 0.0; 67 | for (j = A->rowStart[i]; j < A->rowStart[i+1]; j++) { 68 | sum += A->values[j] * x[A->colIndex[j]]; 69 | } 70 | b[i] = sum; 71 | } 72 | } 73 | 74 | static void CSR_matrix_vector_multF(CSRmatrixF *A, float *x, float *b) 75 | { 76 | int i, j; 77 | for (i = 0; i < A->nrow; i++) { 78 | float sum = 0.0; 79 | for (j = A->rowStart[i]; j < A->rowStart[i+1]; j++) { 80 | sum += A->values[j] * x[A->colIndex[j]]; 81 | } 82 | b[i] = sum; 83 | } 84 | } 85 | 86 | static double dotProduct(double *v1, double *v2, int size) 87 | { 88 | int i; 89 | double result = 0.0; 90 | double full_result; 91 | for (i = 0; i < size; i++) { 92 | result += v1[i] * v2[i]; 93 | } 94 | MPI_Allreduce(&result, &full_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 95 | return full_result; 96 | } 97 | 98 | static float dotProductF(float *v1, float *v2, int size) 99 | { 100 | int i; 101 | float result = 0.0; 102 | float full_result; 103 | for (i = 0; i < size; i++) { 104 | result += v1[i] * v2[i]; 105 | } 106 | MPI_Allreduce(&result, &full_result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); 107 | return full_result; 108 | } 109 | 110 | static void 
vecAxpy(double *x, double *y, int size, double alpha) 111 | { 112 | int i; 113 | for (i = 0; i < size; i++) { 114 | y[i] = y[i] + alpha * x[i]; 115 | } 116 | } 117 | 118 | static void vecAxpyF(float *x, float *y, int size, float alpha) 119 | { 120 | int i; 121 | for (i = 0; i < size; i++) { 122 | y[i] = y[i] + alpha * x[i]; 123 | } 124 | } 125 | 126 | 127 | static void vecAypx(double *x, double *y, int size, double alpha) 128 | { 129 | int i; 130 | for (i = 0; i < size; i++) { 131 | y[i] = alpha * y[i] + x[i]; 132 | } 133 | } 134 | 135 | static void vecAypxF(float *x, float *y, int size, float alpha) 136 | { 137 | int i; 138 | for (i = 0; i < size; i++) { 139 | y[i] = alpha * y[i] + x[i]; 140 | } 141 | } 142 | 143 | 144 | int conjugate_gradient(unsigned int s) 145 | { 146 | CSRmatrix *A; 147 | int i; 148 | double *x, *b, *r, *p, *omega; 149 | int k; 150 | double r0, r1, beta, dot, alpha; 151 | double tol = PCG_TOLERANCE * PCG_TOLERANCE; 152 | 153 | struct timespec start, end; 154 | 155 | int size, rank; 156 | int local_s, local_start; 157 | double *full_p; 158 | 159 | MPI_Comm_size(MPI_COMM_WORLD, &size); 160 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 161 | 162 | /* determine local size and starting position */ 163 | local_s = s / size; 164 | local_start = local_s * rank; 165 | 166 | /*====================================================================== 167 | * 168 | * generate a random matrix of size s x s 169 | * 170 | *======================================================================*/ 171 | A = malloc(sizeof(CSRmatrix)); 172 | A->nrow = local_s; 173 | A->ncol = s; 174 | A->nzmax = local_s; 175 | A->colIndex = malloc(A->nzmax * sizeof(int)); 176 | A->rowStart = malloc((A->nrow+1) * sizeof(int)); 177 | A->values = malloc(A->nzmax * sizeof(double)); 178 | 179 | /* generate structure for matrix */ 180 | for (i = 0; i < A->nrow; i++) { 181 | A->rowStart[i] = i; 182 | A->colIndex[i] = i + local_start; 183 | } 184 | A->rowStart[i] = i; 185 | 186 | /* now generate 
values for matrix */ 187 | srand((unsigned int)time(NULL)); 188 | 189 | for (i = 0; i < A->nzmax; i++) { 190 | A->values[i] = rand() / 32768.0; 191 | } 192 | 193 | /*====================================================================== 194 | * 195 | * Initialise vectors 196 | * 197 | *======================================================================*/ 198 | /* allocate vectors (unknowns, RHS and temporaries) */ 199 | x = malloc(local_s * sizeof(double)); 200 | b = malloc(local_s * sizeof(double)); 201 | r = malloc(local_s * sizeof(double)); 202 | p = malloc(local_s * sizeof(double)); 203 | omega = malloc(local_s * sizeof(double)); 204 | 205 | full_p = malloc(s * sizeof(double)); 206 | 207 | /* generate a random vector of size s for the unknowns */ 208 | for (i = 0; i < local_s; i++) { 209 | x[i] = rand() / 32768.0; 210 | } 211 | 212 | /* multiply matrix by vector to get RHS */ 213 | CSR_matrix_vector_mult(A, x, b); 214 | 215 | /* clear initial guess and initialise temporaries */ 216 | for (i = 0; i < local_s; i++) { 217 | x[i] = 0.0; 218 | 219 | /* r = b - Ax; since x is 0, r = b */ 220 | r[i] = b[i]; 221 | 222 | /* p = r ( = b)*/ 223 | p[i] = b[i]; 224 | 225 | omega[i] = 0.0; 226 | } 227 | 228 | 229 | clock_gettime(CLOCK, &start); 230 | 231 | /* compute initial residual */ 232 | r1 = dotProduct(r, r, local_s); 233 | r0 = r1; 234 | 235 | /*====================================================================== 236 | * 237 | * Actual solver loop 238 | * 239 | *======================================================================*/ 240 | k = 0; 241 | while ((r1 > tol) && (k <= PCG_MAX_ITER)) { 242 | MPI_Allgather(p, local_s, MPI_DOUBLE, full_p, local_s, MPI_DOUBLE, MPI_COMM_WORLD); 243 | 244 | /* omega = Ap */ 245 | CSR_matrix_vector_mult(A, full_p, omega); 246 | 247 | /* dot = p . 
omega */ 248 | dot = dotProduct(p, omega, local_s); 249 | 250 | alpha = r1 / dot; 251 | 252 | /* x = x + alpha.p */ 253 | vecAxpy(p, x, local_s, alpha); 254 | 255 | /* r = r - alpha.omega */ 256 | vecAxpy(omega, r, local_s, -alpha); 257 | 258 | r0 = r1; 259 | 260 | /* r1 = r . r */ 261 | r1 = dotProduct(r, r, local_s); 262 | 263 | beta = r1 / r0; 264 | 265 | /* p = r + beta.p */ 266 | vecAypx(r, p, local_s, beta); 267 | k++; 268 | } 269 | 270 | clock_gettime(CLOCK, &end); 271 | if (rank == 0) { 272 | elapsed_time_hr(start, end, "Conjugate gradient solve."); 273 | } 274 | 275 | /*====================================================================== 276 | * 277 | * Free memory 278 | * 279 | *======================================================================*/ 280 | /* free the vectors */ 281 | free(omega); 282 | free(p); 283 | free(r); 284 | free(b); 285 | free(x); 286 | free(full_p); 287 | 288 | /* free the matrix */ 289 | free(A->colIndex); 290 | free(A->rowStart); 291 | free(A->values); 292 | free(A); 293 | return 0; 294 | } 295 | 296 | 297 |

/*
 * Conjugate gradient benchmark, mixed precision version.
 *
 * Sets up a random system in which every local row stores exactly one
 * entry, placed on the global diagonal, and keeps a single precision copy
 * of it.  The solver first iterates on the float copy until r.r drops to
 * PCG_FLOAT_TOLERANCE^2, then converts x, r and p to double and carries on
 * until r.r drops to PCG_TOLERANCE^2.  The iteration counter is shared by
 * both phases and bounded by PCG_MAX_ITER.  Rank 0 reports the elapsed time.
 *
 * Input:  s, global problem size; every rank owns s / (number of ranks) rows
 * Output: always 0
 */
int conjugate_gradient_mixed(unsigned int s)
{
  int nranks, my_rank;
  int n_local, row_offset;
  int row, idx, iter;
  struct timespec start, end;

  /* double precision system */
  CSRmatrix *mat;
  double *x, *b, *r, *p, *omega, *full_p;
  double r0, r1, beta, dot, alpha;

  /* single precision copy of the same system */
  CSRmatrixF *matf;
  float *xf, *bf, *rf, *pf, *omegaf, *full_pF;
  float r0f, r1f, betaf, dotf, alphaf;

  /* thresholds are compared against r.r, hence the squared tolerance */
  double tol = PCG_FLOAT_TOLERANCE * PCG_FLOAT_TOLERANCE;

  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  /* rows held by this rank and the global index of the first of them */
  n_local = s / nranks;
  row_offset = n_local * my_rank;

  /* ---- matrix storage: n_local rows, s columns, n_local stored entries ---- */
  mat = malloc(sizeof(CSRmatrix));
  mat->nrow = n_local;
  mat->ncol = s;
  mat->nzmax = n_local;
  mat->colIndex = malloc(mat->nzmax * sizeof(int));
  mat->rowStart = malloc((mat->nrow + 1) * sizeof(int));
  mat->values = malloc(mat->nzmax * sizeof(double));

  matf = malloc(sizeof(CSRmatrixF));
  matf->nrow = n_local;
  matf->ncol = s;
  matf->nzmax = n_local;
  matf->colIndex = malloc(matf->nzmax * sizeof(int));
  matf->rowStart = malloc((matf->nrow + 1) * sizeof(int));
  matf->values = malloc(matf->nzmax * sizeof(float));

  /* ---- sparsity pattern: row i holds entry i, in global column i + offset ---- */
  for (row = 0; row < mat->nrow; row++) {
    mat->rowStart[row] = row;
    mat->colIndex[row] = row + row_offset;

    matf->rowStart[row] = row;
    matf->colIndex[row] = row + row_offset;
  }
  /* closing sentinel of the row pointer arrays */
  mat->rowStart[row] = row;
  matf->rowStart[row] = row;

  /* ---- random matrix entries, mirrored into the float copy ---- */
  srand((unsigned int)time(NULL));

  for (idx = 0; idx < mat->nzmax; idx++) {
    mat->values[idx] = rand() / 32768.0;
    matf->values[idx] = (float)mat->values[idx];
  }

  /* ---- vectors: unknowns, right-hand side and temporaries ---- */
  x = malloc(n_local * sizeof(double));
  b = malloc(n_local * sizeof(double));
  r = malloc(n_local * sizeof(double));
  p = malloc(n_local * sizeof(double));
  omega = malloc(n_local * sizeof(double));

  /* receive buffer for the search direction gathered from all ranks */
  full_p = malloc(s * sizeof(double));

  xf = malloc(n_local * sizeof(float));
  bf = malloc(n_local * sizeof(float));
  rf = malloc(n_local * sizeof(float));
  pf = malloc(n_local * sizeof(float));
  omegaf = malloc(n_local * sizeof(float));

  full_pF = malloc(s * sizeof(float));

  /* random vector of unknowns, used only to manufacture the RHS */
  for (idx = 0; idx < n_local; idx++) {
    x[idx] = rand() / 32768.0;
    xf[idx] = (float)x[idx];
  }

  /* RHS = matrix * vector */
  CSR_matrix_vector_mult(mat, x, b);
  CSR_matrix_vector_multF(matf, xf, bf);

  /* start from x = 0, so r = b - Ax = b and p = r = b */
  for (idx = 0; idx < n_local; idx++) {
    x[idx] = 0.0;
    xf[idx] = 0.0;

    r[idx] = b[idx];
    rf[idx] = bf[idx];

    p[idx] = b[idx];
    pf[idx] = bf[idx];

    omega[idx] = 0.0;
    omegaf[idx] = 0.0;
  }

  clock_gettime(CLOCK, &start);

  /* initial residual */
  r1f = dotProductF(rf, rf, n_local);
  r0f = r1f;

  /* ---- phase 1: iterate in single precision ---- */
  for (iter = 0; (r1f > tol) && (iter <= PCG_MAX_ITER); iter++) {
    MPI_Allgather(pf, n_local, MPI_FLOAT, full_pF, n_local, MPI_FLOAT, MPI_COMM_WORLD);

    /* omega = Ap */
    CSR_matrix_vector_multF(matf, full_pF, omegaf);

    /* alpha = (r . r) / (p . omega) */
    dotf = dotProductF(pf, omegaf, n_local);
    alphaf = r1f / dotf;

    /* x = x + alpha.p */
    vecAxpyF(pf, xf, n_local, alphaf);

    /* r = r - alpha.omega */
    vecAxpyF(omegaf, rf, n_local, -alphaf);

    /* beta = (new r . r) / (old r . r) */
    r0f = r1f;
    r1f = dotProductF(rf, rf, n_local);
    betaf = r1f / r0f;

    /* p = r + beta.p */
    vecAypxF(rf, pf, n_local, betaf);
  }

  /* ---- hand the single precision state over to double precision ---- */
  r1 = (double)r1f;
  r0 = (double)r0f;
  for (idx = 0; idx < n_local; idx++) {
    r[idx] = (double)rf[idx];
    p[idx] = (double)pf[idx];
    x[idx] = (double)xf[idx];
  }

  tol = PCG_TOLERANCE * PCG_TOLERANCE;

  /* ---- phase 2: continue in double precision, iteration count carries on ---- */
  for (; (r1 > tol) && (iter <= PCG_MAX_ITER); iter++) {
    MPI_Allgather(p, n_local, MPI_DOUBLE, full_p, n_local, MPI_DOUBLE, MPI_COMM_WORLD);

    /* omega = Ap */
    CSR_matrix_vector_mult(mat, full_p, omega);

    /* alpha = (r . r) / (p . omega) */
    dot = dotProduct(p, omega, n_local);
    alpha = r1 / dot;

    /* x = x + alpha.p */
    vecAxpy(p, x, n_local, alpha);

    /* r = r - alpha.omega */
    vecAxpy(omega, r, n_local, -alpha);

    /* beta = (new r . r) / (old r . r) */
    r0 = r1;
    r1 = dotProduct(r, r, n_local);
    beta = r1 / r0;

    /* p = r + beta.p */
    vecAypx(r, p, n_local, beta);
  }

  clock_gettime(CLOCK, &end);
  if (my_rank == 0) {
    elapsed_time_hr(start, end, "Conjugate gradient solve.");
  }

  /* ---- release the vectors ---- */
  free(omega);
  free(p);
  free(r);
  free(b);
  free(x);
  free(full_p);

  free(omegaf);
  free(pf);
  free(rf);
  free(bf);
  free(xf);
  free(full_pF);

  /* ---- release both matrices ---- */
  free(mat->colIndex);
  free(mat->rowStart);
  free(mat->values);
  free(mat);

  free(matf->colIndex);
  free(matf->rowStart);
  free(matf->values);
  free(matf);

  return 0;
}
538 | -------------------------------------------------------------------------------- /blas_op.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2015 The University of Edinburgh. */ 2 | 3 | /* 4 | * This software was developed as part of the 5 | * EC FP7 funded project Adept (Project ID: 610490) 6 | * www.adept-project.eu 7 | */ 8 | 9 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 10 | /* you may not use this file except in compliance with the License. */ 11 | /* You may obtain a copy of the License at */ 12 | 13 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 14 | 15 | /* Unless required by applicable law or agreed to in writing, software */ 16 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 17 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 18 | /* See the License for the specific language governing permissions and */ 19 | /* limitations under the License. 
*/ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "level1.h" 31 | #include "utils.h" 32 | #include "matrix_utils.h" 33 | 34 | /* 35 | * Vector dot product, integers 36 | * 37 | * result = result + v1_i * v2_i 38 | * 39 | * Input: size of the vectors (in number of elements) 40 | * Output: dot product 41 | * 42 | */ 43 | int int_dot_product(unsigned int size) { 44 | 45 | int world_size, world_rank; 46 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 47 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 48 | 49 | 50 | int i; 51 | 52 | /* 53 | * Compute size of block each rank will work on 54 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 55 | * in the MPI case as in the serial case. 56 | */ 57 | int local_size = 0; 58 | if (world_rank != 0) { 59 | local_size = size / world_size; 60 | } else if (world_rank == 0) { 61 | local_size = (size / world_size) + (size % world_size); 62 | } else { 63 | printf("Some error occured in size calculation\n"); 64 | } 65 | 66 | 67 | /* create two vectors */ 68 | int *v1 = (int *) malloc(local_size * sizeof (int)); 69 | int *v2 = (int *) malloc(local_size * sizeof (int)); 70 | 71 | /* result variable */ 72 | unsigned int result = 0; 73 | 74 | if (v1 == NULL || v2 == NULL) { 75 | printf("Out Of Memory: could not allocate space for the two arrays.\n"); 76 | return 0; 77 | } 78 | 79 | srand((int) time(NULL)); 80 | 81 | struct timespec start, end; 82 | unsigned int global_result = 0; 83 | 84 | /* fill vectors with random integer values */ 85 | for (i = 0; i < local_size; i++) { 86 | v1[i] = (int) rand() / (int) (RAND_MAX / 10); 87 | v2[i] = (int) rand() / (int) (RAND_MAX / 10); 88 | } 89 | MPI_Barrier(MPI_COMM_WORLD); 90 | clock_gettime(CLOCK, &start); 91 | 92 | 93 | 94 | /* perform dot product */ 95 | 96 | for (i = 0; i < local_size; i++) { 97 | result = result + v1[i] * v2[i]; 98 | } 99 | 100 | 
MPI_Reduce(&result, &global_result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); 101 | 102 | clock_gettime(CLOCK, &end); 103 | 104 | /* print result so compiler does not throw it away */ 105 | if (world_rank == 0) { 106 | printf("Dot product result: %d\n", result); 107 | elapsed_time_hr(start, end, "Integer dot product."); 108 | } 109 | free(v1); 110 | free(v2); 111 | 112 | return 0; 113 | } 114 | 115 | /* 116 | * Vector dot product, floats 117 | * 118 | * result = result + v1_i * v2_i 119 | * 120 | * Input: size of the vectors (in number of elements) 121 | * Output: dot product 122 | * 123 | */ 124 | int float_dot_product(unsigned int size) { 125 | 126 | int world_size, world_rank; 127 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 128 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 129 | 130 | int i; 131 | 132 | /* 133 | * Compute size of block each rank will work on 134 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 135 | * in the MPI case as in the serial case. 
136 | */ 137 | int local_size = 0; 138 | if (world_rank != 0) { 139 | local_size = size / world_size; 140 | } else if (world_rank == 0) { 141 | local_size = (size / world_size) + (size % world_size); 142 | } else { 143 | printf("Some error occured in size calculation\n"); 144 | } 145 | 146 | 147 | /* create two vectors */ 148 | float *v1 = (float *) malloc(local_size * sizeof (float)); 149 | float *v2 = (float *) malloc(local_size * sizeof (float)); 150 | 151 | /* result variable */ 152 | float result = 0; 153 | 154 | if (v1 == NULL || v2 == NULL) { 155 | printf("Out Of Memory: could not allocate space for the two arrays.\n"); 156 | return 0; 157 | } 158 | 159 | srand((int) time(NULL)); 160 | 161 | struct timespec start, end; 162 | float global_result = 0; 163 | 164 | /* fill vectors with random integer values */ 165 | for (i = 0; i < local_size; i++) { 166 | v1[i] = (float) rand() / (float) (RAND_MAX / 10); 167 | v2[i] = (float) rand() / (float) (RAND_MAX / 10); 168 | } 169 | 170 | MPI_Barrier(MPI_COMM_WORLD); 171 | clock_gettime(CLOCK, &start); 172 | 173 | 174 | 175 | /* perform dot product */ 176 | 177 | for (i = 0; i < local_size; i++) { 178 | result = result + v1[i] * v2[i]; 179 | } 180 | 181 | MPI_Reduce(&result, &global_result, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 182 | 183 | clock_gettime(CLOCK, &end); 184 | 185 | /* print result so compiler does not throw it away */ 186 | if (world_rank == 0) { 187 | printf("Dot product result: %f\n", result); 188 | elapsed_time_hr(start, end, "Float dot product."); 189 | } 190 | free(v1); 191 | free(v2); 192 | 193 | return 0; 194 | 195 | } 196 | 197 | /* 198 | * Vector dot product, doubles 199 | * 200 | * result = result + v1_i * v2_i 201 | * 202 | * Input: size of the vectors (in number of elements) 203 | * Output: dot product 204 | * 205 | */ 206 | int double_dot_product(unsigned int size) { 207 | 208 | int world_size, world_rank; 209 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 210 | 
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 211 | 212 | int i; 213 | 214 | /* 215 | * Compute size of block each rank will work on 216 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 217 | * in the MPI case as in the serial case. 218 | */ 219 | int local_size = 0; 220 | if (world_rank != 0) { 221 | local_size = size / world_size; 222 | } else if (world_rank == 0) { 223 | local_size = (size / world_size) + (size % world_size); 224 | } else { 225 | printf("Some error occured in size calculation\n"); 226 | } 227 | 228 | 229 | /* create two vectors */ 230 | double *v1 = (double *) malloc(local_size * sizeof (double)); 231 | double *v2 = (double *) malloc(local_size * sizeof (double)); 232 | 233 | /* result variable */ 234 | double result = 0; 235 | 236 | if (v1 == NULL || v2 == NULL) { 237 | printf("Out Of Memory: could not allocate space for the two arrays.\n"); 238 | return 0; 239 | } 240 | 241 | srand((int) time(NULL)); 242 | 243 | struct timespec start, end; 244 | double global_result = 0; 245 | 246 | /* fill vectors with random integer values */ 247 | for (i = 0; i < local_size; i++) { 248 | v1[i] = (double) rand() / (double) (RAND_MAX / 10); 249 | v2[i] = (double) rand() / (double) (RAND_MAX / 10); 250 | } 251 | 252 | MPI_Barrier(MPI_COMM_WORLD); 253 | clock_gettime(CLOCK, &start); 254 | 255 | 256 | 257 | /* perform dot product */ 258 | 259 | for (i = 0; i < local_size; i++) { 260 | result = result + v1[i] * v2[i]; 261 | } 262 | 263 | MPI_Reduce(&result, &global_result, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 264 | 265 | clock_gettime(CLOCK, &end); 266 | 267 | /* print result so compiler does not throw it away */ 268 | if (world_rank == 0) { 269 | printf("Dot product result: %f\n", result); 270 | elapsed_time_hr(start, end, "Double dot product."); 271 | } 272 | free(v1); 273 | free(v2); 274 | 275 | return 0; 276 | 277 | } 278 | 279 | 280 | /* Vector scalar multiplication, integers */ 281 | 282 | /* v_i 
= a * v1_i */ 283 | int int_scalar_mult(unsigned int size) { 284 | 285 | int world_size, world_rank; 286 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 287 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 288 | 289 | int i; 290 | 291 | /* 292 | * Compute size of block each rank will work on 293 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 294 | * in the MPI case as in the serial case. 295 | */ 296 | int local_size = 0; 297 | int local_size_other = 0; 298 | int rcounts[world_size]; 299 | int displs[world_size]; 300 | 301 | if (world_rank != 0) { 302 | local_size = size / world_size; 303 | } else if (world_rank == 0) { 304 | local_size = (size / world_size) + (size % world_size); 305 | local_size_other = size / world_size; // Needed for gatherv 306 | rcounts[0] = local_size; 307 | displs[0] = 0; 308 | for (i = 1; i < world_size; i++) { 309 | rcounts[i] = local_size_other; 310 | displs[i] = i*local_size_other; 311 | } 312 | } else { 313 | printf("Some error occured in size calculation\n"); 314 | } 315 | 316 | /* create vector and scalar */ 317 | int *v = (int *) malloc(local_size * sizeof (int)); 318 | unsigned int a = 0; 319 | int* rbuf = NULL; 320 | 321 | /* We only need this space allocated on PE 0 */ 322 | if (world_rank == 0) { 323 | rbuf = (int *) malloc(size * sizeof (int)); 324 | if (rbuf == NULL) { 325 | printf("Out Of Memory: could not allocate space for the array.\n"); 326 | return 0; 327 | } 328 | } 329 | 330 | if (v == NULL) { 331 | printf("Out Of Memory: could not allocate space for the array.\n"); 332 | return 0; 333 | } 334 | 335 | srand((int) time(NULL)); 336 | 337 | struct timespec start, end; 338 | 339 | /* fill vector with random ints */ 340 | for (i = 0; i < local_size; i++) { 341 | v[i] = (int) rand() / (int) (RAND_MAX / 10); 342 | } 343 | 344 | /* assign random int value */ 345 | a = (int) rand() / (int) (RAND_MAX / 10); 346 | 347 | if (world_size > 1) { 348 | MPI_Bcast(&a, 1, MPI_INT, 0, 
MPI_COMM_WORLD); // Make everyone use the SAME a */ 349 | } 350 | 351 | MPI_Barrier(MPI_COMM_WORLD); 352 | 353 | if (world_rank == 0) { 354 | clock_gettime(CLOCK, &start); 355 | } 356 | 357 | 358 | /* perform scalar product */ 359 | for (i = 0; i < local_size; i++) { 360 | v[i] = a * v[i]; 361 | } 362 | 363 | /* Collect elements on PE 0 */ 364 | if (world_size > 1) { 365 | MPI_Gatherv(v, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD); 366 | } 367 | 368 | if (world_rank == 0) { 369 | clock_gettime(CLOCK, &end); 370 | } 371 | 372 | /* print result so compiler does not throw it away */ 373 | if (a == 999999) { 374 | printf("Scalar product result: %d\n", v[0]); 375 | } 376 | 377 | if (world_rank == 0) { 378 | elapsed_time_hr(start, end, "Int scalar multiplication."); 379 | free(rbuf); 380 | } 381 | free(v); 382 | 383 | 384 | return 0; 385 | 386 | } 387 | 388 | /* Vector scalar product, floats */ 389 | 390 | /* v_i = a * v1_i */ 391 | int float_scalar_mult(unsigned int size) { 392 | 393 | int world_size, world_rank; 394 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 395 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 396 | 397 | int i; 398 | 399 | /* 400 | * Compute size of block each rank will work on 401 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 402 | * in the MPI case as in the serial case. 
403 | */ 404 | int local_size = 0; 405 | int local_size_other = 0; 406 | int rcounts[world_size]; 407 | int displs[world_size]; 408 | 409 | if (world_rank != 0) { 410 | local_size = size / world_size; 411 | } else if (world_rank == 0) { 412 | local_size = (size / world_size) + (size % world_size); 413 | local_size_other = size / world_size; // Needed for gatherv 414 | rcounts[0] = local_size; 415 | displs[0] = 0; 416 | for (i = 1; i < world_size; i++) { 417 | rcounts[i] = local_size_other; 418 | displs[i] = i*local_size_other; 419 | } 420 | } else { 421 | printf("Some error occured in size calculation\n"); 422 | } 423 | 424 | /* create vector and scalar */ 425 | float *v = (float *) malloc(local_size * sizeof (float)); 426 | unsigned int a = 0; 427 | float* rbuf = NULL; 428 | 429 | /* We only need this space allocated on PE 0 */ 430 | if (world_rank == 0) { 431 | rbuf = (float *) malloc(size * sizeof (float)); 432 | if (rbuf == NULL) { 433 | printf("Out Of Memory: could not allocate space for the array.\n"); 434 | return 0; 435 | } 436 | } 437 | 438 | if (v == NULL) { 439 | printf("Out Of Memory: could not allocate space for the array.\n"); 440 | return 0; 441 | } 442 | 443 | srand((int) time(NULL)); 444 | 445 | struct timespec start, end; 446 | 447 | /* fill vector with random floats */ 448 | for (i = 0; i < local_size; i++) { 449 | v[i] = (float) rand() / (float) (RAND_MAX / 10); 450 | } 451 | 452 | /* assign random float value */ 453 | a = (float) rand() / (float) (RAND_MAX / 10); 454 | 455 | if (world_size > 1) { 456 | MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */ 457 | } 458 | MPI_Barrier(MPI_COMM_WORLD); 459 | if (world_rank == 0) { 460 | clock_gettime(CLOCK, &start); 461 | } 462 | 463 | 464 | /* perform scalar product */ 465 | for (i = 0; i < local_size; i++) { 466 | v[i] = a * v[i]; 467 | } 468 | 469 | /* Collect elements on PE 0 */ 470 | if (world_size > 1) { 471 | MPI_Gatherv(v, local_size, MPI_FLOAT, rbuf, rcounts, 
displs, MPI_FLOAT, 0, MPI_COMM_WORLD); 472 | } 473 | 474 | if (world_rank == 0) { 475 | clock_gettime(CLOCK, &end); 476 | } 477 | 478 | /* print result so compiler does not throw it away */ 479 | if (a == 999999) { 480 | printf("Scalar product result: %f\n", v[0]); 481 | } 482 | 483 | if (world_rank == 0) { 484 | elapsed_time_hr(start, end, "Float scalar multiplication."); 485 | free(rbuf); 486 | } 487 | free(v); 488 | 489 | return 0; 490 | 491 | } 492 | 493 | /* Vector scalar product, doubles */ 494 | 495 | /* v_i = a * v1_i */ 496 | int double_scalar_mult(unsigned int size) { 497 | 498 | 499 | int world_size, world_rank; 500 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 501 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 502 | 503 | int i; 504 | 505 | /* 506 | * Compute size of block each rank will work on 507 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 508 | * in the MPI case as in the serial case. 509 | */ 510 | int local_size = 0; 511 | int local_size_other = 0; 512 | int rcounts[world_size]; 513 | int displs[world_size]; 514 | 515 | if (world_rank != 0) { 516 | local_size = size / world_size; 517 | } else if (world_rank == 0) { 518 | local_size = (size / world_size) + (size % world_size); 519 | local_size_other = size / world_size; // Needed for gatherv 520 | rcounts[0] = local_size; 521 | displs[0] = 0; 522 | for (i = 1; i < world_size; i++) { 523 | rcounts[i] = local_size_other; 524 | displs[i] = i*local_size_other; 525 | } 526 | } else { 527 | printf("Some error occured in size calculation\n"); 528 | } 529 | 530 | /* create vector and scalar */ 531 | double *v = (double *) malloc(local_size * sizeof (double)); 532 | unsigned int a = 0; 533 | double* rbuf = NULL; 534 | 535 | /* We only need this space allocated on PE 0 */ 536 | if (world_rank == 0) { 537 | rbuf = (double *) malloc(size * sizeof (double)); 538 | if (rbuf == NULL) { 539 | printf("Out Of Memory: could not allocate space for the 
array.\n"); 540 | return 0; 541 | } 542 | } 543 | 544 | if (v == NULL) { 545 | printf("Out Of Memory: could not allocate space for the array.\n"); 546 | return 0; 547 | } 548 | 549 | srand((int) time(NULL)); 550 | 551 | struct timespec start, end; 552 | 553 | /* fill vector with random doubles */ 554 | for (i = 0; i < local_size; i++) { 555 | v[i] = (double) rand() / (double) (RAND_MAX / 10); 556 | } 557 | 558 | /* assign random double value */ 559 | a = (double) rand() / (double) (RAND_MAX / 10); 560 | if (world_size > 1) { 561 | MPI_Bcast(&a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */ 562 | } 563 | MPI_Barrier(MPI_COMM_WORLD); 564 | if (world_rank == 0) { 565 | clock_gettime(CLOCK, &start); 566 | } 567 | 568 | 569 | /* perform scalar product */ 570 | for (i = 0; i < local_size; i++) { 571 | v[i] = a * v[i]; 572 | } 573 | 574 | /* Collect elements on PE 0 */ 575 | if (world_size > 1) { 576 | MPI_Gatherv(v, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD); 577 | } 578 | 579 | if (world_rank == 0) { 580 | clock_gettime(CLOCK, &end); 581 | } 582 | 583 | /* print result so compiler does not throw it away */ 584 | if (a == 999999) { 585 | printf("Scalar product result: %lf\n", v[0]); 586 | } 587 | 588 | if (world_rank == 0) { 589 | elapsed_time_hr(start, end, "Double scalar multiplication."); 590 | free(rbuf); 591 | } 592 | free(v); 593 | 594 | 595 | return 0; 596 | 597 | } 598 | 599 | int double_norm(unsigned int size) { 600 | 601 | int world_size, world_rank; 602 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 603 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 604 | 605 | int i; 606 | 607 | /* 608 | * Compute size of block each rank will work on 609 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 610 | * in the MPI case as in the serial case. 
611 | */ 612 | int local_size = 0; 613 | if (world_rank != 0) { 614 | local_size = size / world_size; 615 | } else if (world_rank == 0) { 616 | local_size = (size / world_size) + (size % world_size); 617 | } else { 618 | printf("Some error occured in size calculation\n"); 619 | } 620 | 621 | 622 | double *v = (double *) malloc(local_size * sizeof (double)); 623 | double sum = 0.0, norm = 0.0; 624 | 625 | if (v == NULL) { 626 | printf("Out Of Memory: could not allocate space for the array.\n"); 627 | return 0; 628 | } 629 | 630 | srand((int) time(NULL)); 631 | 632 | struct timespec start, end; 633 | 634 | /* fill vector with random doubles */ 635 | for (i = 0; i < local_size; i++) { 636 | v[i] = (double) rand() / (double) (RAND_MAX / 10.0); 637 | } 638 | MPI_Barrier(MPI_COMM_WORLD); 639 | if (world_rank == 0) { 640 | clock_gettime(CLOCK, &start); 641 | } 642 | 643 | for (i = 0; i < local_size; i++) { 644 | sum = sum + (v[i] * v[i]); 645 | } 646 | 647 | /* REDUCE */ 648 | double global_sum = 0.0; 649 | MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 650 | if (world_rank == 0) { 651 | norm = sqrt(sum); 652 | clock_gettime(CLOCK, &end); 653 | elapsed_time_hr(start, end, "Double vector norm."); 654 | } 655 | /* print result so compiler does not throw it away */ 656 | 657 | if (v[0] == 99999) { 658 | printf("Norm = %f\n", norm); 659 | } 660 | 661 | free(v); 662 | return 0; 663 | } 664 | 665 | 666 | 667 | /* compute the Euclidean norm of a float vector */ 668 | /* !!!! naive implementation -- find algorithm that */ 669 | 670 | /* !!!! 
will avoid over/underflow for large vectors */ 671 | int float_norm(unsigned int size) { 672 | 673 | int world_size, world_rank; 674 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 675 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 676 | 677 | int i; 678 | 679 | /* 680 | * Compute size of block each rank will work on 681 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 682 | * in the MPI case as in the serial case. 683 | */ 684 | int local_size = 0; 685 | if (world_rank != 0) { 686 | local_size = size / world_size; 687 | } else if (world_rank == 0) { 688 | local_size = (size / world_size) + (size % world_size); 689 | } else { 690 | printf("Some error occured in size calculation\n"); 691 | } 692 | 693 | 694 | float *v = (float *) malloc(local_size * sizeof (float)); 695 | float sum = 0.0, norm = 0.0; 696 | 697 | if (v == NULL) { 698 | printf("Out Of Memory: could not allocate space for the array.\n"); 699 | return 0; 700 | } 701 | 702 | srand((int) time(NULL)); 703 | 704 | struct timespec start, end; 705 | 706 | /* fill vector with random floats */ 707 | for (i = 0; i < local_size; i++) { 708 | v[i] = (float) rand() / (float) (RAND_MAX / 10.0); 709 | } 710 | MPI_Barrier(MPI_COMM_WORLD); 711 | if (world_rank == 0) { 712 | clock_gettime(CLOCK, &start); 713 | } 714 | 715 | for (i = 0; i < local_size; i++) { 716 | sum = sum + (v[i] * v[i]); 717 | } 718 | 719 | /* REDUCE */ 720 | float global_sum = 0.0; 721 | MPI_Reduce(&sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 722 | if (world_rank == 0) { 723 | norm = sqrt(sum); 724 | clock_gettime(CLOCK, &end); 725 | elapsed_time_hr(start, end, "Float vector norm."); 726 | } 727 | /* print result so compiler does not throw it away */ 728 | 729 | if (v[0] == 99999) { 730 | printf("Norm = %f\n", norm); 731 | } 732 | 733 | free(v); 734 | return 0; 735 | } 736 | 737 | int int_norm(unsigned int size) { 738 | 739 | int world_size, world_rank; 740 | 
MPI_Comm_size(MPI_COMM_WORLD, &world_size); 741 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 742 | 743 | int i; 744 | 745 | /* 746 | * Compute size of block each rank will work on 747 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 748 | * in the MPI case as in the serial case. 749 | */ 750 | int local_size = 0; 751 | if (world_rank != 0) { 752 | local_size = size / world_size; 753 | } else if (world_rank == 0) { 754 | local_size = (size / world_size) + (size % world_size); 755 | } else { 756 | printf("Some error occured in size calculation\n"); 757 | } 758 | 759 | 760 | int *v = (int *) malloc(local_size * sizeof (int)); 761 | int sum = 0.0, norm = 0.0; 762 | 763 | if (v == NULL) { 764 | printf("Out Of Memory: could not allocate space for the array.\n"); 765 | return 0; 766 | } 767 | 768 | srand((int) time(NULL)); 769 | 770 | struct timespec start, end; 771 | 772 | /* fill vector with random ints */ 773 | for (i = 0; i < local_size; i++) { 774 | v[i] = (int) rand() / (int) (RAND_MAX / 10.0); 775 | } 776 | MPI_Barrier(MPI_COMM_WORLD); 777 | if (world_rank == 0) { 778 | clock_gettime(CLOCK, &start); 779 | } 780 | 781 | for (i = 0; i < local_size; i++) { 782 | sum = sum + (v[i] * v[i]); 783 | } 784 | 785 | /* REDUCE */ 786 | int global_sum = 0.0; 787 | MPI_Reduce(&sum, &global_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); 788 | if (world_rank == 0) { 789 | norm = sqrt(sum); 790 | clock_gettime(CLOCK, &end); 791 | elapsed_time_hr(start, end, "Int vector norm."); 792 | } 793 | /* print result so compiler does not throw it away */ 794 | 795 | if (v[0] == 99999) { 796 | printf("Norm = %d\n", norm); 797 | } 798 | 799 | free(v); 800 | return 0; 801 | } 802 | 803 | /* 804 | * 805 | * Compute vector-scalar product 806 | * AXPY, integers 807 | * 808 | * y = a * x + y 809 | * 810 | * Naive implementation 811 | * 812 | */ 813 | int int_axpy(unsigned int size) { 814 | 815 | 816 | int world_size, world_rank; 817 | 
MPI_Comm_size(MPI_COMM_WORLD, &world_size); 818 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 819 | 820 | int i; 821 | 822 | /* 823 | * Compute size of block each rank will work on 824 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 825 | * in the MPI case as in the serial case. 826 | */ 827 | int local_size = 0; 828 | int local_size_other = 0; 829 | int rcounts[world_size]; 830 | int displs[world_size]; 831 | 832 | if (world_rank != 0) { 833 | local_size = size / world_size; 834 | } else if (world_rank == 0) { 835 | local_size = (size / world_size) + (size % world_size); 836 | local_size_other = size / world_size; // Needed for gatherv 837 | rcounts[0] = local_size; 838 | displs[0] = 0; 839 | for (i = 1; i < world_size; i++) { 840 | rcounts[i] = local_size_other; 841 | displs[i] = i*local_size_other; 842 | } 843 | } else { 844 | printf("Some error occured in size calculation\n"); 845 | } 846 | 847 | 848 | int a; 849 | int *x = (int *) malloc(local_size * sizeof (int)); 850 | int *y = (int *) malloc(local_size * sizeof (int)); 851 | 852 | int* rbuf = NULL; 853 | 854 | /* We only need this space allocated on PE 0 */ 855 | if (world_rank == 0) { 856 | rbuf = (int *) malloc(size * sizeof (int)); 857 | if (rbuf == NULL) { 858 | printf("Out Of Memory: could not allocate space for the array.\n"); 859 | return 0; 860 | } 861 | } 862 | 863 | if (x == NULL || y == NULL) { 864 | printf("Out Of Memory: could not allocate space for the two arrays.\n"); 865 | return 0; 866 | } 867 | 868 | srand((int) time(NULL)); 869 | 870 | a = (int) rand() / (int) (RAND_MAX / 10); 871 | 872 | /* fill x and y vectors with random ints */ 873 | 874 | for (i = 0; i < local_size; i++) { 875 | x[i] = (int) rand() / (int) (RAND_MAX / 10); 876 | y[i] = (int) rand() / (int) (RAND_MAX / 10); 877 | } 878 | 879 | struct timespec start, end; 880 | if (world_size > 1) { 881 | MPI_Bcast(&a, 1, MPI_INT, 0, MPI_COMM_WORLD); // Make everyone use the 
SAME a */ 882 | } 883 | MPI_Barrier(MPI_COMM_WORLD); 884 | if (world_rank == 0) { 885 | clock_gettime(CLOCK, &start); 886 | } 887 | 888 | for (i = 0; i < local_size; i++) { 889 | y[i] = a * x[i] + y[i]; 890 | } 891 | if (world_size > 1) { 892 | MPI_Gatherv(y, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD); 893 | } 894 | 895 | if (world_rank == 0) { 896 | clock_gettime(CLOCK, &end); 897 | } 898 | 899 | /* print result so compiler does not throw it away */ 900 | if (a == 999999) { 901 | printf("Scalar product result: %d\n", rbuf[0]); 902 | } 903 | 904 | if (world_rank == 0) { 905 | elapsed_time_hr(start, end, "Int AXPY."); 906 | free(rbuf); 907 | } 908 | 909 | 910 | free(x); 911 | free(y); 912 | return 0; 913 | } 914 | 915 | /* 916 | * 917 | * Compute vector-scalar product 918 | * AXPY, floats 919 | * 920 | * y = a * x + y 921 | * 922 | * Naive implementation 923 | * 924 | */ 925 | int float_axpy(unsigned int size) { 926 | 927 | int world_size, world_rank; 928 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 929 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 930 | 931 | int i; 932 | 933 | /* 934 | * Compute size of block each rank will work on 935 | * We do this in a slightly odd method to try an ensure we have the same number of data plane layers (n) 936 | * in the MPI case as in the serial case. 
937 | */ 938 | int local_size = 0; 939 | int local_size_other = 0; 940 | int rcounts[world_size]; 941 | int displs[world_size]; 942 | 943 | if (world_rank != 0) { 944 | local_size = size / world_size; 945 | } else if (world_rank == 0) { 946 | local_size = (size / world_size) + (size % world_size); 947 | local_size_other = size / world_size; // Needed for gatherv 948 | rcounts[0] = local_size; 949 | displs[0] = 0; 950 | for (i = 1; i < world_size; i++) { 951 | rcounts[i] = local_size_other; 952 | displs[i] = i*local_size_other; 953 | } 954 | } else { 955 | printf("Some error occured in size calculation\n"); 956 | } 957 | 958 | 959 | float a; 960 | float *x = (float *) malloc(local_size * sizeof (float)); 961 | float *y = (float *) malloc(local_size * sizeof (float)); 962 | 963 | float* rbuf = NULL; 964 | 965 | /* We only need this space allocated on PE 0 */ 966 | if (world_rank == 0) { 967 | rbuf = (float *) malloc(size * sizeof (float)); 968 | if (rbuf == NULL) { 969 | printf("Out Of Memory: could not allocate space for the array.\n"); 970 | return 0; 971 | } 972 | } 973 | 974 | if (x == NULL || y == NULL) { 975 | printf("Out Of Memory: could not allocate space for the two arrays.\n"); 976 | return 0; 977 | } 978 | 979 | srand((int) time(NULL)); 980 | 981 | a = (float) rand() / (float) (RAND_MAX / 10); 982 | 983 | /* fill x and y vectors with random ints */ 984 | 985 | for (i = 0; i < local_size; i++) { 986 | x[i] = (float) rand() / (float) (RAND_MAX / 10); 987 | y[i] = (float) rand() / (float) (RAND_MAX / 10); 988 | } 989 | 990 | struct timespec start, end; 991 | if (world_size > 1) { 992 | MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); // Make everyone use the SAME a */ 993 | } 994 | MPI_Barrier(MPI_COMM_WORLD); 995 | if (world_rank == 0) { 996 | clock_gettime(CLOCK, &start); 997 | } 998 | 999 | for (i = 0; i < local_size; i++) { 1000 | y[i] = a * x[i] + y[i]; 1001 | } 1002 | if (world_size > 1) { 1003 | MPI_Gatherv(y, local_size, MPI_FLOAT, rbuf, rcounts, 
displs, MPI_FLOAT, 0, MPI_COMM_WORLD); 1004 | } 1005 | if (world_rank == 0) { 1006 | clock_gettime(CLOCK, &end); 1007 | } 1008 | 1009 | /* print result so compiler does not throw it away */ 1010 | if (a == 999999) { 1011 | printf("Scalar product result: %f\n", rbuf[0]); 1012 | } 1013 | 1014 | if (world_rank == 0) { 1015 | elapsed_time_hr(start, end, "Float AXPY."); 1016 | free(rbuf); 1017 | } 1018 | 1019 | 1020 | free(x); 1021 | free(y); 1022 | return 0; 1023 | }

/*
 * Compute the block decomposition used by the kernels below.
 *
 * Every rank owns size / world_size elements; rank 0 additionally owns the
 * remainder (size % world_size), so the total number of elements processed
 * is the same as in the serial case.
 *
 * size        - global problem size
 * world_size  - number of ranks in the communicator
 * world_rank  - rank of the caller
 * local_size  - out: number of elements owned by the calling rank
 * rcounts     - out: per-rank receive counts for MPI_Gatherv
 * displs      - out: per-rank receive offsets for MPI_Gatherv
 *
 * rcounts and displs are only filled in on rank 0, which is the only rank
 * where MPI_Gatherv reads them.
 */
static void level1_block_layout(unsigned int size, int world_size, int world_rank,
                                int *local_size, int *rcounts, int *displs) {
    const int base = (int) (size / (unsigned int) world_size);
    const int remainder = (int) (size % (unsigned int) world_size);
    int i;

    if (world_rank != 0) {
        *local_size = base;
        return;
    }

    *local_size = base + remainder;
    rcounts[0] = *local_size;
    displs[0] = 0;
    for (i = 1; i < world_size; i++) {
        rcounts[i] = base;
        /*
         * Rank 0's block is `remainder` elements longer than the others, so
         * every later block is shifted by that amount.  Using i * base here
         * would make rank 1 overwrite the tail of rank 0's data.
         */
        displs[i] = remainder + i * base;
    }
}

/*
 *
 * Compute vector-scalar product
 * AXPY, doubles
 *
 * y = a * x + y
 *
 * Naive implementation
 *
 * Input: global number of vector elements; each rank works on its own block
 * and rank 0 gathers the result and reports the elapsed time.
 * Returns 0 in all cases (including allocation failure, which is reported
 * on stdout).
 */
int double_axpy(unsigned int size) {
    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int i;
    int local_size = 0;
    int rcounts[world_size];
    int displs[world_size];
    struct timespec start, end;
    double a;
    double *x = NULL;
    double *y = NULL;
    double *rbuf = NULL;

    level1_block_layout(size, world_size, world_rank, &local_size, rcounts, displs);

    x = malloc(local_size * sizeof *x);
    y = malloc(local_size * sizeof *y);

    /* We only need this space allocated on PE 0 */
    if (world_rank == 0) {
        rbuf = malloc(size * sizeof *rbuf);
        if (rbuf == NULL) {
            printf("Out Of Memory: could not allocate space for the array.\n");
            goto cleanup;
        }
    }

    if (x == NULL || y == NULL) {
        printf("Out Of Memory: could not allocate space for the two arrays.\n");
        goto cleanup;
    }

    srand((int) time(NULL));

    a = (double) rand() / (double) (RAND_MAX / 10);

    /* fill x and y vectors with random doubles */
    for (i = 0; i < local_size; i++) {
        x[i] = (double) rand() / (double) (RAND_MAX / 10);
        y[i] = (double) rand() / (double) (RAND_MAX / 10);
    }

    if (world_size > 1) {
        /* Make everyone use the SAME a */
        MPI_Bcast(&a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

    /* Line all ranks up before the clock starts, as float_axpy does. */
    MPI_Barrier(MPI_COMM_WORLD);
    if (world_rank == 0) {
        clock_gettime(CLOCK, &start);
    }

    for (i = 0; i < local_size; i++) {
        y[i] = a * x[i] + y[i];
    }
    if (world_size > 1) {
        MPI_Gatherv(y, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (world_rank == 0) {
        clock_gettime(CLOCK, &end);
    }

    /* print result so compiler does not throw it away (rbuf only exists on rank 0) */
    if (a == 999999 && rbuf != NULL) {
        printf("Scalar product result: %lf\n", rbuf[0]);
    }

    if (world_rank == 0) {
        elapsed_time_hr(start, end, "Double AXPY.");
    }

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path and every rank */
    free(rbuf);
    free(x);
    free(y);
    return 0;
}

/*
 * Dense Matrix-Vector product, integers
 *
 * y = A * x
 * where A is a square matrix
 *
 * Input: number of elements in vectors and of rows/cols
 * in matrix specified as number of ints
 *
 * Each rank holds local_size rows of A and the whole of x; rank 0 gathers
 * the result vector and reports the elapsed time.
 * Returns 0 in all cases (including allocation failure, which is reported
 * on stdout).
 */
int int_dmatvec_product(unsigned int size) {

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int local_size = 0;
    int rcounts[world_size];
    int displs[world_size];
    int i, j;
    int rows = 0; /* number of rows of A successfully allocated so far */
    struct timespec start, end;

    int r1 = 0;
    int r2 = 0;

    int *x = NULL;
    int *y = NULL;
    int **A = NULL;
    int *rbuf = NULL;

    level1_block_layout(size, world_size, world_rank, &local_size, rcounts, displs);

    /* create two vectors and the row-pointer table of the matrix */
    x = malloc(size * sizeof *x);
    y = calloc(local_size, sizeof *y);
    A = malloc(local_size * sizeof *A);

    /* A must be checked before its rows are allocated through it */
    if (x == NULL || y == NULL || A == NULL) {
        printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
        goto cleanup;
    }

    /* create matrix rows */
    for (i = 0; i < local_size; i++) {
        A[i] = malloc(size * sizeof *A[i]);
        if (A[i] == NULL) {
            printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
            goto cleanup;
        }
        rows++;
    }

    /* We only need this space allocated on PE 0 */
    if (world_rank == 0) {
        rbuf = malloc(size * sizeof *rbuf);
        if (rbuf == NULL) {
            printf("Out Of Memory: could not allocate space for the array.\n");
            goto cleanup;
        }
    }

    srand((int) time(NULL));

    if (world_rank == 0) {
        r1 = (int) rand() / (int) (RAND_MAX / 10);
        r2 = (int) rand() / (int) (RAND_MAX / 10);
    }

    /* Synchronise the "random" values from 0 to all PEs */
    if (world_size > 1) {
        MPI_Bcast(&r1, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&r2, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    /* fill matrix A with "random" integer values */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            A[i][j] = r2;
        }
    }
    /* fill vector x with "random" integer values */
    for (i = 0; i < size; i++) {
        x[i] = r1;
    }

    /* Line all ranks up before the clock starts, as the float/double variants do. */
    MPI_Barrier(MPI_COMM_WORLD);
    clock_gettime(CLOCK, &start);

    /* perform matrix-vector product */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            y[i] = y[i] + A[i][j] * x[j];
        }
    }
    if (world_size > 1) {
        MPI_Gatherv(y, local_size, MPI_INT, rbuf, rcounts, displs, MPI_INT, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (world_rank == 0) {
        clock_gettime(CLOCK, &end);
        elapsed_time_hr(start, end, "Int Dense Matrix-Vector product.");
    }

    /* print result so compiler does not throw it away */
    if (r1 == 99999) {
        printf("Result vector y[0] = %d\n", y[0]);
    }

cleanup:
    /* only the first `rows` entries of A hold valid pointers */
    for (i = 0; i < rows; i++) {
        free(A[i]);
    }
    free(A);
    free(rbuf);
    free(y);
    free(x);

    return 0;

}

/*
 * Dense Matrix-Vector product, floats
 *
 * y = A * x
 * where A is a square matrix
 *
 * Input: number of elements in vectors and of rows/cols
 * in matrix specified as number of floats
 *
 * Each rank holds local_size rows of A and the whole of x; rank 0 gathers
 * the result vector and reports the elapsed time.
 * Returns 0 in all cases (including allocation failure, which is reported
 * on stdout).
 */
int float_dmatvec_product(unsigned int size) {

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int local_size = 0;
    int rcounts[world_size];
    int displs[world_size];
    int i, j;
    int rows = 0; /* number of rows of A successfully allocated so far */
    struct timespec start, end;

    float r1 = 0;
    float r2 = 0;

    float *x = NULL;
    float *y = NULL;
    float **A = NULL;
    float *rbuf = NULL;

    level1_block_layout(size, world_size, world_rank, &local_size, rcounts, displs);

    /* create two vectors and the row-pointer table of the matrix */
    x = malloc(size * sizeof *x);
    y = calloc(local_size, sizeof *y);
    A = malloc(local_size * sizeof *A);

    /* A must be checked before its rows are allocated through it */
    if (x == NULL || y == NULL || A == NULL) {
        printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
        goto cleanup;
    }

    /* create matrix rows */
    for (i = 0; i < local_size; i++) {
        A[i] = malloc(size * sizeof *A[i]);
        if (A[i] == NULL) {
            printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
            goto cleanup;
        }
        rows++;
    }

    /* We only need this space allocated on PE 0 */
    if (world_rank == 0) {
        rbuf = malloc(size * sizeof *rbuf);
        if (rbuf == NULL) {
            printf("Out Of Memory: could not allocate space for the array.\n");
            goto cleanup;
        }
    }

    srand((int) time(NULL));

    if (world_rank == 0) {
        r1 = (float) rand() / (float) (RAND_MAX / 10);
        r2 = (float) rand() / (float) (RAND_MAX / 10);
    }

    /* Synchronise the "random" values from 0 to all PEs */
    if (world_size > 1) {
        MPI_Bcast(&r1, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&r2, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    }
    /* fill matrix A with "random" float values */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            A[i][j] = r2;
        }
    }
    /* fill vector x with "random" float values */
    for (i = 0; i < size; i++) {
        x[i] = r1;
    }
    MPI_Barrier(MPI_COMM_WORLD);
    clock_gettime(CLOCK, &start);

    /* perform matrix-vector product */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            y[i] = y[i] + A[i][j] * x[j];
        }
    }
    if (world_size > 1) {
        MPI_Gatherv(y, local_size, MPI_FLOAT, rbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
    }
    if (world_rank == 0) {
        clock_gettime(CLOCK, &end);
        elapsed_time_hr(start, end, "Float Dense Matrix-Vector product.");
    }

    /* print result so compiler does not throw it away */
    if (r1 == 99999) {
        printf("Result vector y[0] = %f\n", y[0]);
    }

cleanup:
    /* only the first `rows` entries of A hold valid pointers */
    for (i = 0; i < rows; i++) {
        free(A[i]);
    }
    free(A);
    free(rbuf);
    free(y);
    free(x);
    return 0;

}

/*
 * Dense Matrix-Vector product, doubles
 *
 * y = A * x
 * where A is a square matrix
 *
 * Input: number of elements in vectors and of rows/cols
 * in matrix specified as number of doubles
 *
 * Each rank holds local_size rows of A and the whole of x; rank 0 gathers
 * the result vector and reports the elapsed time.
 * Returns 0 in all cases (including allocation failure, which is reported
 * on stdout).
 */
int double_dmatvec_product(unsigned int size) {

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int local_size = 0;
    int rcounts[world_size];
    int displs[world_size];
    int i, j;
    int rows = 0; /* number of rows of A successfully allocated so far */
    struct timespec start, end;

    double r1 = 0;
    double r2 = 0;

    double *x = NULL;
    double *y = NULL;
    double **A = NULL;
    double *rbuf = NULL;

    level1_block_layout(size, world_size, world_rank, &local_size, rcounts, displs);

    /* create two vectors and the row-pointer table of the matrix */
    x = malloc(size * sizeof *x);
    y = calloc(local_size, sizeof *y);
    A = malloc(local_size * sizeof *A);

    /* A must be checked before its rows are allocated through it */
    if (x == NULL || y == NULL || A == NULL) {
        printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
        goto cleanup;
    }

    /* create matrix rows */
    for (i = 0; i < local_size; i++) {
        A[i] = malloc(size * sizeof *A[i]);
        if (A[i] == NULL) {
            printf("Out Of Memory: could not allocate space for the vectors and matrix.\n");
            goto cleanup;
        }
        rows++;
    }

    /* We only need this space allocated on PE 0 */
    if (world_rank == 0) {
        rbuf = malloc(size * sizeof *rbuf);
        if (rbuf == NULL) {
            printf("Out Of Memory: could not allocate space for the array.\n");
            goto cleanup;
        }
    }

    srand((int) time(NULL));

    if (world_rank == 0) {
        r1 = (double) rand() / (double) (RAND_MAX / 10);
        r2 = (double) rand() / (double) (RAND_MAX / 10);
    }

    /* Synchronise the "random" values from 0 to all PEs */
    if (world_size > 1) {
        MPI_Bcast(&r1, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Bcast(&r2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    /* fill matrix A with "random" double values */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            A[i][j] = r2;
        }
    }
    /* fill vector x with "random" double values */
    for (i = 0; i < size; i++) {
        x[i] = r1;
    }
    MPI_Barrier(MPI_COMM_WORLD);
    clock_gettime(CLOCK, &start);
    /* perform matrix-vector product */
    for (i = 0; i < local_size; i++) {
        for (j = 0; j < size; j++) {
            y[i] = y[i] + A[i][j] * x[j];
        }
    }
    if (world_size > 1) {
        MPI_Gatherv(y, local_size, MPI_DOUBLE, rbuf, rcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

    if (world_rank == 0) {
        clock_gettime(CLOCK, &end);
        elapsed_time_hr(start, end, "Double Dense Matrix-Vector product.");
    }

    /* print result so compiler does not throw it away */
    if (r1 == 99999) {
        printf("Result vector y[0] = %lf\n", y[0]);
    }

cleanup:
    /* only the first `rows` entries of A hold valid pointers */
    for (i = 0; i < rows; i++) {
        free(A[i]);
    }
    free(A);
    free(rbuf);
    free(y);
    free(x);

    return 0;

}

1529 | int double_spmatvec_product(unsigned long r) { 1530 | 1531 | int world_size, world_rank; 1532 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 1533 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 1534 | MPI_Status status; 1535 | 1536 | 1537 | struct timespec start, end; 1538 | 1539 | char *filename = "matrix_in.csr"; 1540 | 1541 | int i, rep; 1542 | char* retval = NULL; 1543 | 1544 | if (r == ULONG_MAX) r = 10000; 1545 | 1546 | if (world_rank == 0) { 1547 | int m, n, nz; 1548 | 1549 | FILE *f; 1550 | char line[64]; 1551 | 1552 | if ((f = fopen(filename, "r")) == NULL) { 1553 | printf("can't open file <%s> \n", 
filename); 1554 | exit(1); 1555 | } 1556 | 1557 | retval = fgets(line, sizeof (line), f); 1558 | if (retval != NULL) { 1559 | sscanf(line, "%d %d %d", &nz, &n, &m); 1560 | } else { 1561 | printf("Error in reading line from file. Exiting.\n"); 1562 | return 1; 1563 | } 1564 | 1565 | printf("Number of elements of values and col_idx: %d; number of values in row_idx: %d\n", nz, m); 1566 | 1567 | double* values = malloc(nz * sizeof (double)); 1568 | /* int values_len = nz; */ 1569 | 1570 | // fill values 1571 | for (i = 0; i < nz; i++) { 1572 | 1573 | retval = fgets(line, sizeof (line), f); 1574 | if (retval != NULL) { 1575 | sscanf(line, "%lf", &values[i]); 1576 | } else { 1577 | printf("Error in reading line from file. Exiting.\n"); 1578 | return 1; 1579 | } 1580 | 1581 | 1582 | } 1583 | 1584 | int* col_idx = malloc(nz * sizeof (int)); 1585 | /* int col_idx_len = values_len; */ 1586 | 1587 | // fill col_idx 1588 | for (i = 0; i < nz; i++) { 1589 | retval = fgets(line, sizeof (line), f); 1590 | if (retval != NULL) { 1591 | sscanf(line, "%d", &col_idx[i]); 1592 | } else { 1593 | printf("Error in reading line from file. Exiting.\n"); 1594 | return 1; 1595 | } 1596 | 1597 | } 1598 | 1599 | int* row_idx = malloc((m + 1) * sizeof (int)); 1600 | int row_idx_len = m + 1; 1601 | // fill row_idx 1602 | for (i = 0; i < m; i++) { 1603 | retval = fgets(line, sizeof (line), f); 1604 | if (retval != NULL) { 1605 | sscanf(line, "%d", &row_idx[i]); 1606 | } else { 1607 | printf("Error in reading line from file. 
Exiting.\n"); 1608 | return 1; 1609 | } 1610 | } 1611 | row_idx[m] = nz; 1612 | 1613 | 1614 | int x_len = m - 1; 1615 | double* x = (double*) malloc(sizeof (double)*x_len); 1616 | for (i = 0; i < x_len; i++) { 1617 | x[i] = i + 1; 1618 | } 1619 | 1620 | fclose(f); 1621 | 1622 | 1623 | 1624 | double* b = (double*) malloc(sizeof (double)*x_len); 1625 | memset(b, 0, sizeof (*b)); 1626 | 1627 | 1628 | /* 1629 | * Compute how many members of row_idx to go each rank 1630 | * The last rank (world_size-1) gets the overflow 1631 | */ 1632 | int local_row_idx_len_array[world_size]; 1633 | for (i = 0; i < world_size; i++) { 1634 | local_row_idx_len_array[i] = (row_idx_len - 1) / world_size; 1635 | } 1636 | local_row_idx_len_array[world_size - 1] += (row_idx_len - 1) % world_size; 1637 | 1638 | /* 1639 | * Send the number of items from row_idx they will receive to each rank 1640 | * For this rank (rank 0), perform a local copy rather than via MPI 1641 | */ 1642 | int remote_row_offset = 0; 1643 | for (i = 1; i < world_size; i++) { 1644 | MPI_Send(&local_row_idx_len_array[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1645 | } 1646 | int local_row_idx_len = local_row_idx_len_array[0]; 1647 | 1648 | /* 1649 | * Send each rank the number of row_idx members it is due. 1650 | * Send each rank the index in row_idx that it's first element came from, 1651 | * this is important is placing the results in the correct place in the b 1652 | * (result) vector. 
1653 | * Send each rank the first index that rank+1 should receive to use as a termination condition 1654 | * For this rank (rank 0), perform a local copy rather than via MPI 1655 | */ 1656 | for (i = 1; i < world_size; i++) { 1657 | MPI_Send(&row_idx[i * local_row_idx_len], local_row_idx_len_array[i], MPI_INT, i, 0, MPI_COMM_WORLD); 1658 | remote_row_offset = i*local_row_idx_len; 1659 | MPI_Send(&remote_row_offset, 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1660 | MPI_Send(&row_idx[(i + 1) * local_row_idx_len], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1661 | } 1662 | 1663 | 1664 | 1665 | int* local_row_idx = (int*) malloc(sizeof (int)*(local_row_idx_len + 1)); 1666 | memcpy(&local_row_idx[0], &row_idx[0], sizeof (int)*local_row_idx_len_array[0]); 1667 | memcpy(&local_row_idx[local_row_idx_len], &row_idx[local_row_idx_len], sizeof (int)); 1668 | 1669 | /* 1670 | * Compute the number of values (members of values) for each rank 1671 | */ 1672 | int vals_per_rank[world_size]; 1673 | for (i = 0; i < world_size; i++) { 1674 | vals_per_rank[i] = row_idx[(i + 1) * local_row_idx_len] - row_idx[i * local_row_idx_len]; 1675 | } 1676 | 1677 | /* 1678 | * Send to each rank the number of values they will receive 1679 | * Send to each rank the values 1680 | * Send to each rank the column indices (col_idx) which will be equal 1681 | * to the number of values. 
1682 | * For this rank (rank 0), perform a local copy rather than via MPI 1683 | */ 1684 | int counter = 0; 1685 | counter = vals_per_rank[0]; 1686 | for (i = 1; i < world_size; i++) { 1687 | MPI_Send(&vals_per_rank[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1688 | MPI_Send(&values[counter], vals_per_rank[i], MPI_DOUBLE, i, 0, MPI_COMM_WORLD); 1689 | MPI_Send(&col_idx[counter], vals_per_rank[i], MPI_INT, i, 0, MPI_COMM_WORLD); 1690 | counter += vals_per_rank[i]; 1691 | } 1692 | double local_vals[vals_per_rank[0]]; 1693 | memcpy(&local_vals[0], &values[0], sizeof (double)*vals_per_rank[0]); 1694 | int local_col_idx[vals_per_rank[0]]; 1695 | memcpy(&local_col_idx[0], &col_idx[0], sizeof (int)*vals_per_rank[0]); 1696 | 1697 | 1698 | /* 1699 | * Broadcast the length of the vector x and vector x to each rank 1700 | */ 1701 | MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD); 1702 | MPI_Bcast(&x[0], x_len, MPI_DOUBLE, 0, MPI_COMM_WORLD); 1703 | 1704 | int ii, jj; 1705 | int local_row_adjust = 0; 1706 | 1707 | /* Main algorithm */ 1708 | MPI_Barrier(MPI_COMM_WORLD); 1709 | clock_gettime(CLOCK, &start); 1710 | for (rep = 0; rep < r; rep++) { 1711 | for (ii = 0; ii < local_row_idx_len - 1; ii++) { 1712 | for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) { 1713 | b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]); 1714 | } 1715 | } 1716 | } 1717 | 1718 | /* 1719 | * Reduce the b vector on all ranks to bb vector on this rank (rank 0). 
1720 | */ 1721 | double* bb = (double*) malloc(sizeof (double)*x_len); 1722 | MPI_Reduce(b, bb, x_len, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 1723 | clock_gettime(CLOCK, &end); 1724 | 1725 | elapsed_time_hr(start, end, "Sparse DMVs."); 1726 | 1727 | 1728 | free(b); 1729 | free(x); 1730 | free(bb); 1731 | free(local_row_idx); 1732 | free(row_idx); 1733 | free(values); 1734 | free(col_idx); 1735 | 1736 | /* Print result for checking */ 1737 | /* printf("bb "); */ 1738 | /* for(i=0;i \n", filename); 1828 | exit(1); 1829 | } 1830 | 1831 | retval = fgets(line, sizeof (line), f); 1832 | if (retval != NULL) { 1833 | sscanf(line, "%d %d %d", &nz, &n, &m); 1834 | } else { 1835 | printf("Error in reading line from file. Exiting.\n"); 1836 | return 1; 1837 | } 1838 | 1839 | printf("Number of elements of values and col_idx: %d; number of values in row_idx: %d\n", nz, m); 1840 | 1841 | /* 1842 | * Allocate memory for values on proc0 1843 | * We read everything in on this proc, then distribute. 1844 | */ 1845 | float* values = malloc(nz * sizeof (float)); 1846 | if (values = NULL){ 1847 | printf("Error allocating memory for values.\n"); 1848 | } 1849 | 1850 | /* int values_len = nz; */ 1851 | 1852 | // fill values 1853 | for (i = 0; i < nz; i++) { 1854 | 1855 | retval = fgets(line, sizeof (line), f); 1856 | if (retval != NULL) { 1857 | sscanf(line, "%f", &values[i]); // I think this does automatic down conversion if it's a double in the input file. 1858 | } else { 1859 | printf("Error in reading line from file. 
Exiting.\n"); 1860 | return 1; 1861 | } 1862 | 1863 | 1864 | } 1865 | 1866 | int* col_idx = malloc(nz * sizeof (int)); 1867 | if (col_idx == NULL){ 1868 | printf("Error allocating memory for col_idx.\n"); 1869 | } 1870 | /* int col_idx_len = values_len; */ 1871 | 1872 | /* fill col_idx */ 1873 | for (i = 0; i < nz; i++) { 1874 | retval = fgets(line, sizeof (line), f); 1875 | if (retval != NULL) { 1876 | sscanf(line, "%d", &col_idx[i]); 1877 | } else { 1878 | printf("Error in reading line from file. Exiting.\n"); 1879 | return 1; 1880 | } 1881 | 1882 | } 1883 | 1884 | int* row_idx = malloc((m + 1) * sizeof (int)); 1885 | int row_idx_len = m + 1; 1886 | // fill row_idx 1887 | for (i = 0; i < m; i++) { 1888 | retval = fgets(line, sizeof (line), f); 1889 | if (retval != NULL) { 1890 | sscanf(line, "%d", &row_idx[i]); 1891 | } else { 1892 | printf("Error in reading line from file. Exiting.\n"); 1893 | return 1; 1894 | } 1895 | } 1896 | row_idx[m] = nz; 1897 | 1898 | 1899 | int x_len = m - 1; 1900 | float* x = (float*) malloc(sizeof (float)*x_len); 1901 | if (x == NULL) { 1902 | printf("Error alloc x\n"); 1903 | } 1904 | 1905 | for (i = 0; i < x_len; i++) { 1906 | x[i] = i + 1; 1907 | } 1908 | 1909 | fclose(f); 1910 | 1911 | 1912 | 1913 | float* b = (float*) malloc(sizeof (float)*x_len); 1914 | if (b == NULL) { 1915 | printf("Error alloc b\n"); 1916 | } 1917 | memset(b, 0, sizeof (*b)); 1918 | 1919 | 1920 | /* 1921 | * Compute how many members of row_idx to go each rank 1922 | * The last rank (world_size-1) gets the overflow 1923 | */ 1924 | int local_row_idx_len_array[world_size]; 1925 | for (i = 0; i < world_size; i++) { 1926 | local_row_idx_len_array[i] = (row_idx_len - 1) / world_size; 1927 | } 1928 | local_row_idx_len_array[world_size - 1] += (row_idx_len - 1) % world_size; 1929 | 1930 | /* 1931 | * Send the number of items from row_idx they will receive to each rank 1932 | * For this rank (rank 0), perform a local copy rather than via MPI 1933 | */ 1934 | int 
remote_row_offset = 0; 1935 | for (i = 1; i < world_size; i++) { 1936 | MPI_Send(&local_row_idx_len_array[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1937 | } 1938 | int local_row_idx_len = local_row_idx_len_array[0]; 1939 | 1940 | /* 1941 | * Send each rank the number of row_idx members it is due. 1942 | * Send each rank the index in row_idx that it's first element came from, 1943 | * this is important is placing the results in the correct place in the b 1944 | * (result) vector. 1945 | * Send each rank the first index that rank+1 should receive to use as a termination condition 1946 | * For this rank (rank 0), perform a local copy rather than via MPI 1947 | */ 1948 | for (i = 1; i < world_size; i++) { 1949 | MPI_Send(&row_idx[i * local_row_idx_len], local_row_idx_len_array[i], MPI_INT, i, 0, MPI_COMM_WORLD); 1950 | remote_row_offset = i*local_row_idx_len; 1951 | MPI_Send(&remote_row_offset, 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1952 | MPI_Send(&row_idx[(i + 1) * local_row_idx_len], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1953 | } 1954 | 1955 | 1956 | 1957 | int* local_row_idx = (int*) malloc(sizeof (int)*(local_row_idx_len + 1)); 1958 | if (local_row_idx == NULL) { 1959 | printf("Error alloc local_row_idx\n"); 1960 | } 1961 | 1962 | memcpy(&local_row_idx[0], &row_idx[0], sizeof (int)*local_row_idx_len_array[0]); 1963 | memcpy(&local_row_idx[local_row_idx_len], &row_idx[local_row_idx_len], sizeof (int)); 1964 | 1965 | /* 1966 | * Compute the number of values (members of values) for each rank 1967 | */ 1968 | int vals_per_rank[world_size]; 1969 | for (i = 0; i < world_size; i++) { 1970 | vals_per_rank[i] = row_idx[(i + 1) * local_row_idx_len] - row_idx[i * local_row_idx_len]; 1971 | } 1972 | 1973 | /* 1974 | * Send to each rank the number of values they will receive 1975 | * Send to each rank the values 1976 | * Send to each rank the column indices (col_idx) which will be equal 1977 | * to the number of values. 
1978 | * For this rank (rank 0), perform a local copy rather than via MPI 1979 | */ 1980 | int counter = 0; 1981 | counter = vals_per_rank[0]; 1982 | for (i = 1; i < world_size; i++) { 1983 | MPI_Send(&vals_per_rank[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 1984 | MPI_Send(&values[counter], vals_per_rank[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); 1985 | MPI_Send(&col_idx[counter], vals_per_rank[i], MPI_INT, i, 0, MPI_COMM_WORLD); 1986 | counter += vals_per_rank[i]; 1987 | } 1988 | 1989 | /* float local_vals[vals_per_rank[0]]; */ 1990 | float *local_vals = (float*) malloc(sizeof (float) * vals_per_rank[0]); 1991 | if (local_vals == NULL) { 1992 | printf("Error alloc local_vals1\n"); 1993 | } 1994 | 1995 | memcpy(&local_vals[0], &values[0], sizeof (float)*vals_per_rank[0]); 1996 | /* int local_col_idx[vals_per_rank[0]]; */ 1997 | int *local_col_idx = (int*) malloc(sizeof (int) * vals_per_rank[0]); 1998 | if (local_vals == NULL) { 1999 | printf("Error alloc local_col_idx1\n"); 2000 | } 2001 | 2002 | memcpy(&local_col_idx[0], &col_idx[0], sizeof (int)*vals_per_rank[0]); 2003 | printf("A5.\n"); 2004 | 2005 | /* 2006 | * Broadcast the length of the vector x and vector x to each rank 2007 | */ 2008 | MPI_Bcast(&x_len, 1, MPI_INT, 0, MPI_COMM_WORLD); 2009 | MPI_Bcast(&x[0], x_len, MPI_FLOAT, 0, MPI_COMM_WORLD); 2010 | 2011 | int ii, jj; 2012 | int local_row_adjust = 0; 2013 | 2014 | /* Main algorithm */ 2015 | MPI_Barrier(MPI_COMM_WORLD); 2016 | clock_gettime(CLOCK, &start); 2017 | for (rep = 0; rep < r; rep++) { 2018 | for (ii = 0; ii < local_row_idx_len - 1; ii++) { 2019 | for (jj = local_row_idx[ii]; jj < local_row_idx[ii + 1]; jj++) { 2020 | b[ii + local_row_adjust] = b[ii + local_row_adjust] + (x[local_col_idx[jj]] * local_vals[jj]); 2021 | } 2022 | } 2023 | } 2024 | 2025 | /* 2026 | * Reduce the b vector on all ranks to bb vector on this rank (rank 0). 
2027 | */ 2028 | float* bb = (float*) malloc(sizeof (float)*x_len); 2029 | if (bb == NULL) { 2030 | printf("Error alloc bb\n"); 2031 | } 2032 | 2033 | MPI_Reduce(b, bb, x_len, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 2034 | clock_gettime(CLOCK, &end); 2035 | 2036 | elapsed_time_hr(start, end, "Sparse DMVs."); 2037 | 2038 | 2039 | free(b); 2040 | free(x); 2041 | free(bb); 2042 | free(local_row_idx); 2043 | free(row_idx); 2044 | free(values); 2045 | free(col_idx); 2046 | 2047 | /* Print result for checking */ 2048 | /* printf("bb "); */ 2049 | /* for(i=0;i