├── src ├── Schedgen │ ├── buffer_element.cpp │ ├── Makefile │ ├── buffer_element.hpp │ ├── goal_comm.h │ ├── trace_reader.hpp │ ├── schedgen_cmdline.ggo │ └── schedgen.hpp ├── liballprof2 │ ├── README.md │ ├── tracer_main.c │ └── gensem.py ├── Schedgen2 │ ├── config_example.json │ ├── TODO │ ├── README │ ├── additional_microbenchmarks.py │ ├── schedgen.py │ ├── mpi_colls.py │ ├── process_trace.py │ ├── goal.py │ └── patterns.py ├── liballprof │ ├── sync.h │ ├── mpi_helloworld.f90 │ ├── allprof.h │ ├── mpi_helloworld.c │ ├── numbers.h │ ├── wrapper.sh │ ├── template.c │ └── sync.c ├── Drawviz │ ├── Makefile │ ├── drawviz.ggo │ ├── TimelineDrawing.hpp │ ├── Drawviz.cpp │ ├── cmdline.h │ └── TimelineDrawing.cpp ├── LogGOPSim │ ├── txt2bin_cmdline.ggo │ ├── Makefile │ ├── loggopsim_cmdline.ggo │ ├── LogGOPSim.hpp │ ├── Goal.hpp │ ├── TimelineVisualization.hpp │ ├── binary_tree_32.goal │ └── Noise.hpp └── CMakeLists.txt ├── .gitignore ├── tests ├── mpi_helloworld.f90 └── mpi_helloworld.c ├── cmake ├── re2c.cmake ├── gengetopt.cmake ├── FindUnwind.cmake └── FindGraphviz.cmake ├── .github └── workflows │ └── build-and-test.yml ├── LICENCE ├── doc ├── README └── README-mpi-matching └── README.md /src/Schedgen/buffer_element.cpp: -------------------------------------------------------------------------------- 1 | #include "buffer_element.hpp" 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *_cmdline.c 3 | *_cmdline.h 4 | src/LogGOPSim/txt2bin.cpp 5 | -------------------------------------------------------------------------------- /src/liballprof2/README.md: -------------------------------------------------------------------------------- 1 | This is an attempt to recreate liballprof and fix some of its shortcomings at the same time. 2 | It is not functional / complete yet - see it as work in progress. 
-------------------------------------------------------------------------------- /src/Schedgen2/config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "ptrn": "allreduce", 3 | "algorithm": "ring", 4 | "comm_size": 16, 5 | "datasize": 1024, 6 | "output": "allreduce_ring_16_1024.bin", 7 | "txt2bin": "../../build/txt2bin" 8 | } -------------------------------------------------------------------------------- /src/liballprof/sync.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | double sync_tree(MPI_Comm comm); 9 | double sync_lin(MPI_Comm comm); 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | -------------------------------------------------------------------------------- /src/Schedgen2/TODO: -------------------------------------------------------------------------------- 1 | Trace Reader: 2 | - add calcs between mpi calls, make them dependent on each other 3 | 4 | GOAL: 5 | - implement re-rooting, now we assume rank 0 is root 6 | - implement comm-flattening 7 | - handle tags 8 | - get rid of GetLabel (label should be index in ops) 9 | -------------------------------------------------------------------------------- /src/Drawviz/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS= -O0 -Wno-deprecated -Wall 2 | CCFLAGS= -O0 3 | LDFLAGS= -lps -lboost_regex 4 | 5 | all: 6 | gengetopt < drawviz.ggo 7 | gcc $(CCFLAGS) -c *.c 8 | g++ $(CXXFLAGS) -c *.cpp 9 | g++ $(CXXFLAGS) *.o -o drawviz $(LDFLAGS) 10 | 11 | clean: 12 | rm -f cmdline.o 13 | rm -f *.o 14 | rm -f drawviz 15 | -------------------------------------------------------------------------------- /src/LogGOPSim/txt2bin_cmdline.ggo: -------------------------------------------------------------------------------- 1 | package "goalsim" 2 | version "0.1" 3 | 4 | option "input" i "Input file, 
textfile containing GOAL schedules" string typestr="filename" 5 | option "output" o "Output file, will contain the binary representation of the GOAL schedules" string typestr="filename" 6 | option "progress" p "Print progress information while parsing the schedule" flag off 7 | 8 | -------------------------------------------------------------------------------- /src/Schedgen2/README: -------------------------------------------------------------------------------- 1 | This is a prototype of a GOAL schedule generator in Python. 2 | At present this is not intended as a replacement for Schedgen. APIs might change drastically. 3 | The GOAL :) of this version is to make it easy to compose GOAL schedules, by keeping everything in memory. 4 | Schedgen on the other hand is optimized to minimize memory footprint and does not easily allow composability, i.e., supporting communicators. 5 | -------------------------------------------------------------------------------- /tests/mpi_helloworld.f90: -------------------------------------------------------------------------------- 1 | program helloworld 2 | implicit none 3 | include 'mpif.h' 4 | 5 | integer :: ierr, me, nproc 6 | double precision :: val 7 | 8 | call MPI_INIT(ierr) 9 | call MPI_COMM_RANK(MPI_COMM_WORLD,me,ierr) 10 | call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierr) 11 | 12 | call RANDOM_NUMBER(val) 13 | 14 | write(*,*) 'before', me, val 15 | call MPI_ALLREDUCE(MPI_IN_PLACE, val, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & 16 | & MPI_COMM_WORLD, ierr) 17 | write(*,*) 'after', me, val 18 | 19 | call MPI_FINALIZE(ierr); 20 | 21 | end program 22 | -------------------------------------------------------------------------------- /src/liballprof/mpi_helloworld.f90: -------------------------------------------------------------------------------- 1 | program helloworld 2 | implicit none 3 | include 'mpif.h' 4 | 5 | integer :: ierr, me, nproc 6 | double precision :: val 7 | 8 | call MPI_INIT(ierr) 9 | call 
MPI_COMM_RANK(MPI_COMM_WORLD,me,ierr) 10 | call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierr) 11 | 12 | call RANDOM_NUMBER(val) 13 | 14 | write(*,*) 'before', me, val 15 | call MPI_ALLREDUCE(MPI_IN_PLACE, val, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & 16 | & MPI_COMM_WORLD, ierr) 17 | write(*,*) 'after', me, val 18 | 19 | call MPI_FINALIZE(ierr); 20 | 21 | end program 22 | -------------------------------------------------------------------------------- /src/Schedgen/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS=-g -O3 -Wno-deprecated -Wall -std=c++11 2 | CCFLAGS=-g -O3 -g 3 | LDFLAGS=-g -O3 -g -lboost_iostreams -L/opt/homebrew/lib/ 4 | 5 | force: all 6 | 7 | schedgen_cmdline.c: schedgen_cmdline.ggo 8 | gengetopt -i $< -F schedgen_cmdline 9 | 10 | %.o: %.cpp *hpp *h 11 | ${CXX} $(CXXFLAGS) -c $< 12 | 13 | %.o: %.c *h 14 | ${CC} $(CCFLAGS) -c $< 15 | 16 | all: buffer_element.o schedgen_cmdline.o process_trace.o schedgen_cmdline.ggo schedgen.o 17 | ${CXX} $(CXXFLAGS) *.o -o schedgen $(LDFLAGS) 18 | 19 | clean: 20 | rm -f *.o 21 | rm -f schedgen_cmdline.c schedgen_cmdline.h 22 | rm -f schedgen 23 | -------------------------------------------------------------------------------- /src/Drawviz/drawviz.ggo: -------------------------------------------------------------------------------- 1 | package "drawviz" 2 | version "0.1" 3 | 4 | option "inputfile" i "Name of the inputfile (event data)" string 5 | option "outputfile" o "Name of the output file (postscript)" default="timeline.ps" string optional 6 | option "linethickness" l "Thickness of lines" default="1" int optional 7 | option "starttime" s "Starttime, if only a interval should be drawn" default="0" int optional 8 | option "endtime" e "Endtime, if only a interval should be drawn" default="0" int optional 9 | option "arrowheads" - "If this flag is given, arrowheads will be drawn" flag off 10 | option "descrtext" - "If this flag is given, text will be written below o_send 
and o_recv" flag off 11 | -------------------------------------------------------------------------------- /src/liballprof/allprof.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * liballprof MPIP Wrapper 3 | * 4 | * Copyright: Indiana University 5 | * Author: Torsten Hoefler 6 | * 7 | *************************************************************************/ 8 | 9 | #define VERSION 1 10 | 11 | /* undef to disable banner printing */ 12 | #define PRINT_BANNER 13 | 14 | /* trace file prefix (relative to run directory) and suffix */ 15 | #define FILE_PREFIX "/tmp/pmpi-trace-rank-" 16 | #define FILE_SUFFIX ".txt" 17 | 18 | /* undef to disable writer thread */ 19 | #define WRITER_THREAD 20 | #define BUFSIZE 10485760 21 | #define THRESHOLD 8388608 22 | 23 | /* IBM only implements a subset of MPI-2 */ 24 | /* #define IBM_BROKEN_MPI */ 25 | -------------------------------------------------------------------------------- /cmake/re2c.cmake: -------------------------------------------------------------------------------- 1 | macro (find_re2c) 2 | if (NOT RE2C_EXECUTABLE) 3 | find_program (RE2C_EXECUTABLE re2c) 4 | if (NOT RE2C_EXECUTABLE) 5 | message (FATAL_ERROR "re2c not found. Aborting...") 6 | endif () 7 | endif () 8 | endmacro () 9 | 10 | macro (add_re2c_files _basename) 11 | find_re2c() 12 | 13 | set (_re2c_in ${CMAKE_CURRENT_SOURCE_DIR}/${_basename}.re) 14 | set (_re2c_out ${CMAKE_CURRENT_SOURCE_DIR}/${_basename}.cpp) 15 | 16 | get_filename_component(_basepath ${_basename} DIRECTORY) 17 | get_filename_component(_basefile ${_basename} NAME) 18 | 19 | add_custom_command ( 20 | OUTPUT ${_re2c_out} 21 | COMMAND re2c ${_re2c_in} -o ${_re2c_out} 22 | DEPENDS ${_re2c_in} 23 | # BYPRODUCTS 24 | COMMENT "Generating re2c parser code ..." 
25 | VERBATIM 26 | ) 27 | 28 | endmacro (add_re2c_files) 29 | -------------------------------------------------------------------------------- /tests/mpi_helloworld.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | int main(int argc, char **argv) { 7 | /* ------ MPI specific ------- */ 8 | int rank; /* MPI rank */ 9 | int procs; /* number of mpi procs */ 10 | double i; 11 | int ret; 12 | MPI_Request reqs[2]; 13 | 14 | MPI_Init(&argc, &argv); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 16 | MPI_Comm_size(MPI_COMM_WORLD, &procs); 17 | 18 | i = (double)rand(); 19 | 20 | MPI_Isend(&rank, 1, MPI_INT, (rank+1)%procs, 0, MPI_COMM_WORLD, &reqs[0]); 21 | MPI_Irecv(&rank, 1, MPI_INT, (rank-1+procs)%procs, 0, MPI_COMM_WORLD, &reqs[1]); 22 | MPI_Waitall(2,reqs,MPI_STATUSES_IGNORE); 23 | 24 | printf("before rank %u: i=%f\n", rank, i); 25 | ret = MPI_Allreduce(MPI_IN_PLACE, &i, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 26 | printf("after rank %u: i=%f\n", rank, i); 27 | 28 | 29 | printf("Hello from rank %u\n", rank); 30 | fflush(stdout); 31 | 32 | MPI_Finalize(); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /src/liballprof/mpi_helloworld.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | int main(int argc, char **argv) { 7 | /* ------ MPI specific ------- */ 8 | int rank; /* MPI rank */ 9 | int procs; /* number of mpi procs */ 10 | double i; 11 | int ret; 12 | MPI_Request reqs[2]; 13 | 14 | MPI_Init(&argc, &argv); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 16 | MPI_Comm_size(MPI_COMM_WORLD, &procs); 17 | 18 | i = (double)rand(); 19 | 20 | MPI_Isend(&rank, 1, MPI_INT, (rank+1)%procs, 0, MPI_COMM_WORLD, &reqs[0]); 21 | MPI_Irecv(&rank, 1, MPI_INT, (rank-1+procs)%procs, 0, MPI_COMM_WORLD, &reqs[1]); 22 | MPI_Waitall(2,reqs,MPI_STATUSES_IGNORE); 23 | 
24 | printf("before rank %u: i=%f\n", rank, i); 25 | ret = MPI_Allreduce(MPI_IN_PLACE, &i, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 26 | printf("after rank %u: i=%f\n", rank, i); 27 | 28 | 29 | printf("Hello from rank %u\n", rank); 30 | fflush(stdout); 31 | 32 | MPI_Finalize(); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /src/liballprof/numbers.h: -------------------------------------------------------------------------------- 1 | #define LOG_MPI_INT 1 2 | #define LOG_MPI_INTEGER 2 3 | #define LOG_MPI_LONG 3 4 | #define LOG_MPI_SHORT 4 5 | #define LOG_MPI_UNSIGNED 5 6 | #define LOG_MPI_UNSIGNED_LONG 6 7 | #define LOG_MPI_UNSIGNED_SHORT 7 8 | #define LOG_MPI_FLOAT 8 9 | #define LOG_MPI_REAL 9 10 | #define LOG_MPI_DOUBLE 10 11 | #define LOG_MPI_DOUBLE_PRECISION 11 12 | #define LOG_MPI_LONG_DOUBLE 12 13 | #define LOG_MPI_BYTE 13 14 | #define LOG_MPI_FLOAT_INT 14 15 | #define LOG_MPI_DOUBLE_INT 15 16 | #define LOG_MPI_LONG_INT 16 17 | #define LOG_MPI_2INT 17 18 | #define LOG_MPI_SHORT_INT 18 19 | #define LOG_MPI_LONG_DOUBLE_INT 19 20 | #define LOG_MPI_LOGICAL 20 21 | #define LOG_MPI_COMPLEX 21 22 | #define LOG_MPI_CHARACTER 21 23 | #define LOG_MPI_DOUBLE_COMPLEX 22 24 | 25 | 26 | #define LOG_MPI_MIN 1 27 | #define LOG_MPI_MAX 2 28 | #define LOG_MPI_SUM 3 29 | #define LOG_MPI_PROD 4 30 | #define LOG_MPI_LAND 5 31 | #define LOG_MPI_BAND 6 32 | #define LOG_MPI_LOR 7 33 | #define LOG_MPI_BOR 8 34 | #define LOG_MPI_LXOR 9 35 | #define LOG_MPI_BXOR 10 36 | #define LOG_MPI_MINLOC 11 37 | #define LOG_MPI_MAXLOC 12 38 | -------------------------------------------------------------------------------- /cmake/gengetopt.cmake: -------------------------------------------------------------------------------- 1 | macro (find_gengetopt) 2 | if (NOT GENGETOPT_EXECUTABLE) 3 | find_program (GENGETOPT_EXECUTABLE gengetopt) 4 | if (NOT GENGETOPT_EXECUTABLE) 5 | message (FATAL_ERROR "gengetopt not found. 
Aborting...") 6 | endif () 7 | endif () 8 | endmacro () 9 | 10 | macro (add_gengetopt_files _basename) 11 | find_gengetopt () 12 | 13 | set (_ggo_extra_input ${ARGV}) 14 | 15 | set (_ggo_c ${CMAKE_CURRENT_SOURCE_DIR}/${_basename}.c) 16 | set (_ggo_h ${CMAKE_CURRENT_SOURCE_DIR}/${_basename}.h) 17 | set (_ggo_g ${CMAKE_CURRENT_SOURCE_DIR}/${_basename}.ggo) 18 | 19 | get_filename_component(_basepath ${_basename} DIRECTORY) 20 | get_filename_component(_basefile ${_basename} NAME) 21 | 22 | add_custom_command ( 23 | OUTPUT ${_ggo_c} ${_ggo_h} 24 | COMMAND gengetopt -F ${_basefile} -i ${_ggo_g} --output-dir ${CMAKE_CURRENT_SOURCE_DIR}/${_basepath} 25 | DEPENDS ${_ggo_g} 26 | # BYPRODUCTS 27 | COMMENT "Generating getopt parser code ..." 28 | VERBATIM 29 | ) 30 | 31 | set (GGO_C ${_ggo_c}) 32 | set (GGO_H ${_ggo_h}) 33 | 34 | endmacro (add_gengetopt_files) 35 | -------------------------------------------------------------------------------- /src/Schedgen2/additional_microbenchmarks.py: -------------------------------------------------------------------------------- 1 | from patterns import linear 2 | 3 | 4 | def incast( 5 | comm_size: int, 6 | datasize: int, 7 | tag: int = 42, 8 | ptrn: str = "linear", 9 | randomized_data: bool = False, 10 | **kwargs, 11 | ): 12 | assert ptrn == "linear", "incast only supports the linear communication pattern" 13 | return linear( 14 | comm_size=comm_size, 15 | datasize=datasize, 16 | tag=tag, 17 | algorithm="incast", 18 | parallel=True, 19 | randomized_data=randomized_data, 20 | **kwargs, 21 | ) 22 | 23 | 24 | def outcast( 25 | comm_size: int, 26 | datasize: int, 27 | tag: int = 42, 28 | ptrn: str = "linear", 29 | randomized_data: bool = False, 30 | **kwargs, 31 | ): 32 | assert ptrn == "linear", "outcast only supports the linear communication pattern" 33 | return linear( 34 | comm_size=comm_size, 35 | datasize=datasize, 36 | tag=tag, 37 | algorithm="outcast", 38 | parallel=True, 39 | randomized_data=randomized_data, 40 | **kwargs, 41 | ) 
42 | -------------------------------------------------------------------------------- /src/liballprof/wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #COMPRESS="bzip2 -c" 4 | #SUFFIX="bz2" 5 | COMPRESS="gzip -c" 6 | SUFFIX="gz" 7 | 8 | HOST=$(hostname -s) 9 | VERBOSE=false 10 | if [ -f $HOME/.wrapper_verbose ]; then 11 | VERBOSE=true 12 | fi 13 | 14 | #echo "[$HOST] clearing /tmp ..." 15 | rm -f /tmp/pmpi-trace-rank-*txt 16 | 17 | if $VERBOSE; then 18 | echo "[$HOST] htor profiling wrapper: executing $@ ..." 19 | fi 20 | 21 | 22 | # execute the command ... use "$@" so arguments containing whitespace are not re-split 23 | "$@" 24 | 25 | if [ x"$HTOR_PMPI_FILE_PREFIX" == "x" ]; then 26 | HTOR_PMPI_FILE_PREFIX="/tmp/pmpi-trace-rank-" 27 | fi; 28 | 29 | for i in $(ls -1 $HTOR_PMPI_FILE_PREFIX*txt 2>/dev/null); do 30 | if test -f $i; then 31 | TMP=$(mktemp) 32 | if $VERBOSE; then 33 | echo "[$HOST] moving $i to $TMP to have exclusive access ..." 34 | fi 35 | # one process wins the move -- and mv should be atomic in any 36 | # reasonable FS :) 37 | mv $i $TMP 2> /dev/null 38 | # if I won ... compress it ... 39 | if test -s $TMP; then 40 | if $VERBOSE; then 41 | echo "[$HOST] compressing $i ($TMP) ..." 42 | fi 43 | cat $TMP | $COMPRESS > $(basename $i).$SUFFIX; 44 | fi; 45 | rm $TMP 46 | fi; 47 | done; 48 | 49 | -------------------------------------------------------------------------------- /.github/workflows/build-and-test.yml: -------------------------------------------------------------------------------- 1 | name: CMake on a single platform 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 
11 | BUILD_TYPE: Release 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Install build dependencies 21 | run: sudo apt-get install gengetopt re2c libgraphviz-dev python3 libclang-15-dev llvm-15-dev python3-clang-15 openmpi-bin openmpi-common libopenmpi-dev libunwind-dev 22 | 23 | - name: Configure CMake 24 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 25 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 26 | run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} src 27 | 28 | - name: Build 29 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 30 | 31 | - name: Test 32 | working-directory: ${{github.workspace}}/build 33 | run: ctest --output-on-failure -C ${{env.BUILD_TYPE}} 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/Schedgen/buffer_element.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #include 12 | 13 | // This class stores the information of a single element of an address list 14 | // These things consist of three entries: The type, which can be IN=1 or OUT=2, 15 | // indicated by a '<' or '>' in the schedule. Then there is the actual address 16 | // in memory which is a simple integer in our language. And last, there is the 17 | // size of data referenced by this address which is an integer and denotes the 18 | // size in bytes. 
19 | 20 | typedef uint64_t btime_t; 21 | 22 | class buffer_element { 23 | public: 24 | int type; // IN=1, OUT=2 25 | int addr; // address where to read/write 26 | btime_t size; // size of data to read/write in bytes 27 | 28 | buffer_element() : type(0), addr(0), size(0) {}; 29 | buffer_element(const buffer_element &elem) : type(elem.type), addr(elem.addr), size(elem.size) {}; 30 | buffer_element(int t, int a, btime_t s) : type(t), addr(a), size(s) {}; 31 | buffer_element& operator=(const buffer_element &elem) { 32 | type = elem.type; 33 | addr = elem.addr; 34 | size = elem.size; 35 | return *this; 36 | }; 37 | 38 | }; 39 | 40 | -------------------------------------------------------------------------------- /src/LogGOPSim/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS= -std=c++11 -O0 -g -pedantic -Wno-deprecated -Wall -Wno-long-long -I/opt/homebrew/include/ 3 | CCFLAGS= -O0 -g 4 | LDFLAGS= -L/opt/homebrew/lib/ -lcgraph -g 5 | 6 | AUTOGEN_SRC= loggopsim_cmdline.c loggopsim_cmdline.h txt2bin_cmdline.h txt2bin_cmdline.c 7 | LOGGOPSIM_OBJECTS= LogGOPSim.o 8 | HLPR_OBJECTS= loggopsim_cmdline.o 9 | ALL_OBJECTS= $(LOGGOPSIM_OBJECTS) $(HLPR_OBJECTS) 10 | BINARY= LogGOPSim 11 | 12 | all: $(ALL_OBJECTS) $(AUTOGEN_SRC) txt2bin 13 | $(CXX) $(CXXFLAGS) $(ALL_OBJECTS) -o $(BINARY) $(LDFLAGS) 14 | 15 | txt2bin: 16 | re2c -o txt2bin.cpp txt2bin.re 17 | gengetopt -F txt2bin_cmdline -i txt2bin_cmdline.ggo 18 | $(CXX) -g -O3 txt2bin.cpp txt2bin_cmdline.c -o txt2bin 19 | 20 | loggopsim_cmdline.c: loggopsim_cmdline.ggo 21 | gengetopt -F loggopsim_cmdline -i loggopsim_cmdline.ggo 22 | 23 | loggopsim_cmdline.h: loggopsim_cmdline.ggo 24 | gengetopt -F loggopsim_cmdline -i loggopsim_cmdline.ggo 25 | 26 | txt2bin_cmdline.c: txt2bin_cmdline.ggo 27 | gengetopt -F txt2bin_cmdline -i txt2bin_cmdline.ggo 28 | 29 | txt2bin_cmdline.h: txt2bin_cmdline.ggo 30 | gengetopt -F txt2bin_cmdline -i txt2bin_cmdline.ggo 31 | 32 | 33 | 34 | 
%.o: %.cpp $(AUTOGEN_SRC) *.hpp 35 | $(CXX) $(CXXFLAGS) -c $< 36 | 37 | %.o: %.c $(AUTOGEN_SRC) *.h 38 | $(CXX) $(CCFLAGS) -c $< 39 | 40 | clean: 41 | rm -f $(AUTOGEN_SRC) 42 | rm -f $(ALL_OBJECTS) 43 | rm -f $(BINARY) 44 | rm -f txt2bin.cpp txt2bin bin2txt bin2dot simtest 45 | rm -f cmdline_txt2bin.* 46 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Redistribution and use in source and binary forms, with or without 2 | modification, are permitted provided that the following conditions are met: 3 | 4 | 1. Redistributions of source code must retain the above copyright notice, this 5 | list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, 8 | this list of conditions and the following disclaimer in the documentation 9 | and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors 12 | may be used to endorse or promote products derived from this software without 13 | specific prior written permission. 14 | 15 | 4. Redistributions of any form whatsoever must retain the following 16 | acknowledgment: 'This product includes software developed by SPCL @ ETH Zurich 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /src/LogGOPSim/loggopsim_cmdline.ggo: -------------------------------------------------------------------------------- 1 | package "goalsim" 2 | version "0.1" 3 | 4 | 5 | option "filename" f "Prefix for the filenames which contain the schedules" string 6 | option "save-mem" - "Map the schedules as MAP_SHARED to enable processing of large schedule (larger than main memory). This will invalidate the schedules during simulation." 
flag off 7 | option "LogGOPS_L" L "The latency parameter L in the LogGP model" int default="2500" optional 8 | option "LogGOPS_o" o "The overhead parameter o in the LogGP model" int default="1500" optional 9 | option "LogGOPS_g" g "The gap per message parameter g in the LogGP model" int default="1000" optional 10 | option "LogGOPS_G" G "The gap per byte parameter G in the LogGP model" int default="6" optional 11 | option "LogGOPS_S" S "Datasize at which we change from eager to rendezvous protocol" int default="65535" optional 12 | option "LogGOPS_O" O "The overhead per byte in LogGOP" int default="0" optional 13 | #option "starttimes" t "Name of an output file from a previous run, the end times of the old simulation will be the starttime of this one" string optional 14 | option "vizfile" V "Name of the output file for visualization data" string optional 15 | option "verbose" v "Enable more verbose output" optional 16 | option "progress" - "print progress" optional 17 | option "batchmode" b "enable batchmode (never print detailed host info)" optional 18 | #option "collnoise" - "Enable noise in collective operations" optional 19 | option "noise-trace" - "Read Noise from trace " string optional 20 | option "noise-cosched" - "Co-schedule noise (use same starttime on all processes)" flag off 21 | option "network-type" n "Network type (LogGP=no network congestion; simple=simple linear model)" values="LogGP","simple" default="LogGP" string optional 22 | option "network-file" - "Input file for network (annotated dot format)" string optional 23 | option "qstat" - "Enable PQ and UQ statistics. 
Argument is output filename prefix" default="Unknown" string optional 24 | 25 | -------------------------------------------------------------------------------- /src/Schedgen/goal_comm.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | class KeyRankPair { 4 | int key; 5 | int rank; 6 | 7 | public: 8 | KeyRankPair(int key, int rank) { 9 | this->key = key; 10 | this->rank = rank; 11 | } 12 | bool operator<(const KeyRankPair &rhs) const { 13 | return ((key == rhs.key) && (rank < rhs.rank)) || (key < rhs.key); 14 | } 15 | }; 16 | 17 | class Comm { 18 | 19 | /* The GOAL base API assumes only a single communicator, aka MPI_COMM_WORLD. 20 | * This class provides communicator support */ 21 | 22 | private: 23 | Comm *base_comm; // pointer to root of the comm tree 24 | int id; // unique ID of this communicator, id=0 means this is MPI_COMM_WORLD 25 | int color; // if this comm was created by comm_split, this is his color 26 | std::vector key2rank; // key, world_rank, pos is new rank 27 | std::set children; 28 | int next_free_id; // only used at base comm for now 29 | 30 | Comm *find_comm_rec(int comm_id) { 31 | if (this->id == comm_id) 32 | return this; 33 | for (auto c : this->children) { 34 | Comm *r = c->find_comm_rec(comm_id); 35 | if (r != NULL) 36 | return r; 37 | } 38 | return NULL; 39 | } 40 | 41 | public: 42 | Comm() { 43 | this->base_comm = this; 44 | this->id = 0; 45 | this->next_free_id = 1; 46 | } 47 | 48 | Comm *find_comm(int comm_id) { 49 | auto r = this->base_comm->find_comm_rec(comm_id); 50 | if (r == NULL) 51 | fprintf(stderr, "Did not find comm %i\n", comm_id); 52 | return r; 53 | } 54 | 55 | int getId(void) { return this->id; } 56 | 57 | int nextId() { return this->base_comm->next_free_id++; } 58 | 59 | Comm *find_or_create_child_comm(int color) { 60 | for (auto c : this->children) { 61 | if (c->color == color) 62 | return c; 63 | } 64 | Comm *c = new Comm; 65 | c->base_comm = this->base_comm; 66 | 
c->id = this->base_comm->nextId(); 67 | c->color = color; 68 | return c; 69 | } 70 | 71 | void add_rank_key(int world_rank, int key) { 72 | auto p = KeyRankPair(key, world_rank); 73 | this->key2rank.push_back(p); 74 | std::sort( 75 | this->key2rank.begin(), 76 | this->key2rank 77 | .end()); // we could add a "close_comm" method and sort only once 78 | } 79 | }; -------------------------------------------------------------------------------- /src/LogGOPSim/LogGOPSim.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifndef GRAPH_NODE_PROPERTIES 4 | #define GRAPH_NODE_PROPERTIES 1 5 | 6 | typedef uint64_t btime_t; 7 | 8 | /* this class is CRITICAL -- keep it as SMALL as possible! 9 | * 10 | * current size: 39 bytes 11 | * 12 | */ 13 | class graph_node_properties { 14 | public: 15 | btime_t time; 16 | btime_t starttime; // only used for MSGs to identify start times 17 | #ifdef HOSTSYNC 18 | btime_t syncstart; 19 | #endif 20 | #ifdef STRICT_ORDER 21 | btime_t ts; /* this is a timestamp that determines the (original) insertion order of 22 | elements in the queue, it is increased for every new element, not for 23 | re-insertions! Needed for correctness. 
*/ 24 | #endif 25 | uint64_t size; // number of bytes to send, recv, or time to spend in loclop 26 | uint32_t target; // partner for send/recv 27 | uint32_t host; // owning host 28 | uint32_t offset; // for Parser (to identify schedule element) 29 | uint32_t tag; // tag for send/recv 30 | uint32_t handle; // handle for network layer :-/ 31 | uint8_t proc; // processing element for this operation 32 | uint8_t nic; // network interface for this operation 33 | char type; // see below 34 | }; 35 | 36 | /* this is a comparison functor that can be used to compare and sort 37 | * operation types of graph_node_properties */ 38 | class gnp_op_comp_func { 39 | public: 40 | bool operator()(graph_node_properties x, graph_node_properties y) { 41 | if(x.type < y.type) return true; 42 | return false; 43 | } 44 | }; 45 | 46 | /* this is a comparison functor that can be used to compare and sort 47 | * graph_node_properties by time */ 48 | class aqcompare_func { 49 | public: 50 | bool operator()(graph_node_properties x, graph_node_properties y) { 51 | if(x.time > y.time) return true; 52 | #ifdef STRICT_ORDER 53 | if(x.time == y.time && x.ts > y.ts) return true; 54 | #endif 55 | return false; 56 | } 57 | }; 58 | 59 | 60 | // mnemonic defines for op type 61 | static const int OP_SEND = 1; 62 | static const int OP_RECV = 2; 63 | static const int OP_LOCOP = 3; 64 | static const int OP_MSG = 4; 65 | 66 | static const uint32_t ANY_SOURCE = ~0; 67 | static const uint32_t ANY_TAG = ~0; 68 | 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /cmake/FindUnwind.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find libunwind 2 | # Once done this will define 3 | # 4 | # Unwind_FOUND - system has libunwind 5 | # unwind::unwind - cmake target for libunwind 6 | 7 | include (FindPackageHandleStandardArgs) 8 | 9 | find_path (Unwind_INCLUDE_DIR NAMES unwind.h libunwind.h DOC "unwind include 
directory") 10 | find_library (Unwind_LIBRARY NAMES unwind DOC "unwind library") 11 | 12 | mark_as_advanced (Unwind_INCLUDE_DIR Unwind_LIBRARY) 13 | 14 | # Extract version information 15 | if (Unwind_LIBRARY) 16 | set (_Unwind_VERSION_HEADER ${Unwind_INCLUDE_DIR}/libunwind-common.h) 17 | 18 | if (EXISTS ${_Unwind_VERSION_HEADER}) 19 | file (READ ${_Unwind_VERSION_HEADER} _Unwind_VERSION_CONTENTS) 20 | 21 | string (REGEX REPLACE ".*#define UNW_VERSION_MAJOR[ \t]+([0-9]+).*" "\\1" 22 | Unwind_VERSION_MAJOR "${_Unwind_VERSION_CONTENTS}") 23 | string (REGEX REPLACE ".*#define UNW_VERSION_MINOR[ \t]+([0-9]+).*" "\\1" 24 | Unwind_VERSION_MINOR "${_Unwind_VERSION_CONTENTS}") 25 | string (REGEX REPLACE ".*#define UNW_VERSION_EXTRA[ \t]+([0-9]+).*" "\\1" 26 | Unwind_VERSION_PATCH "${_Unwind_VERSION_CONTENTS}") 27 | 28 | set (Unwind_VERSION ${Unwind_VERSION_MAJOR}.${Unwind_VERSION_MINOR}) 29 | 30 | if (CMAKE_MATCH_0) 31 | # Third version component may be empty 32 | set (Unwind_VERSION ${Unwind_VERSION}.${Unwind_VERSION_PATCH}) 33 | set (Unwind_VERSION_COMPONENTS 3) 34 | else (CMAKE_MATCH_0) 35 | set (Unwind_VERSION_COMPONENTS 2) 36 | endif (CMAKE_MATCH_0) 37 | endif (EXISTS ${_Unwind_VERSION_HEADER}) 38 | endif (Unwind_LIBRARY) 39 | 40 | # handle the QUIETLY and REQUIRED arguments and set Unwind_FOUND to TRUE 41 | # if all listed variables are TRUE 42 | find_package_handle_standard_args (Unwind 43 | REQUIRED_VARS Unwind_INCLUDE_DIR Unwind_LIBRARY 44 | VERSION_VAR Unwind_VERSION 45 | ) 46 | 47 | if (Unwind_FOUND) 48 | if (NOT TARGET unwind::unwind) 49 | add_library (unwind::unwind INTERFACE IMPORTED) 50 | 51 | set_property (TARGET unwind::unwind PROPERTY 52 | INTERFACE_INCLUDE_DIRECTORIES ${Unwind_INCLUDE_DIR} 53 | ) 54 | set_property (TARGET unwind::unwind PROPERTY 55 | INTERFACE_LINK_LIBRARIES ${Unwind_LIBRARY} 56 | ) 57 | set_property (TARGET unwind::unwind PROPERTY 58 | IMPORTED_CONFIGURATIONS RELEASE 59 | ) 60 | endif (NOT TARGET unwind::unwind) 61 | endif (Unwind_FOUND) 
62 | -------------------------------------------------------------------------------- /src/Schedgen/trace_reader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #include "schedgen.hpp" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | //#define HAVE_BOOST_IO 20 | 21 | #ifdef HAVE_BOOST_IO 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #endif 29 | 30 | class TraceReader { 31 | private: 32 | std::ifstream trace; 33 | enum {BZ2, NORM} type; 34 | 35 | #ifdef HAVE_BOOST_IO 36 | boost::iostreams::filtering_streambuf inbz2; 37 | #endif 38 | 39 | public: 40 | TraceReader(std::string fname) { 41 | trace.open(fname.c_str(),std::ios::in); 42 | 43 | //boost::cmatch m; 44 | //static const boost::regex e(".*\\.bz2$"); 45 | //if(regex_match(fname.c_str(), m, e)) type = BZ2; 46 | if(NULL!=strstr(fname.c_str(), ".bz2")) { 47 | #ifdef HAVE_BOOST_IO 48 | type = BZ2; 49 | #else 50 | std::cerr << "bz2 not supported (anymore)\n"; 51 | _exit(10); 52 | #endif 53 | } else type=NORM; 54 | 55 | #ifdef HAVE_BOOST_IO 56 | if(type == BZ2) { 57 | inbz2.push(boost::iostreams::bzip2_decompressor()); 58 | inbz2.push(trace); 59 | } 60 | #endif 61 | } 62 | 63 | bool is_open() { 64 | return trace.is_open(); 65 | } 66 | 67 | std::streampos tellg() { 68 | return trace.tellg(); 69 | } 70 | 71 | void seekg(std::streampos pos) { 72 | trace.seekg(pos); 73 | } 74 | 75 | bool getline(char* s, int n) { 76 | bool eof=0; 77 | if(type == BZ2) { 78 | #ifdef HAVE_BOOST_IO 79 | int pos = 0; 80 | while(1) { 81 | std::string line; 82 | char z = boost::iostreams::get(inbz2); 83 | if(z == '\n') break; 84 | if(z == EOF) { eof=1; break; } 85 | s[pos++] = z; 86 | 
} 87 | s[pos]='\0'; 88 | #endif 89 | } else { 90 | trace.getline(s,n); 91 | eof = trace.eof(); 92 | } 93 | //std::cout << "getline " << s << "\n"; 94 | return eof; 95 | } 96 | }; 97 | -------------------------------------------------------------------------------- /src/Schedgen/schedgen_cmdline.ggo: -------------------------------------------------------------------------------- 1 | package "schedgen" 2 | version "0.1" 3 | 4 | option "ptrn" p "Name of the communication pattern that should be used to generate a schedule" 5 | values="binomialtreereduce","binarytreebcast","binomialtreebcast","nwaydissemination","pipelinedring","pipelinedringdep","doublering","gather","scatter","linbarrier","trace","dissemination","random_bisect","random_bisect_fd_sym","linear_alltoall","linear_alltoallv","allreduce_recdoub","allreduce_ring","resnet","chained_dissem" 6 | default="binomialtreebcast" string optional 7 | option "commsize" s "Number of nodes that should be used in the communication pattern" default="8" int optional 8 | option "timemult" - "Time multiplier, relative to microsecond (e.g., nanoseconds -> 1000)" default="1000" int optional 9 | option "datasize" d "Number of bytes that are transmitted in the communication patttern in a basic step" default="1" int optional 10 | option "filename" o "Filename for the name of the generated schedule" default="schedule.goal" string optional 11 | option "nway" - "Fanout for the n-way dissemination pattern" default="1" int optional 12 | option "root" - "Root node for certain patterns" default="0" int optional 13 | option "segmentsize" - "Segment size for pipelined pattern" default="1" int optional 14 | option "nb" - "nonblocking execution of collectives (provide length of local operation)" default="0" int optional 15 | option "nb-poll" - "polling interval for nonblocking execution" default="0" int optional 16 | option "cpu" - "select CPU to execute computation" default="0" int optional 17 | option "rpl-dep-cmp" - "replaces dependencies 
with fixed computation (parameter represents computation time. If -1, does not replace dependencies.)" default="-1" int optional 18 | option "a2av-skew-ratio" - "In the alltoallv pattern, each chunk of data sent by each rank is of a random size. The size of the chunk with id 'root' is of size 'datasize'. All the other chunks are of random size, and a2av-skew-ratio times smaller." default="1" int optional 19 | option "outcast" - "Generates outcast in the alltoallv pattern." flag off 20 | option "traces" - "Tracefile for rank 0 (others are autodetected)" string optional 21 | option "traces-start" - "file with start lines in trace files (is updated after run if trace-nops is given)" string optional 22 | option "traces-nops" - "number of operations to write to file (0=all)" default="0" int optional 23 | option "traces-extr" - "extrapolation factor for traces" default="1" int optional 24 | option "traces-print" - "print each operation" default="0" int optional 25 | option "traces-nop2p" - "do not consider point-to-point communication" flag off 26 | option "traces-nocolls" - "do not consider collective communication" flag off 27 | option "traces-nbcify" - "turn blocking collectives into non-blocking colls, time to pre-post NBCs" default="0" int optional 28 | 29 | -------------------------------------------------------------------------------- /src/Drawviz/TimelineDrawing.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 
5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "cmdline.h" 19 | 20 | class overh { 21 | public: 22 | int type; // 1 = osend, 2 = orecv 23 | int rank; 24 | int cpu; 25 | uint64_t start; 26 | uint64_t end; 27 | float r; 28 | float g; 29 | float b; 30 | }; 31 | 32 | class trans { 33 | public: 34 | int source; 35 | int dest; 36 | uint64_t starttime; 37 | uint64_t endtime; 38 | int size; 39 | int G; 40 | int r; 41 | int g; 42 | int b; 43 | }; 44 | 45 | class TimelineDrawing { 46 | 47 | private: 48 | gengetopt_args_info args_info; 49 | 50 | PSDoc *psdoc; 51 | int psfont; 52 | int fontsize; 53 | 54 | int numranks; 55 | double ranksep; 56 | int numcpus; 57 | double cpusep; 58 | double timesep; 59 | 60 | int width; 61 | int height; 62 | int leftmargin; 63 | 64 | std::string content; 65 | 66 | std::vector overheads; 67 | std::vector transmissions; 68 | 69 | void calc_arrowhead_coords(int sx, int sy, int dx, int dy, int *x1, int *y1, int *x2, int *y2); 70 | void add_ranknum(int); 71 | public: 72 | 73 | TimelineDrawing(gengetopt_args_info _args_info) : args_info(_args_info) {}; 74 | 75 | void init_graph(int numranks, int numcpus, int width, int height, std::string filename); 76 | void close_graph(); 77 | void draw_everything(int maxtime); 78 | 79 | void draw_ranklines(); 80 | void draw_osend(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b); 81 | void draw_orecv(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b); 82 | void draw_transmission(int source, int dest, uint64_t starttime, uint64_t endtime, int size, int G, float r, float g, float b); 83 | void draw_loclop(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b); 84 | void draw_noise(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b); 85 | void draw_seperator(int rank, int cpu, int 
pos); 86 | 87 | void add_osend(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b); 88 | void add_orecv(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b); 89 | void add_transmission(int source, int dest, uint64_t starttime, uint64_t endtime, int size, int G, float r, float g, float b); 90 | void add_loclop(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b); 91 | void add_noise(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b); 92 | }; 93 | 94 | 95 | -------------------------------------------------------------------------------- /cmake/FindGraphviz.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find Graphviz 2 | # Once done this will define 3 | # 4 | # GRAPHVIZ_FOUND - system has Graphviz 5 | # GRAPHVIZ_INCLUDE_DIRS - Graphviz include directories 6 | # GRAPHVIZ_CGRAPH_LIBRARY - Graphviz CGRAPH library 7 | # GRAPHVIZ_VERSION - Graphviz version 8 | # 9 | # This module reads hints about search locations from the following cmake variables: 10 | # GRAPHVIZ_ROOT - Graphviz installation prefix 11 | # (containing bin/, include/, etc.) 12 | 13 | # Copyright (c) 2009, Adrien Bustany, 14 | # Copyright (c) 2013-2014 Kevin Funk 15 | 16 | # Version computation and some cleanups by Allen Winter 17 | # Copyright (c) 2012-2014 Klarälvdalens Datakonsult AB, a KDAB Group company 18 | 19 | # Simplified script by Dogan Can 20 | # Copyright (c) 2014 University of Southern California 21 | 22 | # Redistribution and use is allowed according to the terms of the GPLv3+ license. 
23 | 24 | 25 | if(GRAPHVIZ_ROOT) 26 | set(_GRAPHVIZ_INCLUDE_DIR ${GRAPHVIZ_ROOT}/include) 27 | set(_GRAPHVIZ_LIBRARY_DIR ${GRAPHVIZ_ROOT}/lib) 28 | endif() 29 | 30 | find_path(GRAPHVIZ_INCLUDE_DIR NAMES graphviz/cgraph.h 31 | HINTS ${_GRAPHVIZ_INCLUDE_DIR}) 32 | find_library(GRAPHVIZ_CGRAPH_LIBRARY NAMES cgraph 33 | HINTS ${_GRAPHVIZ_LIBRARY_DIR}) 34 | 35 | if(GRAPHVIZ_INCLUDE_DIR AND GRAPHVIZ_CGRAPH_LIBRARY) 36 | set(GRAPHVIZ_FOUND TRUE) 37 | else() 38 | set(GRAPHVIZ_FOUND FALSE) 39 | endif() 40 | 41 | # Ok, now compute the version 42 | if(GRAPHVIZ_FOUND) 43 | set(FIND_GRAPHVIZ_VERSION_SOURCE 44 | "#include \n#include \n int main()\n {\n printf(\"%s\",PACKAGE_VERSION);return 1;\n }\n") 45 | set(FIND_GRAPHVIZ_VERSION_SOURCE_FILE ${CMAKE_BINARY_DIR}/CMakeTmp/FindGRAPHVIZ.cxx) 46 | file(WRITE "${FIND_GRAPHVIZ_VERSION_SOURCE_FILE}" "${FIND_GRAPHVIZ_VERSION_SOURCE}") 47 | 48 | set(FIND_GRAPHVIZ_VERSION_ADD_INCLUDES 49 | "-DINCLUDE_DIRECTORIES:STRING=${GRAPHVIZ_INCLUDE_DIR}") 50 | 51 | try_run(RUN_RESULT COMPILE_RESULT 52 | ${CMAKE_BINARY_DIR} 53 | ${FIND_GRAPHVIZ_VERSION_SOURCE_FILE} 54 | CMAKE_FLAGS "${FIND_GRAPHVIZ_VERSION_ADD_INCLUDES}" 55 | RUN_OUTPUT_VARIABLE GRAPHVIZ_VERSION) 56 | 57 | if(COMPILE_RESULT AND RUN_RESULT EQUAL 1) 58 | message(STATUS "Graphviz version: ${GRAPHVIZ_VERSION}") 59 | else() 60 | message(FATAL_ERROR "Unable to compile or run the graphviz version detection program.") 61 | endif() 62 | 63 | set(GRAPHVIZ_INCLUDE_DIRS ${GRAPHVIZ_INCLUDE_DIR} ${GRAPHVIZ_INCLUDE_DIR}/graphviz) 64 | 65 | if(NOT Graphviz_FIND_QUIETLY) 66 | message(STATUS "Graphviz include: ${GRAPHVIZ_INCLUDE_DIRS}") 67 | message(STATUS "Graphviz libraries: ${GRAPHVIZ_CGRAPH_LIBRARY}") 68 | endif() 69 | endif() 70 | 71 | if(Graphviz_FIND_REQUIRED AND NOT GRAPHVIZ_FOUND) 72 | message(FATAL_ERROR "Could not find GraphViz.") 73 | endif() 74 | -------------------------------------------------------------------------------- /src/LogGOPSim/Goal.hpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Parser.hpp" 6 | 7 | typedef Node* goalop_t; 8 | 9 | class Goal { 10 | 11 | private: 12 | Graph graph; 13 | uint32_t rank; 14 | uint32_t num_ranks; 15 | 16 | uint8_t MaxCPU(uint8_t cpu = 0) { 17 | static uint8_t max_cpu = 0; 18 | if (cpu > max_cpu) max_cpu = cpu; 19 | return max_cpu; 20 | } 21 | 22 | uint8_t MaxNIC(uint8_t nic = 0) { 23 | static uint8_t max_nic = 0; 24 | if (nic > max_nic) max_nic = nic; 25 | return max_nic; 26 | } 27 | 28 | public: 29 | 30 | goalop_t Send(uint32_t src, uint32_t dest, uint64_t size, uint32_t tag, uint8_t cpu, uint8_t nic) { 31 | 32 | Node* n = graph.addNode(); 33 | 34 | n->Type = OPTYPE_SEND; 35 | n->Peer = dest; 36 | n->Tag = tag; 37 | n->Proc = cpu; 38 | n->Nic = nic; 39 | n->Size = size; 40 | 41 | MaxCPU(cpu); 42 | MaxNIC(nic); 43 | 44 | return n; 45 | } 46 | 47 | goalop_t Recv(uint32_t src, uint32_t dest, uint64_t size, uint32_t tag, uint8_t cpu, uint8_t nic) { 48 | 49 | Node* n = graph.addNode(); 50 | 51 | n->Type = OPTYPE_RECV; 52 | n->Peer = src; 53 | n->Tag = tag; 54 | n->Proc = cpu; 55 | n->Nic = nic; 56 | n->Size = size; 57 | 58 | MaxCPU(cpu); 59 | MaxNIC(nic); 60 | 61 | return n; 62 | } 63 | 64 | goalop_t Calc(uint32_t src, uint64_t size, uint8_t cpu, uint8_t nic) { 65 | 66 | Node* n = graph.addNode(); 67 | 68 | n->Type = OPTYPE_CALC; 69 | n->Peer = 0; // this optype has not real peer, i just set it so it is clearly defined 70 | n->Tag = 0; // this optype has not real tag, i just set it so it is clearly defined 71 | n->Proc = cpu; 72 | n->Nic = nic; 73 | n->Size = size; 74 | 75 | MaxCPU(cpu); 76 | MaxNIC(nic); 77 | 78 | return n; 79 | } 80 | 81 | void StartDependency(goalop_t src, goalop_t dest) { 82 | // a can not be executed before b is started 83 | graph.addStartDependency(src, dest); 84 | } 85 | 86 | void Dependency(goalop_t src, goalop_t dest) { 87 | //a can not be executed before 
b is finished 88 | graph.addDependency(src, dest); 89 | } 90 | 91 | void SerializeSchedule(char* filename) { 92 | 93 | static int fd; 94 | 95 | // create/open binary schedule if it is the first rank (rank 0) 96 | if (rank==0) { 97 | fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR); 98 | if (fd == -1) { 99 | fprintf(stderr, "Couldn't open %s for schedule serialization!\n", filename); 100 | perror("system error message:"); 101 | exit(EXIT_FAILURE); 102 | } 103 | } 104 | 105 | graph.serialize_mmap(fd, rank, num_ranks, MaxCPU(), MaxNIC()); 106 | 107 | // close the binary schedule if it is the last rank 108 | if (rank == num_ranks-1) { 109 | close(fd); 110 | sync(); 111 | } 112 | 113 | } 114 | 115 | void SetRank(uint32_t r) { 116 | rank=r; 117 | } 118 | 119 | void SetNumRanks(uint32_t nr) { 120 | num_ranks = nr; 121 | } 122 | 123 | }; 124 | -------------------------------------------------------------------------------- /doc/README: -------------------------------------------------------------------------------- 1 | # README file for the LogGOPSim simulator 2 | 3 | Installing 4 | ---------- 5 | 6 | * prerequisites to build: 7 | - C++ compiler (e.g., g++) 8 | - re2c - http://re2c.org/ 9 | - gengetopt - http://www.gnu.org/software/gengetopt/gengetopt.html 10 | - libagraph - http://www.graphviz.org/ 11 | 12 | * build: 13 | - optional: edit Makefile (change CXX and/or CXXFLAGS) 14 | - make 15 | 16 | Running 17 | ------- 18 | 19 | * write or generate a GOAL schedule or use one of the example 20 | schedules (e.g., dissemination_16.goal or binary_tree_16.goal) 21 | 22 | * convert schedule to binary format using txt2bin: 23 | - e.g., txt2bin -i dissemination_16.goal -o dissemination_16.bin 24 | 25 | * execute simulation with default parameters (see LogGOPSim --help for 26 | more options): 27 | - e.g., LogGOPSim -f dissemination_16.bin 28 | 29 | * interpret output: 30 | - for small simulations, each host end time is printed (22000ns for 31 | our example with 
default parameters) 32 | - for larger runs, only the maximum time is printed 33 | 34 | Visualization 35 | ------------- 36 | 37 | * run LogGOPSim with -V option: 38 | - e.g., LogGOPSim -f dissemination_16.bin -V viz.out 39 | 40 | * compile DrawViz (simple "make") 41 | 42 | * run DrawViz (only for smaller simulations): 43 | - e.g., drawviz -i viz.out -o viz.eps 44 | 45 | * view postscript output: 46 | - e.g., gv viz.eps 47 | 48 | MPI Matching Data 49 | ------------------ 50 | 51 | * run LogGOPSim with -qstat option: 52 | - e.g., LogGOPSim -f dissemination_16.bin -stat mpi-matching will produce several 53 | files containing MPI match queue data with names that have the form mpi-matching.*.data 54 | 55 | * additional information on the MPI matching data is available in README-mpi-matching 56 | 57 | Schedgen - automatic GOAL schedule generator 58 | -------------------------------------------- 59 | 60 | * compile SchedGen (simple "make") 61 | 62 | * run schedgen to generate schedules for collective operations: 63 | - e.g., schedgen -p binomialtreebcast -s 32 -o binary_tree_32.goal 64 | (generates a binomial tree broadcast pattern with 32 processes, 65 | the GOAL schedule can be converted to the binary simulator input 66 | with txt2bin) 67 | 68 | * run schedgen to generate schedules for application traces collected 69 | with liballprof-0.9: 70 | - traces need to be collected by linking liballprof as PMPI layer 71 | with an MPI application. Sample traces are included in the 72 | distribution in liballprof-samples 73 | - e.g., schedgen -p trace --traces liballprof-samples/sweep3d-2x2/pmpi-trace-rank-0.txt -o sweep-4.goal 74 | - convert and simulate: 75 | - e.g., txt2bin -i sweep-4.goal -o sweep-4.bin 76 | LogGOPSim -f sweep-4.bin 77 | 78 | Citation 79 | -------- 80 | 81 | Any published work which uses this software should include the following 82 | citation: 83 | ---------------------------------------------------------------------- 84 | T. Hoefler, T. Schneider, A. 
Lumsdaine: LogGOPSim ­ Simulating 85 | Large-Scale Applications in the LogGOPS Model 86 | ---------------------------------------------------------------------- 87 | -------------------------------------------------------------------------------- /src/LogGOPSim/TimelineVisualization.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | class TimelineVisualization { 11 | 12 | private: 13 | std::string content; 14 | bool enable; 15 | std::string filename; 16 | 17 | void add_ranknum(int numranks) { 18 | 19 | std::stringstream os; 20 | os << "numranks " << numranks << ";\n"; 21 | this->content.append(os.str()); 22 | 23 | } 24 | 25 | void write_events(bool append) { 26 | 27 | std::ofstream myfile; 28 | if (append) myfile.open(filename.c_str(), std::ios::out | std::ios::app); 29 | else myfile.open(filename.c_str(), std::ios::out); 30 | if (myfile.is_open()) { 31 | myfile << this->content; 32 | myfile.close(); 33 | } 34 | else { 35 | fprintf(stderr, "Unable to open %s\n", filename.c_str()); 36 | } 37 | 38 | } 39 | 40 | 41 | 42 | public: 43 | 44 | TimelineVisualization(gengetopt_args_info *args_info, int p) { 45 | this->enable = args_info->vizfile_given; 46 | if(!enable) return; 47 | 48 | filename = args_info->vizfile_arg; 49 | add_ranknum(p); 50 | } 51 | 52 | ~TimelineVisualization() { 53 | if(!enable) return; 54 | 55 | write_events(false); 56 | } 57 | 58 | void add_osend(int rank, uint64_t start, uint64_t end, int cpu, float r=0.0, float g=0.0, float b=1.0) { 59 | if(!enable) return; 60 | 61 | std::stringstream outstream; 62 | outstream << "osend " << rank << " " << cpu << " " << start << " " << end << " " << r << " " << g << " " << b << ";\n"; 63 | this->content.append(outstream.str()); 64 | 65 | } 66 | 67 | void add_orecv(int rank, uint64_t start, uint64_t end, int cpu, float r=0.0, float g=0.0, float b=1.0) { 68 | if(!enable) 
return; 69 | 70 | std::stringstream os; 71 | os << "orecv " << rank << " " << cpu << " " << start << " " << end << " " << r << " " << g << " " << b << ";\n"; 72 | this->content.append(os.str()); 73 | 74 | } 75 | 76 | void add_loclop(int rank, uint64_t start, uint64_t end, int cpu, float r=1.0, float g=0.0, float b=0.0) { 77 | if(!enable) return; 78 | 79 | std::stringstream os; 80 | os << "loclop " << rank << " " << cpu << " " << start << " " << end << " " << r << " " << g << " " << b << ";\n"; 81 | this->content.append(os.str()); 82 | 83 | } 84 | 85 | void add_noise(int rank, uint64_t start, uint64_t end, int cpu, float r=0.0, float g=1.0, float b=0.0) { 86 | if(!enable) return; 87 | 88 | std::stringstream os; 89 | os << "noise " << rank << " " << cpu << " " << start << " " << end << " " << r << " " << g << " " << b << ";\n"; 90 | this->content.append(os.str()); 91 | 92 | } 93 | 94 | void add_transmission(int source, int dest, uint64_t starttime, uint64_t endtime, int size, int G, float r=0.0, float g=0.0, float b=1.0) { 95 | if(!enable) return; 96 | 97 | std::stringstream os; 98 | os << "transmission " << source << " " << dest << " " << starttime << " "; 99 | os << endtime << " " << size << " " << G << " " << r << " " << g << " " << b << ";\n"; 100 | this->content.append(os.str()); 101 | } 102 | }; 103 | -------------------------------------------------------------------------------- /src/LogGOPSim/binary_tree_32.goal: -------------------------------------------------------------------------------- 1 | num_ranks 32 2 | 3 | rank 0 { 4 | l1: send 1b to 1 tag 0 5 | l2: send 1b to 2 tag 0 6 | l3: send 1b to 4 tag 0 7 | l4: send 1b to 8 tag 0 8 | l5: send 1b to 16 tag 0 9 | } 10 | 11 | rank 1 { 12 | l1: recv 1b from 0 tag 0 13 | l2: send 1b to 3 tag 0 14 | l2 requires l1 15 | l3: send 1b to 5 tag 0 16 | l3 requires l1 17 | l4: send 1b to 9 tag 0 18 | l4 requires l1 19 | l5: send 1b to 17 tag 0 20 | l5 requires l1 21 | } 22 | 23 | rank 2 { 24 | l1: recv 1b from 0 tag 0 
25 | l2: send 1b to 6 tag 0 26 | l2 requires l1 27 | l3: send 1b to 10 tag 0 28 | l3 requires l1 29 | l4: send 1b to 18 tag 0 30 | l4 requires l1 31 | } 32 | 33 | rank 3 { 34 | l1: recv 1b from 1 tag 0 35 | l2: send 1b to 7 tag 0 36 | l2 requires l1 37 | l3: send 1b to 11 tag 0 38 | l3 requires l1 39 | l4: send 1b to 19 tag 0 40 | l4 requires l1 41 | } 42 | 43 | rank 4 { 44 | l1: recv 1b from 0 tag 0 45 | l2: send 1b to 12 tag 0 46 | l2 requires l1 47 | l3: send 1b to 20 tag 0 48 | l3 requires l1 49 | } 50 | 51 | rank 5 { 52 | l1: recv 1b from 1 tag 0 53 | l2: send 1b to 13 tag 0 54 | l2 requires l1 55 | l3: send 1b to 21 tag 0 56 | l3 requires l1 57 | } 58 | 59 | rank 6 { 60 | l1: recv 1b from 2 tag 0 61 | l2: send 1b to 14 tag 0 62 | l2 requires l1 63 | l3: send 1b to 22 tag 0 64 | l3 requires l1 65 | } 66 | 67 | rank 7 { 68 | l1: recv 1b from 3 tag 0 69 | l2: send 1b to 15 tag 0 70 | l2 requires l1 71 | l3: send 1b to 23 tag 0 72 | l3 requires l1 73 | } 74 | 75 | rank 8 { 76 | l1: recv 1b from 0 tag 0 77 | l2: send 1b to 24 tag 0 78 | l2 requires l1 79 | } 80 | 81 | rank 9 { 82 | l1: recv 1b from 1 tag 0 83 | l2: send 1b to 25 tag 0 84 | l2 requires l1 85 | } 86 | 87 | rank 10 { 88 | l1: recv 1b from 2 tag 0 89 | l2: send 1b to 26 tag 0 90 | l2 requires l1 91 | } 92 | 93 | rank 11 { 94 | l1: recv 1b from 3 tag 0 95 | l2: send 1b to 27 tag 0 96 | l2 requires l1 97 | } 98 | 99 | rank 12 { 100 | l1: recv 1b from 4 tag 0 101 | l2: send 1b to 28 tag 0 102 | l2 requires l1 103 | } 104 | 105 | rank 13 { 106 | l1: recv 1b from 5 tag 0 107 | l2: send 1b to 29 tag 0 108 | l2 requires l1 109 | } 110 | 111 | rank 14 { 112 | l1: recv 1b from 6 tag 0 113 | l2: send 1b to 30 tag 0 114 | l2 requires l1 115 | } 116 | 117 | rank 15 { 118 | l1: recv 1b from 7 tag 0 119 | l2: send 1b to 31 tag 0 120 | l2 requires l1 121 | } 122 | 123 | rank 16 { 124 | l1: recv 1b from 0 tag 0 125 | } 126 | 127 | rank 17 { 128 | l1: recv 1b from 1 tag 0 129 | } 130 | 131 | rank 18 { 132 | l1: recv 
1b from 2 tag 0 133 | } 134 | 135 | rank 19 { 136 | l1: recv 1b from 3 tag 0 137 | } 138 | 139 | rank 20 { 140 | l1: recv 1b from 4 tag 0 141 | } 142 | 143 | rank 21 { 144 | l1: recv 1b from 5 tag 0 145 | } 146 | 147 | rank 22 { 148 | l1: recv 1b from 6 tag 0 149 | } 150 | 151 | rank 23 { 152 | l1: recv 1b from 7 tag 0 153 | } 154 | 155 | rank 24 { 156 | l1: recv 1b from 8 tag 0 157 | } 158 | 159 | rank 25 { 160 | l1: recv 1b from 9 tag 0 161 | } 162 | 163 | rank 26 { 164 | l1: recv 1b from 10 tag 0 165 | } 166 | 167 | rank 27 { 168 | l1: recv 1b from 11 tag 0 169 | } 170 | 171 | rank 28 { 172 | l1: recv 1b from 12 tag 0 173 | } 174 | 175 | rank 29 { 176 | l1: recv 1b from 13 tag 0 177 | } 178 | 179 | rank 30 { 180 | l1: recv 1b from 14 tag 0 181 | } 182 | 183 | rank 31 { 184 | l1: recv 1b from 15 tag 0 185 | } 186 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The LogGOPSim Toolchain 2 | ======================= 3 | 4 | The tools in this repository are centered around LogGOPSim, a network simulator 5 | based on the LogGP model. 6 | 7 | For a full explanation of this model, please see the referenced publications. But in 8 | short this model (as implemented in LogGOPSim) provides the following: 9 | 10 | * Matching semantics similar to MPI, i.e., a send matches a specific receive, thus both sender and receiver can influence matching, and dependencies between recv and send operations can be expressed, thus real-world applications can be simulated (unlike other simulators which rely on predefined traffic patterns). 11 | * Messages take a uniform amount of time between any pair of hosts, regardless of other traffic (there are extensions of LogGOPSim which change that), thus large-scale simulations can be performed relatively fast, compared to packet-based simulators. 
12 | 13 | Parts of the toolchain 14 | ====================== 15 | 16 | * LogGOPSim: The simulator itself. It consumes a GOAL binary file, which specifies the actions (send and receive) of each host in the simulated network and produces a timing report, i.e., the time at which each host finishes its execution (among other data). 17 | * Schedgen: While it is possible to write a GOAL file for LogGOPSim by hand, this is not advised. Instead, the Schedgen tool can be used to create such files. Schedgen can produce GOAL files for single MPI collective operations, but also allows to produce GOAL files which mimic the communication patterns observed in ML training workloads. It can also convert traces of MPI applications into the GOAL format. In case Schedgen does not offer the communication pattern you want to simulate, it can be extended using a C++ or Python API. 18 | * Schedgen2: An experimental re-implementation of Schedgen in Python - while this offers features that Schedgen lacks it misses many things still. 19 | * Txt2bin: The output of Schedgen is produced in a human-readable text format, which makes it easy to debug schedules, however, for large scale simulations the limiting resource is memory/cache, thus we convert the GOAL file into a space-efficient binary format before feeding it into LogGOPSim. The txt2bin tool performs this conversion. When invoking LogGOPSim, the user has the option of allowing "destructive reading" of the binary schedule, i.e., the input file is memory mapped and modified during the execution to further reduce the amount of memory required during large simulations. 20 | * liballprof: A wrapper library around MPI which records all MPI calls, including their non-data arguments, the MPI traces produced can be converted into the GOAL format by Schedgen. 
21 | 22 | 23 | Building the toolchain 24 | ====================== 25 | 26 | On a recent Debian-based distro such as Ubuntu you can install the build dependencies with something like 27 | ``` 28 | sudo apt-get install cmake gengetopt re2c libgraphviz-dev python3 libclang-15-dev llvm-15-dev python3-clang-15 openmpi-bin openmpi-common libopenmpi-dev libunwind-dev 29 | ``` 30 | YMMV, but this is what we use in our CI pipeline. 31 | 32 | 33 | This project uses cmake as its build tool: 34 | ``` 35 | git clone [This repo] 36 | cd LogGOPSim 37 | mkdir build 38 | cd build 39 | cmake ../src/CMakeLists.txt 40 | make 41 | ``` 42 | 43 | Simple usage example 44 | ==================== 45 | 46 | ``` 47 | # we assume we are in the build folder, i.e., completed the steps above 48 | ./schedgen --commsize 20 --datasize 1024 --ptrn binomialtreereduce -o example.goal # generate a GOAL text file for a simple pattern (a reduction using a binomial tree, for 20 hosts, each host contributing 1024 bytes) 49 | ./txt2bin -i example.goal -o example.bin # convert the GOAL text file into the binary format required by LogGOPSim 50 | ./LogGOPSim -f example.bin # run LogGOPSim with default parameters (see output below, try running with --help to see how to change them) 51 | LogGP network backend; size: 8 (1 CPUs, 1 NICs); L=2500, o=1500 g=1000, G=6, O=0, P=8, S=65535 52 | PERFORMANCE: Processes: 8 Events: 21 Time: 0 s Speed: inf ev/s 53 | Times: 54 | Host 0: 34914 55 | Host 1: 24776 56 | Host 2: 13138 57 | Host 3: 13138 58 | Host 4: 1500 59 | Host 5: 1500 60 | Host 6: 1500 61 | Host 7: 1500 62 | ``` 63 | -------------------------------------------------------------------------------- /src/Schedgen/schedgen.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 
5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #ifndef SCHEDGEN_HPP 12 | #define SCHEDGEN_HPP 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | //#include 20 | #include "buffer_element.hpp" 21 | #include "schedgen_cmdline.h" 22 | 23 | class Goal; 24 | 25 | class LocOp { 26 | private: 27 | double time_mult; // multiplier - relative to microseconds 28 | public: 29 | int cpu; // cpu to execute on 30 | enum t_type { 31 | IREQU, 32 | REQU 33 | }; // type of preceding op (I{send,recv} or {send,recv}) 34 | std::vector> prev, 35 | next; // preceding and next operations - pairs of id and type 36 | Goal *goal; // goal object 37 | double start; // start time for this local operation 38 | 39 | LocOp(Goal *_goal, double _time_mult, int cpu) 40 | : time_mult(_time_mult), cpu(cpu), goal(_goal), start(0) {} 41 | void NextOp(double time, double tend); 42 | }; 43 | 44 | #include "goal_comm.h" 45 | 46 | class Goal { 47 | 48 | public: 49 | Comm *comm; 50 | typedef int t_id; // identifier type 51 | static const t_id NO_ID; // invalid identifier 52 | 53 | typedef std::vector> 54 | locop; /* used to identify local operations for dependencies, it's a 55 | vector of pairs of < id , irequ | requ > */ 56 | 57 | Goal(gengetopt_args_info *args_info, int nranks); 58 | ~Goal(); 59 | 60 | void StartOp() { // this starts an operatio 61 | start.clear(); 62 | end.clear(); 63 | } 64 | 65 | int BuildComm_split(int base_comm, int rank_in_world_comm, int color, 66 | int key) { 67 | Comm *c = this->comm->find_comm(base_comm); 68 | Comm *nc = c->find_or_create_child_comm(color); 69 | nc->add_rank_key(rank_in_world_comm, key); 70 | return nc->getId(); 71 | } 72 | 73 | std::pair EndOp() { 74 | locop rstart, rend; 75 | std::set::iterator it; 76 | for (it = start.begin(); it != start.end(); it++) { 77 | rstart.push_back(std::make_pair(*it, LocOp::REQU)); 78 | } 79 | for (it = end.begin(); it != end.end(); it++) { 80 | 
rend.push_back(std::make_pair(*it, LocOp::REQU)); 81 | } 82 | return std::make_pair(rstart, rend); 83 | } 84 | 85 | void SetTag(uint64_t tag) { curtag = tag; } 86 | void StartRank(int rank); 87 | void Comment(std::string c); 88 | int Send(std::vector buf, int dest); 89 | int Send(int size, int dest); 90 | int Recv(std::vector, int src); 91 | int Recv(int size, int src); 92 | int Exec(std::string opname, btime_t size, int proc); 93 | int Exec(std::string opname, std::vector buf); 94 | int Exec(std::string opname, btime_t size); 95 | void Requires(int tail, int head); 96 | void Irequires(int tail, int head); 97 | void EndRank(); 98 | void Write(); 99 | void AppendString(std::string); 100 | 101 | private: 102 | std::set start, 103 | end; /* the operations which are independent at start and end */ 104 | std::string schedule; 105 | std::string filename; 106 | std::fstream myfile; 107 | 108 | /* nonblocking stuff */ 109 | bool nb; 110 | int poll_int; 111 | int nbfunc; 112 | int cpu; 113 | std::vector ranks_init; 114 | 115 | t_id id_counter; 116 | int dummynode; 117 | int sends, recvs, execs, ranks, reqs; 118 | uint64_t curtag; 119 | 120 | void read_schedule_from_file(); 121 | }; 122 | 123 | template std::vector make_vector(T x) { 124 | std::vector y; 125 | y.push_back(x); 126 | return y; 127 | }; 128 | 129 | // prototype 130 | void process_trace(gengetopt_args_info *args_info); 131 | void create_binomial_tree_bcast_rank(Goal *goal, int root, int comm_rank, 132 | int comm_size, int datasize); 133 | void create_binomial_tree_reduce_rank(Goal *goal, int root, int comm_rank, 134 | int comm_size, int datasize); 135 | void create_dissemination_rank(Goal *goal, int comm_rank, int comm_size, 136 | int datasize); 137 | void create_linear_alltoall_rank(Goal *goal, int src_rank, int comm_size, 138 | int datasize); 139 | 140 | #endif 141 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: 
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.12)
project(LogGOPSim LANGUAGES C CXX Fortran)

set(CMAKE_BUILD_TYPE RelWithDebInfo)

# If several versions of a package provide CMake packages, try to use the latest one.
set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DEC)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/../cmake" ${CMAKE_MODULE_PATH})

include(${CMAKE_MODULE_PATH}/gengetopt.cmake)
include(${CMAKE_MODULE_PATH}/re2c.cmake)
find_gengetopt()
find_re2c()

# build loggopsim
find_package(Graphviz REQUIRED)
include_directories(${GRAPHVIZ_INCLUDE_DIRS})
add_gengetopt_files(LogGOPSim/loggopsim_cmdline)
add_executable(LogGOPSim LogGOPSim/loggopsim_cmdline.c LogGOPSim/LogGOPSim.cpp)
target_link_libraries(LogGOPSim ${GRAPHVIZ_CGRAPH_LIBRARY})

# build txt2bin
add_gengetopt_files(LogGOPSim/txt2bin_cmdline)
add_re2c_files(LogGOPSim/txt2bin)
add_executable(txt2bin LogGOPSim/txt2bin.cpp LogGOPSim/txt2bin_cmdline.c)

# build Schedgen1 (Schedgen2 is pure python)
add_gengetopt_files(Schedgen/schedgen_cmdline)
add_executable(schedgen Schedgen/buffer_element.cpp Schedgen/schedgen.cpp Schedgen/process_trace.cpp Schedgen/schedgen_cmdline.c)

# build liballprof1
find_package(MPI)
include(FortranCInterface)
FortranCInterface_VERIFY()
FortranCInterface_HEADER(fc_mangle.h)
include_directories(SYSTEM ${MPI_INCLUDE_PATH})
add_custom_command(OUTPUT lap1_mpi_c_wrapper.c COMMAND ${PROJECT_SOURCE_DIR}/liballprof/gencode.py c > lap1_mpi_c_wrapper.c DEPENDS ${PROJECT_SOURCE_DIR}/liballprof/mpi_header.h)
add_custom_command(OUTPUT lap1_mpi_f_wrapper.c COMMAND ${PROJECT_SOURCE_DIR}/liballprof/gencode.py f77 > lap1_mpi_f_wrapper.c DEPENDS ${PROJECT_SOURCE_DIR}/liballprof/mpi_header.h)
#include_directories(liballprof)
#add_library(mpipclog1 lap1_mpi_c_wrapper.c liballprof/sync.c)
#add_library(mpipflog1 lap1_mpi_f_wrapper.c liballprof/sync.c)


# build liballprof2
find_package(Clang)
find_package(Unwind)
add_custom_target(mpi_header_avail DEPENDS ${PROJECT_SOURCE_DIR}/liballprof2/mpi.h)
add_custom_target(mpi_semantics_avail DEPENDS mpi_sem.yml)
add_custom_target(mpi_wrapper_generated DEPENDS mpi_c_wrapper.c mpi_f_wrapper.c )
add_custom_command(OUTPUT mpi_c_wrapper.c mpi_f_wrapper.c COMMAND ${PROJECT_SOURCE_DIR}/liballprof2/gencode.py -s mpi_sem.yml DEPENDS mpi_semantics_avail)
add_custom_command(OUTPUT mpi_sem.yml COMMAND ${PROJECT_SOURCE_DIR}/liballprof2/gensem.py -l ${CLANG_INSTALL_PREFIX}/lib -m ${PROJECT_SOURCE_DIR}/liballprof2/mpi.h DEPENDS mpi_header_avail)
add_library(mpipclog SHARED mpi_c_wrapper.c)
add_library(mpipflog SHARED mpi_f_wrapper.c)
add_dependencies(mpipclog mpi_wrapper_generated)
add_dependencies(mpipflog mpi_wrapper_generated)
target_link_libraries(mpipclog ${MPI_C_LIBRARIES})
target_link_libraries(mpipflog ${MPI_Fortran_LIBRARIES})
if (Unwind_FOUND)
  target_link_libraries(mpipclog unwind::unwind)
  target_link_libraries(mpipflog unwind::unwind)
endif()

include(CTest)
# test the c++ toolchain for some pattern
add_test(NAME schedgen_binomialtreebcast COMMAND schedgen -o schedule.goal --commsize 8 --ptrn binomialtreebcast)
add_test(NAME txt2bin_binomialtreebcast COMMAND txt2bin -i schedule.goal -o schedule.bin)
add_test(NAME loggopsim_binomialtreebcast COMMAND LogGOPSim -f schedule.bin)
# FIX(review): DEPENDS is not a valid add_test() keyword; everything after
# COMMAND was being passed as extra command-line arguments to the test
# executable. Test ordering must be declared via the DEPENDS test property.
set_tests_properties(txt2bin_binomialtreebcast PROPERTIES DEPENDS schedgen_binomialtreebcast)
set_tests_properties(loggopsim_binomialtreebcast PROPERTIES DEPENDS txt2bin_binomialtreebcast)

# test the python toolchain for some pattern
# NOTE(review): ${PYTHON_EXECUTABLE} is used but no find_package(Python*)
# defines it in this file - confirm it is provided by the environment.
add_test(NAME schedgen2_bcast COMMAND ${PYTHON_EXECUTABLE} schedgen.py bcast --output schedule.goal WORKING_DIRECTORY
${PROJECT_SOURCE_DIR}/Schedgen2) 74 | add_test(NAME txt2bin_schedgen2_bcast COMMAND txt2bin -i ${PROJECT_SOURCE_DIR}/Schedgen2/schedule.goal -o schedule.bin) 75 | add_test(NAME loggopsim_schedgen2_bcast COMMAND LogGOPSim -f schedule.bin) 76 | 77 | # test liballprof 78 | 79 | # test liballprof2 80 | #c wrapper test 81 | add_executable(test_lap2_c ${PROJECT_SOURCE_DIR}/../tests/mpi_helloworld.c) 82 | target_link_libraries(test_lap2_c mpipclog) 83 | add_test(NAME trace_lap2_c COMMAND ${MPIEXEC_EXECUTABLE} --host localhost:4 ${MPIEXEC_NUMPROC_FLAG} 4 $) 84 | add_test(NAME lap2_c_trace_exists COMMAND ${CMAKE_COMMAND} -E cat lap2-trace-rank-1-of-4.txt) 85 | 86 | # fortran wrapper test 87 | add_executable(test_lap2_f ${PROJECT_SOURCE_DIR}/../tests/mpi_helloworld.f90) 88 | target_link_libraries(test_lap2_f mpipflog) 89 | add_test(NAME trace_lap2_f COMMAND ${MPIEXEC_EXECUTABLE} --host localhost:4 ${MPIEXEC_NUMPROC_FLAG} 4 $) 90 | add_test(NAME lap2_f_trace_exists COMMAND ${CMAKE_COMMAND} -E cat lap2-trace-rank-1-of-4.txt ) 91 | 92 | -------------------------------------------------------------------------------- /src/liballprof2/tracer_main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define UNW_LOCAL_ONLY //we do not need to unwind frames in another process 7 | #include 8 | 9 | #define LAP2_TRANSFER_BUFFER_SIZE 1024 10 | #define LAP2_BACKTRACE_BUF_SIZE 4096 11 | #define WRITE_TRACE(fmt, args...) 
fprintf(lap_fptr, fmt, args) 12 | 13 | FILE* lap_fptr = NULL; 14 | char* lap_backtrace_buf = NULL; 15 | int lap_initialized = 0; 16 | int lap_mpi_initialized = 0; 17 | 18 | int lap_tracing_enabled = 1; 19 | int lap_backtrace_enabled = 1; 20 | int lap_elem_tracing_enabled = 1; 21 | 22 | 23 | static void init_back_trace(void) { 24 | 25 | } 26 | 27 | static void lap_get_full_backtrace(char* buf, size_t len) { 28 | size_t written = 0; 29 | unw_cursor_t cursor; 30 | unw_context_t context; 31 | 32 | // Initialize cursor to current frame for local unwinding. 33 | unw_getcontext(&context); 34 | unw_init_local(&cursor, &context); 35 | 36 | // Unwind frames one by one, going up the frame stack. 37 | while (unw_step(&cursor) > 0) { 38 | unw_word_t offset, pc; 39 | unw_get_reg(&cursor, UNW_REG_IP, &pc); 40 | if (pc == 0) { 41 | break; 42 | } 43 | written += snprintf(&buf[written], len-written, "0x%lx:", pc); 44 | 45 | char sym[256]; 46 | if (unw_get_proc_name(&cursor, sym, sizeof(sym), &offset) == 0) { 47 | written += snprintf(&buf[written], len-written, " (%s+0x%lx) <- ", sym, offset); 48 | } else { 49 | written += snprintf(&buf[written], len-written, "NO_SYMBOL "); 50 | } 51 | } 52 | if (written>0) written -= 4; 53 | buf[written] = '\0'; 54 | } 55 | 56 | static void lap_check(void) { 57 | if (lap_mpi_initialized == 0) PMPI_Initialized(&lap_mpi_initialized); 58 | if (lap_initialized) return; 59 | lap_fptr = tmpfile(); //write to a tmpfile, we don't know our rank yet, until MPI is initialized 60 | lap_backtrace_buf = malloc(LAP2_BACKTRACE_BUF_SIZE); 61 | assert(lap_backtrace_buf); 62 | assert(lap_fptr); 63 | init_back_trace(); 64 | lap_initialized = 1; 65 | } 66 | 67 | 68 | static void lap_collect_traces(void) { 69 | int comm_rank, comm_size; 70 | PMPI_Comm_rank(MPI_COMM_WORLD, &comm_rank); 71 | PMPI_Comm_size(MPI_COMM_WORLD, &comm_size); 72 | int trace_size = ftell(lap_fptr); 73 | fseek(lap_fptr, 0, SEEK_SET); 74 | int* trace_sizes = malloc(comm_size * sizeof(int)); 75 | if 
(trace_sizes == NULL) { 76 | fprintf(stderr, "lap2 ran out of memory when collecting traces :(\n"); 77 | return; 78 | } 79 | void* chunkbuf = malloc(LAP2_TRANSFER_BUFFER_SIZE); 80 | if (chunkbuf == NULL) { 81 | fprintf(stderr, "lap2 ran out of memory when collecting traces, decrease LAP2_TRANSFER_BUFFER_SIZE=%i :(\n", LAP2_TRANSFER_BUFFER_SIZE); 82 | return; 83 | } 84 | PMPI_Gather(&trace_size, 1, MPI_INT, trace_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD); 85 | if (comm_rank == 0) { 86 | for (int r=0; r"), LogGOPSim will produce the following set of files: 34 | 35 | * -rq-max.data : this file contains one line per rank, each line contains the maximum 36 | number of elements observed in the RQ 37 | 38 | * -rq-hit.data : this file contains one line per rank, each line contains list of 39 | space-separated pairs. Each pair has the form: 40 | 41 | , 42 | 43 | Each successful search of the RQ results in the creation of a new pair 44 | (i.e., the number of pairs for a given rank corresponds to the number of 45 | successful searches, i.e., hits, of the RQ). The value of 46 | represents how many elements were searched before a match was found. The 47 | value of represents the point in simulated time (in 48 | nanoseconds since the start of the simulation) at which the search 49 | occurred. 50 | 51 | * -rq-miss.data : this file contains one line per rank, each line contains list of 52 | space-separated pairs. Each pair has the form: 53 | 54 | , 55 | 56 | Each unsuccessful search of the RQ results in the creation of a new pair 57 | (i.e., the number of pairs for a given rank corresponds to the number of 58 | unsuccessful searches, i.e., misses, of the RQ). The value of 59 | represents the size of the RQ when the search failed, i.e., 60 | how many elements were searched trying to find a match. The value of 61 | represents the point in simulated time (in nanoseconds 62 | since the start of the simulation) at which the search occurred. 
63 | 64 | * -uq-max.data : this file contains one line per rank, each line contains the maximum 65 | number of elements observed in the UQ 66 | 67 | * -uq-hit.data : this file contains one line per rank, each line contains list of 68 | space-separated pairs. Each pair has the form: 69 | 70 | , 71 | 72 | Each successful search of the UQ results in the creation of a new pair 73 | (i.e., the number of pairs for a given rank corresponds to the number of 74 | successful searches, i.e., hits, of the UQ). The value of 75 | represents how many elements were searched before a match was found. The 76 | value of represents the point in simulated time (in 77 | nanoseconds since the start of the simulation) at which the search occurred. 78 | 79 | * -uq-miss.data : this file contains one line per rank, each line contains list of 80 | space-separated pairs. Each pair has the form: 81 | 82 | , 83 | 84 | Each unsuccessful search of the UQ results in the creation of a new pair 85 | (i.e., the number of pairs for a given rank corresponds to the number of 86 | unsuccessful searches, i.e., misses, of the UQ). The value of 87 | represents the size of the RQ when the search failed, i.e., 88 | how many elements were searched trying to find a match. The value of 89 | represents the point in simulated time (in nanoseconds 90 | since the start of the simulation) at which the search occurred. 91 | 92 | Example 93 | ------- 94 | For an example of the data that can be collected using this option, see: 95 | 96 | Ferreira, Levy, Pedretti and Grant. "Characterizing MPI matching via trace-based simulation", 97 | Parallel Computing, volume 77, pages 57-83 (2018). 
98 | 99 | Questions 100 | --------- 101 | Questions regarding this feature may be directed to: 102 | 103 | Scott Levy (sllevy@sandia.gov) 104 | Kurt Ferreira (kbferre@sandia.gov) 105 | -------------------------------------------------------------------------------- /src/LogGOPSim/Noise.hpp: -------------------------------------------------------------------------------- 1 | #include "loggopsim_cmdline.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class Noise { 10 | private: 11 | int p; 12 | std::vector > trcnoise; // read NG noise 13 | uint64_t trctime; // how long is the trace 14 | 15 | std::vector ranktime; // time in trcnoise for each rank 16 | std::vector injected_noise; // counts total injected noise per node 17 | static const int max_report=64; // maximum number of nodes to report for 18 | 19 | public: 20 | 21 | Noise(gengetopt_args_info *args_info, int p) : p(p) { 22 | 23 | if(args_info->noise_trace_given) { 24 | const int size=1024; 25 | char buffer[size]; 26 | std::ifstream trace; 27 | trace.open(args_info->noise_trace_arg); 28 | if(!trace.is_open()) { 29 | std::cerr << "couldn't read noise trace file: " << args_info->noise_trace_arg << " - exiting\n"; 30 | throw(10); 31 | } 32 | 33 | bool eof=false; 34 | int line=0; 35 | while(!eof) { 36 | line++; 37 | trace.getline(buffer,size); 38 | 39 | if(buffer[0] == '#') continue; 40 | 41 | double offset, duration; 42 | // format: line ::= \t - all times in nanoseconds 43 | sscanf(buffer, "%lf\t%lf", &offset, &duration); 44 | 45 | //std::cout << offset << " " << duration << "\n"; 46 | trcnoise.push_back(std::make_pair((uint64_t)round(offset), (uint64_t)round(duration))); 47 | 48 | eof = trace.eof(); 49 | } 50 | 51 | if(((trcnoise.end()-1)->first-trcnoise.begin()->first) > (double)std::numeric_limits::max()) { 52 | std::cerr << " the length of the noise-trace ("<<(trcnoise.end()-1)->first-trcnoise.begin()->first<<" ns) is can not be saved in 'uint64_t' (max: 
"<<(double)std::numeric_limits::max()<<") - exiting\n"; 53 | throw(11); 54 | } 55 | 56 | //trctime = ((trcnoise.end()-1)->first-trcnoise.begin()->first); 57 | trctime = (trcnoise.end()-1)->first; 58 | std::cout << "Noisegen: read " << trcnoise.size() << " noise events spanning " << trctime/1e9 << "s "; 59 | if(args_info->noise_cosched_given) 60 | std::cout << "(coscheduling)\n"; 61 | else 62 | std::cout << "(independent)\n"; 63 | 64 | std::mt19937 mtrand(time(0)); 65 | double cosched_starttime = ((double)mtrand()/mtrand.max())*((double)trctime); 66 | for(int i=0; inoise_cosched_given) { 68 | ranktime.push_back((uint64_t)cosched_starttime); 69 | } else { 70 | double starttime = ((double)mtrand()/mtrand.max())*((double)trctime); 71 | ranktime.push_back((uint64_t)starttime); 72 | //printf("%i %llu %llu\n", i, (uint64_t)starttime, trctime); 73 | } 74 | if (p<=max_report) injected_noise.push_back(0); 75 | } 76 | } 77 | } 78 | 79 | ~Noise() { 80 | // if we have trace data 81 | if(trcnoise.size()) { 82 | // only print noise for small runs 83 | if (p<=max_report) { 84 | std::cout << "noise per rank: "; 85 | for(int i=0; i trcnoise[0].first) { 109 | // do binary search for pos where trcnoise[pos].first is the 110 | // biggest element that is smaller than trcstart 111 | unsigned int min=0, max=trcnoise.size()-1; 112 | do { 113 | pos=(min+max) / 2; 114 | if(trcstart > trcnoise[pos].first) { 115 | min = pos+1; 116 | } else { 117 | max = pos-1; 118 | } 119 | } while((trcstart != trcnoise[pos].first) && (min < max)); 120 | 121 | // the binary search doesn't necessarily find the right interval, 122 | // however, it brings us close 123 | while( !( // we loop until we have: 124 | (trcnoise[pos].first <= trcstart) && // pos is smaller or equal than trcstart 125 | (trcnoise[pos+1].first > trcstart) // pos+1 is larger than trcstart 126 | ) ) { 127 | if(trcnoise[pos].first > trcstart) pos--; 128 | else pos++; 129 | }; 130 | // compute the endtime of the last event 131 | endlastevent = 
trcnoise[pos].first+trcnoise[pos].second; 132 | } 133 | 134 | // if last event reached into starttime - then it influenced me :) 135 | if(endlastevent>trcstart) { 136 | noise += endlastevent-trcstart; 137 | } 138 | 139 | //if(noise > 100000) std::cout << trcnoise[pos].first << " " << trcstart << " " << trcnoise[pos+1].first << "\n"; 140 | 141 | btime_t end = trcstart+oplength; 142 | 143 | // if we're at the end of samples - wrap around 144 | if(pos == trcnoise.size()-1) { 145 | end -= trctime; // adjust end time 146 | pos = 0; // set position to first 147 | } 148 | 149 | // if we reach into next sample - then add the whole time of sample 150 | while(end > trcnoise[pos+1].first) { 151 | pos++; 152 | noise += trcnoise[pos].second; 153 | /*if(noise > 100000) { 154 | std::cout << "inner " << trcnoise[pos].first << " pos: " << pos << " start-end: " << trcstart << "-" << end << " end: " << end << " noise: " << noise << " trctime: " << trctime << "\n"; 155 | return 0;}*/ 156 | 157 | // if we're at the end of samples - wrap around 158 | if(pos == trcnoise.size()-1) { 159 | end -= trctime; // adjust end time 160 | pos = 0; // set position to first 161 | } 162 | } 163 | 164 | // do *NOT* update ranktime because starttime is absolut 165 | 166 | //if (noise > 100000) std::cout << "injected " << noise << " ns noise in " << endtime-starttime << "ns\n"; 167 | if (p<=max_report) injected_noise[r] += noise; 168 | } 169 | 170 | assert(noise >= 0); 171 | return noise; 172 | } 173 | }; 174 | -------------------------------------------------------------------------------- /src/Drawviz/Drawviz.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 
5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "TimelineDrawing.hpp" 19 | #include "cmdline.h" 20 | 21 | 22 | int main(int argc, char **argv) { 23 | 24 | gengetopt_args_info args_info; 25 | 26 | if (cmdline_parser(argc, argv, &args_info) != 0) { 27 | fprintf(stderr, "Couldn't parse command line arguments!\n"); 28 | exit(EXIT_FAILURE); 29 | } 30 | 31 | std::string line; 32 | std::ifstream myfile(args_info.inputfile_arg); 33 | 34 | int rank_num = 0; 35 | int maxtime = 0; 36 | int maxcpu = 0; 37 | bool interval = false; 38 | 39 | if (args_info.endtime_arg > 0) { 40 | interval = true; 41 | } 42 | 43 | if (myfile.is_open()) { 44 | 45 | TimelineDrawing TLViz(args_info); 46 | 47 | while (!myfile.eof()) { 48 | 49 | boost::cmatch matches; 50 | 51 | getline (myfile,line); 52 | 53 | boost::regex ranknum("numranks (\\d+);"); 54 | boost::regex osend("osend (\\d+) (\\d+) (\\d+) (\\d+) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?);"); 55 | boost::regex orecv("orecv (\\d+) (\\d+) (\\d+) (\\d+) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?);"); 56 | boost::regex loclop("loclop (\\d+) (\\d+) (\\d+) (\\d+) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?);"); 57 | boost::regex noise("noise (\\d+) (\\d+) (\\d+) (\\d+) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?);"); 58 | boost::regex transmission("transmission (\\d+) (\\d+) (\\d+) (\\d+) (\\d+) (\\d+) (\\d+(?:.\\d+)?) (\\d+(?:.\\d+)?) 
(\\d+(?:.\\d+)?);"); 59 | boost::regex whitespace("\\w*"); 60 | 61 | if (boost::regex_match(line.c_str(), matches, osend)) { 62 | 63 | std::string ranks = matches[1]; 64 | std::string cpus = matches[2]; 65 | std::string starts = matches[3]; 66 | std::string ends = matches[4]; 67 | std::string reds = matches[5]; 68 | std::string greens = matches[6]; 69 | std::string blues = matches[7]; 70 | 71 | if ((interval==false) or ((atoi(starts.c_str()) >= args_info.starttime_arg) && (atoi(ends.c_str()) < args_info.endtime_arg))) { 72 | TLViz.add_osend(atoi(ranks.c_str()), atoi(starts.c_str())-args_info.starttime_arg, atoi(ends.c_str())-args_info.starttime_arg, atoi(cpus.c_str()), 73 | atof(reds.c_str()), atof(greens.c_str()), atof(blues.c_str()) ); 74 | if (maxtime < atoi(ends.c_str())) maxtime = atoi(ends.c_str()); 75 | if (maxcpu < atoi(cpus.c_str())) maxcpu = atoi(cpus.c_str()); 76 | } 77 | } 78 | 79 | else if (boost::regex_match(line.c_str(), matches, orecv)) { 80 | 81 | std::string ranks = matches[1]; 82 | std::string cpus = matches[2]; 83 | std::string starts = matches[3]; 84 | std::string ends = matches[4]; 85 | std::string reds = matches[5]; 86 | std::string greens = matches[6]; 87 | std::string blues = matches[7]; 88 | 89 | if ((interval==false) or ((atoi(starts.c_str()) >= args_info.starttime_arg) && (atoi(ends.c_str()) < args_info.endtime_arg))) { 90 | TLViz.add_orecv(atoi(ranks.c_str()), atoi(starts.c_str())-args_info.starttime_arg, atoi(ends.c_str())-args_info.starttime_arg, atoi(cpus.c_str()), 91 | atof(reds.c_str()), atof(greens.c_str()), atof(blues.c_str()) ); 92 | if (maxtime < atoi(ends.c_str())) maxtime = atoi(ends.c_str()); 93 | if (maxcpu < atoi(cpus.c_str())) maxcpu = atoi(cpus.c_str()); 94 | } 95 | } 96 | 97 | else if (boost::regex_match(line.c_str(), matches, loclop)) { 98 | 99 | std::string ranks = matches[1]; 100 | std::string cpus = matches[2]; 101 | std::string starts = matches[3]; 102 | std::string ends = matches[4]; 103 | std::string reds = 
matches[5]; 104 | std::string greens = matches[6]; 105 | std::string blues = matches[7]; 106 | 107 | if ((interval==false) or ((atoi(starts.c_str()) >= args_info.starttime_arg) && (atoi(ends.c_str()) < args_info.endtime_arg))) { 108 | TLViz.add_loclop(atoi(ranks.c_str()), atoi(starts.c_str())-args_info.starttime_arg , atoi(ends.c_str())-args_info.starttime_arg, atoi(cpus.c_str()), 109 | atof(reds.c_str()), atof(greens.c_str()), atof(blues.c_str()) ); 110 | if (maxtime < atoi(ends.c_str())) maxtime = atoi(ends.c_str()); 111 | if (maxcpu < atoi(cpus.c_str())) maxcpu = atoi(cpus.c_str()); 112 | } 113 | } 114 | 115 | else if (boost::regex_match(line.c_str(), matches, noise)) { 116 | 117 | std::string ranks = matches[1]; 118 | std::string cpus = matches[2]; 119 | std::string starts = matches[3]; 120 | std::string ends = matches[4]; 121 | std::string reds = matches[5]; 122 | std::string greens = matches[6]; 123 | std::string blues = matches[7]; 124 | 125 | if ((interval==false) or ((atoi(starts.c_str()) >= args_info.starttime_arg) && (atoi(ends.c_str()) < args_info.endtime_arg))) { 126 | TLViz.add_noise(atoi(ranks.c_str()), atoi(starts.c_str())-args_info.starttime_arg , atoi(ends.c_str())-args_info.starttime_arg, atoi(cpus.c_str()), 127 | atof(reds.c_str()), atof(greens.c_str()), atof(blues.c_str()) ); 128 | if (maxtime < atoi(ends.c_str())) maxtime = atoi(ends.c_str()); 129 | if (maxcpu < atoi(cpus.c_str())) maxcpu = atoi(cpus.c_str()); 130 | } 131 | } 132 | 133 | else if (boost::regex_match(line.c_str(), matches, transmission)) { 134 | 135 | std::string src = matches[1]; 136 | std::string dest = matches[2]; 137 | std::string start = matches[3]; 138 | std::string end = matches[4]; 139 | std::string size = matches[5]; 140 | std::string G = matches[6]; 141 | std::string reds = matches[7]; 142 | std::string greens = matches[8]; 143 | std::string blues = matches[9]; 144 | 145 | if ((interval==false) or ((atoi(start.c_str()) >= args_info.starttime_arg) && (atoi(end.c_str()) 
< args_info.endtime_arg))) { 146 | TLViz.add_transmission(atoi(src.c_str()), atoi(dest.c_str()), atoi(start.c_str()) - args_info.starttime_arg, 147 | atoi(end.c_str()) - args_info.starttime_arg, atoi(size.c_str()), atoi(G.c_str()), 148 | atof(reds.c_str()), atof(greens.c_str()), atof(blues.c_str()) ); 149 | 150 | int endtime = atoi(end.c_str())+atoi(G.c_str())*atoi(size.c_str()); 151 | if (maxtime < endtime ) maxtime = endtime; 152 | } 153 | } 154 | else if (boost::regex_match(line.c_str(), matches, ranknum)) { 155 | std::string ranknum = matches[1]; 156 | if (atoi(ranknum.c_str()) > rank_num) rank_num = atoi(ranknum.c_str()); 157 | } 158 | else if (boost::regex_match(line.c_str(), matches, whitespace)) { 159 | } 160 | else { 161 | std::cout << "Unamtched line: [" << line << "]" << std::endl; 162 | } 163 | 164 | } 165 | myfile.close(); 166 | 167 | TLViz.init_graph(rank_num, maxcpu+1, 800, 800, args_info.outputfile_arg); 168 | TLViz.draw_ranklines(); 169 | maxtime -= args_info.starttime_arg; 170 | TLViz.draw_everything(maxtime); 171 | TLViz.close_graph(); 172 | } 173 | else { 174 | fprintf(stderr, "Unable to open file with starttimes (%s)\n", args_info.inputfile_arg); 175 | exit(EXIT_FAILURE); 176 | } 177 | 178 | exit(EXIT_SUCCESS); 179 | } 180 | 181 | 182 | -------------------------------------------------------------------------------- /src/Schedgen2/schedgen.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import sys 4 | import json 5 | import tempfile 6 | import subprocess 7 | import argparse 8 | from mpi_colls import * 9 | from additional_microbenchmarks import * 10 | 11 | parser = argparse.ArgumentParser(description="Generate GOAL Schedules.") 12 | 13 | subparsers = parser.add_subparsers( 14 | help="Communication to generate", dest="comm", required=True 15 | ) 16 | mpi = [] 17 | additional_microbenchmarks = [] 18 | 19 | incast_parser = subparsers.add_parser("incast") 20 | additional_microbenchmarks.append(incast_parser) 21 | 22 | outcast_parser = subparsers.add_parser("outcast") 23 | additional_microbenchmarks.append(outcast_parser) 24 | 25 | dissemination_parser = subparsers.add_parser("dissemination") 26 | mpi.append(dissemination_parser) 27 | 28 | reduce_parser = subparsers.add_parser("reduce") 29 | mpi.append(reduce_parser) 30 | 31 | bcast_parser = subparsers.add_parser("bcast") 32 | mpi.append(bcast_parser) 33 | 34 | scatter_parser = subparsers.add_parser("scatter") 35 | mpi.append(scatter_parser) 36 | 37 | allreduce_parser = subparsers.add_parser("allreduce") 38 | mpi.append(allreduce_parser) 39 | 40 | alltoall_parser = subparsers.add_parser("alltoall") 41 | mpi.append(alltoall_parser) 42 | 43 | alltoallv_parser = subparsers.add_parser("alltoallv") 44 | mpi.append(alltoallv_parser) 45 | 46 | for p in additional_microbenchmarks: 47 | p.add_argument( 48 | "--randomized_data", 49 | dest="randomized_data", 50 | action="store_true", 51 | help="Use unbalanced data sizes", 52 | ) 53 | 54 | for p in [allreduce_parser, alltoall_parser, alltoallv_parser]: 55 | p.add_argument( 56 | "--num_comm_groups", 57 | dest="num_comm_groups", 58 | type=int, 59 | default=1, 60 | help="Number of communication groups, >1 for multi-allreduce and multi-alltoall(v)", 61 | ) 62 | 63 | for p in mpi + additional_microbenchmarks: 64 | p.add_argument( 65 | "--ptrn", 66 | dest="ptrn", 67 | choices=["datasize_based", "binomialtree", "recdoub", "ring", "linear"], 68 
| default="datasize_based", 69 | help="Pattern to use for communication, note that not all patterns are available for all communication types", 70 | ) 71 | p.add_argument( 72 | "--ptrn-config", 73 | dest="ptrn_config", 74 | help="Configuration file for the pattern to use with data size based selection to override the default configuration", 75 | ) 76 | p.add_argument( 77 | "--comm_size", 78 | dest="comm_size", 79 | type=int, 80 | default=8, 81 | help="Size of the communicator", 82 | ) 83 | p.add_argument( 84 | "--datasize", 85 | dest="datasize", 86 | type=int, 87 | default=8, 88 | help="Size of the data, i.e., for reduce operations", 89 | ) 90 | p.add_argument( 91 | "--window_size", 92 | dest="window_size", 93 | type=int, 94 | default=0, 95 | help="Window size for windowed linear communication patterns", 96 | ) 97 | p.add_argument( 98 | "--compute_time_dependency", 99 | dest="compute_time_dependency", 100 | type=int, 101 | default=0, 102 | help="Compute time that is to be inserted in between send operations", 103 | ) 104 | p.add_argument( 105 | "--output", 106 | dest="output", 107 | default="stdout", 108 | help="Output file", 109 | ) 110 | p.add_argument( 111 | "--ignore_verification", 112 | dest="ignore_verification", 113 | action="store_true", 114 | help="Ignore verification of parameters", 115 | ) 116 | p.add_argument( 117 | "--config", 118 | dest="config", 119 | help="Configuration file, takes precedence over other parameters", 120 | ) 121 | p.add_argument( 122 | "--txt2bin", 123 | dest="txt2bin", 124 | help="Path to txt2bin executable", 125 | ) 126 | 127 | 128 | def verify_params(args): 129 | if args.ignore_verification: 130 | return 131 | assert args.comm_size > 0, "Communicator size must be greater than 0." 132 | assert args.datasize > 0, "Data size must be greater than 0." 
133 | assert ( 134 | args.txt2bin is None or args.output != "stdout" 135 | ), "Cannot use txt2bin with stdout" 136 | assert ( 137 | args.ptrn != "recdoub" or args.comm_size & (args.comm_size - 1) == 0 138 | ), "Currently recdoub pattern requires a power of 2 communicator size." 139 | 140 | 141 | def comm_to_func(comm: str) -> callable: 142 | """ 143 | Convert a communication type to a function that generates the communication. 144 | 145 | :param comm: The communication type. 146 | :return: A function that generates the communication. 147 | """ 148 | 149 | if comm == "incast": 150 | return incast 151 | elif comm == "outcast": 152 | return outcast 153 | elif comm == "reduce": 154 | return reduce 155 | elif comm == "bcast": 156 | return bcast 157 | elif comm == "scatter": 158 | return scatter 159 | elif comm == "dissemination": 160 | return dissemination 161 | elif comm == "allreduce": 162 | return allreduce 163 | elif comm == "alltoall": 164 | return alltoall 165 | elif comm == "alltoallv": 166 | return alltoallv 167 | else: 168 | raise ValueError(f"Communication type {comm} not implemented") 169 | 170 | 171 | def multi(collective: callable, num_comm_groups: int, comm_size: int, **kwargs): 172 | comm = GoalComm(comm_size * num_comm_groups) 173 | comms = comm.CommSplit( 174 | color=[i // comm_size for i in range(comm_size * num_comm_groups)], 175 | key=[i % comm_size for i in range(comm_size * num_comm_groups)], 176 | ) 177 | for comm_split in comms: 178 | comm_collective = collective(comm_size=comm_size, **kwargs) 179 | comm_split.Append(comm_collective) 180 | return comm 181 | 182 | 183 | args = parser.parse_args() 184 | if args.config is not None: 185 | with open(args.config, "r") as f: 186 | config = json.load(f) 187 | for k, v in config.items(): 188 | setattr(args, k, v) 189 | 190 | if args.ptrn == "datasize_based": 191 | if args.comm in [p.prog.split()[-1] for p in mpi]: 192 | args.ptrn = mpi_communication_pattern_selection( 193 | args.comm, args.comm_size, 
args.datasize 194 | ) 195 | elif args.comm in [p.prog.split()[-1] for p in additional_microbenchmarks]: 196 | args.ptrn = "linear" 197 | else: 198 | raise ValueError( 199 | f"Communication type {args.comm} does not currently support data size based pattern selection" 200 | ) 201 | 202 | verify_params(args) 203 | args.tag = 42 204 | 205 | if ( 206 | "num_comm_groups" not in vars(args) 207 | or args.num_comm_groups is None 208 | or args.num_comm_groups <= 1 209 | ): 210 | g = comm_to_func(args.comm)(**vars(args)) 211 | else: 212 | g = multi( 213 | comm_to_func(args.comm), **vars(args) 214 | ) 215 | 216 | if args.txt2bin is not None: 217 | assert args.output != "stdout", "Cannot use txt2bin with stdout" 218 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: 219 | g.write_goal(fh=f) 220 | tmp_goal_file = f.name 221 | subprocess.run( 222 | [args.txt2bin, "-i", tmp_goal_file, "-o", args.output, "-p"], 223 | check=True, 224 | ) 225 | subprocess.run(["rm", tmp_goal_file], check=True) 226 | else: 227 | if args.output == "stdout": 228 | args.output = sys.stdout 229 | else: 230 | args.output = open(args.output, "w") 231 | 232 | g.write_goal(fh=args.output) 233 | if args.output != sys.stdout: 234 | args.output.close() 235 | -------------------------------------------------------------------------------- /src/Drawviz/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. 
*/ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "drawviz" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "drawviz" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "0.1" 35 | #endif 36 | 37 | /** @brief Where the command line options are stored */ 38 | struct gengetopt_args_info 39 | { 40 | const char *help_help; /**< @brief Print help and exit help description. */ 41 | const char *version_help; /**< @brief Print version and exit help description. */ 42 | char * inputfile_arg; /**< @brief Name of the inputfile (event data). */ 43 | char * inputfile_orig; /**< @brief Name of the inputfile (event data) original value given at command line. */ 44 | const char *inputfile_help; /**< @brief Name of the inputfile (event data) help description. */ 45 | char * outputfile_arg; /**< @brief Name of the output file (postscript) (default='timeline.ps'). */ 46 | char * outputfile_orig; /**< @brief Name of the output file (postscript) original value given at command line. */ 47 | const char *outputfile_help; /**< @brief Name of the output file (postscript) help description. */ 48 | int linethickness_arg; /**< @brief Thickness of lines (default='1'). */ 49 | char * linethickness_orig; /**< @brief Thickness of lines original value given at command line. */ 50 | const char *linethickness_help; /**< @brief Thickness of lines help description. */ 51 | int starttime_arg; /**< @brief Starttime, if only a interval should be drawn (default='0'). 
*/ 52 | char * starttime_orig; /**< @brief Starttime, if only a interval should be drawn original value given at command line. */ 53 | const char *starttime_help; /**< @brief Starttime, if only a interval should be drawn help description. */ 54 | int endtime_arg; /**< @brief Endtime, if only a interval should be drawn (default='0'). */ 55 | char * endtime_orig; /**< @brief Endtime, if only a interval should be drawn original value given at command line. */ 56 | const char *endtime_help; /**< @brief Endtime, if only a interval should be drawn help description. */ 57 | int arrowheads_flag; /**< @brief If this flag is given, arrowheads will be drawn (default=off). */ 58 | const char *arrowheads_help; /**< @brief If this flag is given, arrowheads will be drawn help description. */ 59 | int descrtext_flag; /**< @brief If this flag is given, text will be written below o_send and o_recv (default=off). */ 60 | const char *descrtext_help; /**< @brief If this flag is given, text will be written below o_send and o_recv help description. */ 61 | 62 | unsigned int help_given ; /**< @brief Whether help was given. */ 63 | unsigned int version_given ; /**< @brief Whether version was given. */ 64 | unsigned int inputfile_given ; /**< @brief Whether inputfile was given. */ 65 | unsigned int outputfile_given ; /**< @brief Whether outputfile was given. */ 66 | unsigned int linethickness_given ; /**< @brief Whether linethickness was given. */ 67 | unsigned int starttime_given ; /**< @brief Whether starttime was given. */ 68 | unsigned int endtime_given ; /**< @brief Whether endtime was given. */ 69 | unsigned int arrowheads_given ; /**< @brief Whether arrowheads was given. */ 70 | unsigned int descrtext_given ; /**< @brief Whether descrtext was given. 
*/ 71 | 72 | } ; 73 | 74 | /** @brief The additional parameters to pass to parser functions */ 75 | struct cmdline_parser_params 76 | { 77 | int override; /**< @brief whether to override possibly already present options (default 0) */ 78 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 79 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 80 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 81 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 82 | } ; 83 | 84 | /** @brief the purpose string of the program */ 85 | extern const char *gengetopt_args_info_purpose; 86 | /** @brief the usage string of the program */ 87 | extern const char *gengetopt_args_info_usage; 88 | /** @brief the description string of the program */ 89 | extern const char *gengetopt_args_info_description; 90 | /** @brief all the lines making the help output */ 91 | extern const char *gengetopt_args_info_help[]; 92 | 93 | /** 94 | * The command line parser 95 | * @param argc the number of command line options 96 | * @param argv the command line options 97 | * @param args_info the structure where option information will be stored 98 | * @return 0 if everything went fine, NON 0 if an error took place 99 | */ 100 | int cmdline_parser (int argc, char **argv, 101 | struct gengetopt_args_info *args_info); 102 | 103 | /** 104 | * The command line parser (version with additional parameters - deprecated) 105 | * @param argc the number of command line options 106 | * @param argv the command line options 107 | * @param args_info the structure where option information will be stored 108 | * @param override whether to override possibly already present options 109 | * @param initialize whether to initialize the option structure my_args_info 110 | * 
@param check_required whether to check that all required options were provided 111 | * @return 0 if everything went fine, NON 0 if an error took place 112 | * @deprecated use cmdline_parser_ext() instead 113 | */ 114 | int cmdline_parser2 (int argc, char **argv, 115 | struct gengetopt_args_info *args_info, 116 | int override, int initialize, int check_required); 117 | 118 | /** 119 | * The command line parser (version with additional parameters) 120 | * @param argc the number of command line options 121 | * @param argv the command line options 122 | * @param args_info the structure where option information will be stored 123 | * @param params additional parameters for the parser 124 | * @return 0 if everything went fine, NON 0 if an error took place 125 | */ 126 | int cmdline_parser_ext (int argc, char **argv, 127 | struct gengetopt_args_info *args_info, 128 | struct cmdline_parser_params *params); 129 | 130 | /** 131 | * Save the contents of the option struct into an already open FILE stream. 132 | * @param outfile the stream where to dump options 133 | * @param args_info the option struct to dump 134 | * @return 0 if everything went fine, NON 0 if an error took place 135 | */ 136 | int cmdline_parser_dump(FILE *outfile, 137 | struct gengetopt_args_info *args_info); 138 | 139 | /** 140 | * Save the contents of the option struct into a (text) file. 
141 | * This file can be read by the config file parser (if generated by gengetopt) 142 | * @param filename the file where to save 143 | * @param args_info the option struct to save 144 | * @return 0 if everything went fine, NON 0 if an error took place 145 | */ 146 | int cmdline_parser_file_save(const char *filename, 147 | struct gengetopt_args_info *args_info); 148 | 149 | /** 150 | * Print the help 151 | */ 152 | void cmdline_parser_print_help(void); 153 | /** 154 | * Print the version 155 | */ 156 | void cmdline_parser_print_version(void); 157 | 158 | /** 159 | * Initializes all the fields a cmdline_parser_params structure 160 | * to their default values 161 | * @param params the structure to initialize 162 | */ 163 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 164 | 165 | /** 166 | * Allocates dynamically a cmdline_parser_params structure and initializes 167 | * all its fields to their default values 168 | * @return the created and initialized cmdline_parser_params structure 169 | */ 170 | struct cmdline_parser_params *cmdline_parser_params_create(void); 171 | 172 | /** 173 | * Initializes the passed gengetopt_args_info structure's fields 174 | * (also set default values for options that have a default) 175 | * @param args_info the structure to initialize 176 | */ 177 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 178 | /** 179 | * Deallocates the string fields of the gengetopt_args_info structure 180 | * (but does not deallocate the structure itself) 181 | * @param args_info the structure to deallocate 182 | */ 183 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 184 | 185 | /** 186 | * Checks that all the required options were specified 187 | * @param args_info the structure to check 188 | * @param prog_name the name of the program that will be used to print 189 | * possible errors 190 | * @return 191 | */ 192 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 193 | const 
char *prog_name); 194 | 195 | 196 | #ifdef __cplusplus 197 | } 198 | #endif /* __cplusplus */ 199 | #endif /* CMDLINE_H */ 200 | -------------------------------------------------------------------------------- /src/liballprof/template.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * liballprof MPIP Wrapper 3 | * 4 | * Copyright: Indiana University 5 | * Author: Torsten Hoefler 6 | * 7 | *************************************************************************/ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "fc_mangle.h" 20 | #define F77_FUNC FortranCInterface_GLOBAL 21 | 22 | #include "allprof.h" 23 | #include "numbers.h" 24 | #include "sync.h" 25 | 26 | #define true 1 27 | #define false 0 28 | 29 | #ifdef HAVE_NBC 30 | #include 31 | #endif 32 | 33 | #ifdef WRITER_THREAD 34 | #include 35 | #include 36 | #endif 37 | 38 | #ifdef __cplusplus 39 | extern "C" { 40 | #endif 41 | 42 | 43 | 44 | 45 | 46 | #ifdef WRITER_THREAD 47 | #define VOLATILE volatile 48 | /* have a second buffer to swap */ 49 | static volatile char *buf1, *buf2, 50 | *curbuf, /* current buffer base address */ 51 | *bufptr; /* current position on buffer */ 52 | static volatile char exitflag=0; 53 | static sem_t threadsem, usersem; 54 | #else 55 | #define VOLATILE 56 | static char *buf1, 57 | *curbuf, /* current buffer base address */ 58 | *bufptr; /* position in current buf */ 59 | #endif 60 | static char buf_initialized = false; 61 | 62 | static int world_rank, world_size; 63 | 64 | static FILE *fp; 65 | static char mpi_initialized = false; 66 | 67 | static void resetbuffer(void *buffer) { 68 | memset(buffer, '\0', BUFSIZE); 69 | buf_initialized = true; 70 | } 71 | 72 | #ifdef WRITER_THREAD 73 | static void *writer_thread(void* arg) { 74 | /* loops infinitely - until exit 
/* get \ceil log_base(i) \ceil with integer arithmetic */
/* Counts how many times 1 can be scaled by `base` before exceeding x,
 * i.e. the number of digits of x in that base:
 *   logi(b, 0) == 0, logi(10, 9) == 1, logi(10, 10) == 2. */
int logi(int base, int x) {
  int digits;
  int scaled = 1;
  for (digits = 0; scaled <= x; ++digits) {
    scaled *= base;
  }
  return digits;
}
LOG_MPI_INT) else 282 | IFDTYPE(MPI_INTEGER, LOG_MPI_INTEGER) else 283 | IFDTYPE(MPI_CHARACTER, LOG_MPI_CHARACTER) else 284 | IFDTYPE(MPI_LONG, LOG_MPI_LONG) else 285 | IFDTYPE(MPI_SHORT, LOG_MPI_SHORT) else 286 | IFDTYPE(MPI_UNSIGNED, LOG_MPI_UNSIGNED) else 287 | IFDTYPE(MPI_UNSIGNED_LONG, LOG_MPI_UNSIGNED_LONG) else 288 | IFDTYPE(MPI_UNSIGNED_SHORT, LOG_MPI_UNSIGNED_SHORT) else 289 | IFDTYPE(MPI_FLOAT, LOG_MPI_FLOAT) else 290 | IFDTYPE(MPI_REAL, LOG_MPI_REAL) else 291 | IFDTYPE(MPI_DOUBLE, LOG_MPI_DOUBLE) else 292 | IFDTYPE(MPI_DOUBLE_PRECISION, LOG_MPI_DOUBLE_PRECISION) else 293 | IFDTYPE(MPI_LONG_DOUBLE, LOG_MPI_LONG_DOUBLE) else 294 | IFDTYPE(MPI_BYTE, LOG_MPI_BYTE) else 295 | IFDTYPE(MPI_FLOAT_INT, LOG_MPI_FLOAT_INT) else 296 | IFDTYPE(MPI_DOUBLE_INT, LOG_MPI_DOUBLE_INT) else 297 | IFDTYPE(MPI_LONG_INT, LOG_MPI_LONG_INT) else 298 | IFDTYPE(MPI_2INT, LOG_MPI_2INT) else 299 | IFDTYPE(MPI_SHORT_INT, LOG_MPI_SHORT_INT) else 300 | IFDTYPE(MPI_LONG_DOUBLE_INT, LOG_MPI_LONG_DOUBLE_INT) else 301 | IFDTYPE(MPI_LOGICAL, LOG_MPI_LOGICAL) else 302 | IFDTYPE(MPI_COMPLEX, LOG_MPI_COMPLEX) else 303 | IFDTYPE(MPI_DOUBLE_COMPLEX, LOG_MPI_DOUBLE_COMPLEX) else 304 | return snprintf(buffer, length, ":%lu", (unsigned long)type); 305 | } 306 | 307 | #define IFOP(OP, opnum) \ 308 | if(op == OP) { \ 309 | return snprintf(buffer, length, ":%i", opnum); \ 310 | } \ 311 | 312 | static int printop(MPI_Op op, char *buffer, int length) { 313 | 314 | IFOP(MPI_MIN, LOG_MPI_MIN) else 315 | IFOP(MPI_MAX, LOG_MPI_MAX) else 316 | IFOP(MPI_SUM, LOG_MPI_SUM) else 317 | IFOP(MPI_PROD, LOG_MPI_PROD) else 318 | IFOP(MPI_LAND, LOG_MPI_LAND) else 319 | IFOP(MPI_BAND, LOG_MPI_BAND) else 320 | IFOP(MPI_LOR, LOG_MPI_LOR) else 321 | IFOP(MPI_BOR, LOG_MPI_BOR) else 322 | IFOP(MPI_LXOR, LOG_MPI_LXOR) else 323 | IFOP(MPI_BXOR, LOG_MPI_BXOR) else 324 | IFOP(MPI_MINLOC, LOG_MPI_MINLOC) else 325 | IFOP(MPI_MAXLOC, LOG_MPI_MAXLOC) else 326 | return snprintf(buffer, length, ":%lu", (unsigned long)op); 327 | } 328 
def mpi_communication_pattern_selection(
    algorithm: str, comm_size: int, datasize: int, ptrn_config: str = None
):
    """Select a communication pattern name for the given collective.

    If ptrn_config names a JSON file, the first matching entry wins. Each
    entry has the shape (lower bounds inclusive, upper bounds exclusive):
        {
          "algorithm": "name",   # optional; empty/missing matches everything
          "ptrn": "pattern_name",
          "lower_bounds": {"comm_size": -1, "datasize": -1,
                           "combined": [[grad, intercept], ...] or null},
          "upper_bounds": {"comm_size": -1, "datasize": -1,
                           "combined": [[grad, intercept], ...] or null}
        }
    where -1 means "no bound" and each combined pair constrains
    datasize against grad * comm_size + intercept.

    Without a config file, built-in heuristics are used.

    :param algorithm: collective name, e.g. "reduce", "bcast", "allreduce"
    :param comm_size: number of ranks in the communicator
    :param datasize: message size in bytes
    :param ptrn_config: optional path to a JSON selection config
    :return: pattern name string (None for "dissemination", which has no
             pattern choices yet)
    :raises ValueError: if no config entry matches, or the algorithm has no
                        built-in heuristic
    """
    if ptrn_config:
        import json  # local import: only needed on the config-file path

        with open(ptrn_config, "r") as f:
            config = json.load(f)
        for c in config:
            # Empty or missing "algorithm" matches any collective.
            if c.get("algorithm") and c["algorithm"] != algorithm:
                continue
            lower = c["lower_bounds"]
            upper = c["upper_bounds"]
            if lower["comm_size"] != -1 and comm_size < lower["comm_size"]:
                continue
            if upper["comm_size"] != -1 and comm_size >= upper["comm_size"]:
                continue
            if lower["datasize"] != -1 and datasize < lower["datasize"]:
                continue
            if upper["datasize"] != -1 and datasize >= upper["datasize"]:
                continue
            # BUG FIX: the original used `continue` inside the inner
            # `for grad, intercept` loops, which only skipped the inner
            # iteration and never rejected the config entry, so combined
            # bounds were effectively ignored. Use any() so a single
            # violated combined bound rejects this entry.
            if lower["combined"] is not None and any(
                datasize < grad * comm_size + intercept
                for grad, intercept in lower["combined"]
            ):
                continue
            if upper["combined"] is not None and any(
                datasize >= grad * comm_size + intercept
                for grad, intercept in upper["combined"]
            ):
                continue
            return c["ptrn"]
        raise ValueError(
            f"Cannot find a pattern for comm_size={comm_size} and datasize={datasize} according to the config file"
        )
    else:
        if algorithm == "reduce":
            # use binomial tree for large data size and when the communicator size is a power of 2
            if datasize > 4096 and comm_size & (comm_size - 1) == 0:
                return "binomialtree"
            else:
                return "linear"
        elif algorithm == "bcast":
            # use binomial tree for small data size and when the communicator size is a power of 2
            if datasize <= 4096 and comm_size & (comm_size - 1) == 0:
                return "binomialtree"
            else:
                return "linear"
        elif algorithm == "dissemination":
            # TODO currently not implemented to support different patterns
            pass
        elif algorithm == "allreduce":
            # Use recdoub for power of 2 communicator size and small data sizes
            if datasize <= 4096 and comm_size & (comm_size - 1) == 0:
                return "recdoub"
            else:
                return "ring"
        elif algorithm == "alltoall" or algorithm == "alltoallv":
            return "linear"
        else:
            raise ValueError(f"Communication type {algorithm} not implemented")
def allreduce(
    comm_size: int,
    datasize: int,
    tag: int = 42,
    ptrn: str = "recdoub",
    **kwargs,
):
    """Build an allreduce as a reduce-scatter phase followed by an allgather.

    The allgather phase uses a tag offset by comm_size so its messages do
    not collide with the reduce-scatter messages.

    :param comm_size: number of ranks
    :param datasize: message size in bytes
    :param tag: base message tag
    :param ptrn: "recdoub" or "ring"
    :return: the reduce-scatter GoalComm with the allgather appended
    :raises ValueError: for an unsupported pattern
    """
    phases = []
    for phase_algorithm, phase_tag in (
        ("reduce-scatter", tag),
        ("allgather", tag + comm_size),
    ):
        if ptrn == "recdoub":
            phases.append(
                recdoub(
                    comm_size=comm_size,
                    datasize=datasize,
                    tag=phase_tag,
                    algorithm=phase_algorithm,
                    **kwargs,
                )
            )
        elif ptrn == "ring":
            phases.append(
                ring(
                    comm_size=comm_size,
                    datasize=datasize,
                    tag=phase_tag,
                    algorithm=phase_algorithm,
                    rounds=comm_size - 1,
                    **kwargs,
                )
            )
        else:
            raise ValueError(f"allreduce with pattern {ptrn} not implemented")
    first, second = phases
    first.Append(second)
    return first
def alltoallv(
    comm_size: int,
    datasize: int,
    tag: int = 42,
    ptrn: str = "linear",
    window_size: int = 0,
    **kwargs,
):
    """Build an alltoallv schedule.

    Only the linear pattern is implemented; with window_size == 0 all
    exchanges are issued in parallel, otherwise a sliding window of that
    size is used.

    :raises ValueError: for an unsupported pattern
    """
    if ptrn != "linear":
        raise ValueError(f"alltoallv with pattern {ptrn} not implemented")
    # TODO: currently data is only randomized, add support for custom data sizes
    return linear(
        comm_size=comm_size,
        datasize=datasize,
        tag=tag,
        algorithm="alltoallv",
        parallel=(window_size == 0),
        randomized_data=True,
        window_size=window_size,
        **kwargs,
    )
30 | if re.match("#.*\n", allprof_line): 31 | return 32 | # if the line is whitespace ignore it 33 | if re.match("\s*\n", allprof_line): 34 | return 35 | # check if it matches any of the defined MPI functions 36 | m = re.match("(MPI_.+?):(.+:(\d+|-))\n", allprof_line) 37 | if m: 38 | name = m.group(1) 39 | args = m.group(2) 40 | if hasattr(self, name): 41 | args = args.strip().split(":") 42 | # turn args into ints where possible (ddts, comms, ... are not ints!) 43 | newargs = [] 44 | for arg in args: 45 | newarg = 0 46 | try: 47 | newarg = int(arg) 48 | except: 49 | newarg = arg 50 | newargs.append(newarg) 51 | args = newargs 52 | args.append(rank) 53 | if self.verbose: 54 | print("Parsing "+name+" with args "+str(args)) 55 | # for each line we get its start and end time (first and last elem in args) 56 | # we add a calc of the size of the difference between the endtime of the last 57 | # operation on rank and the starttime to account for any computation that might 58 | # have happened between calls - we init last_op in MPI_Init, so it might be None 59 | if self.getLastOp(rank) is not None: 60 | tstart = int(args[0]) 61 | tend = int(args[-1]) 62 | last_op, last_endtime = self.getLastOp(rank) 63 | newCalc = self.comm[rank].Calc(tstart - last_endtime) 64 | newCalc.requires(last_op) 65 | self.setLastOp(rank, newCalc, tend) 66 | newcomm = getattr(self, name)(*args) 67 | if newcomm is not None: 68 | # append rank of newcomm to self.comm, however all independent ops in newcomm depend on last_op 69 | # and the new last_op becomes the last op in newcomm (if there is only one, otherwise we make a calc of size 0) 70 | self.comm[rank].Append(newcomm[rank], dependOn=self.getLastOp(rank)[0]) 71 | lastop = None 72 | l = newcomm[rank].LastOps() 73 | if len(l) == 1: 74 | lastop = l[0] 75 | else: 76 | lastop = self.comm[rank].Calc(0) 77 | lastop.requires(self.getLastOp(rank)[0]) # just to be save in case newcomm is empty 78 | for o in l: 79 | lastop.requires(o) 80 | 
self.setLastOp(rank, lastop, args[-1]) 81 | else: 82 | raise NotImplementedError("Parsing of "+allprof_line.strip()+" is not implemented yet.") 83 | else: 84 | raise ValueError("The line "+allprof_line+" doesn't look like anything allprof should output!") 85 | 86 | def MPI_Initialized(self, tstart, flagptr, tend, rank): 87 | return None # this doesn't modify the goal schedule 88 | 89 | def MPI_Init(self, tstart, argcptr, argvptr, tend, rank: int): 90 | self.setLastOp(rank, self.comm[rank].Calc(tend), tend) 91 | return None 92 | 93 | def MPI_Comm_size(self, tstart, comm, sizeptr, tend, rank): 94 | return None 95 | 96 | def MPI_Comm_rank(self, tstart, comm, rankptr, tend, rank): 97 | return None 98 | 99 | def MPI_Irecv(self, tstart, buf, count, datatype, src, tag, comm, req, tend, rank): 100 | g = GoalComm(self.comm.CommSize()) 101 | ddtsize = self.getDDTSize(datatype) 102 | op = g[rank].Recv(int(src), int(tag), int(count)*ddtsize) 103 | self.addRequest(rank, req, op) 104 | return g #TODO handle splitted comms 105 | 106 | def MPI_Isend(self, tstart, buf, count, datatype, dst, tag, comm, req, tend, rank): 107 | g = GoalComm(self.comm.CommSize()) 108 | ddtsize = self.getDDTSize(datatype) 109 | op = g[rank].Send(int(dst), int(tag), int(count)*ddtsize) 110 | self.addRequest(rank, req, op) 111 | return g #TODO handle splitted comms 112 | 113 | def MPI_Waitall(self, tstart, count, requestptr, statusptr, tend, rank): 114 | calc = None 115 | for ridx in range(0, int(count)): 116 | request = int(requestptr)+ridx*self.REQUEST_SIZE 117 | op = self.findRequest(rank, request) 118 | if op is None: 119 | print("Waitall on a request we didn't see before - might be ok if the user initialized it to MPI_REQUEST_NULL, but also might mean request size is set to the wrong constant! 
-- check the code of the trace app!") 120 | continue 121 | if calc is None: 122 | calc = self.comm[rank].Calc(0) 123 | calc.requires(op) 124 | # Waitall directly modifies self.comm, thus returns None and we need to handle deps from/on last op in here 125 | calc.requires(self.getLastOp(rank)[0]) 126 | self.setLastOp(rank, calc, tend) 127 | return None 128 | 129 | def MPI_Wait(self, tstart, requestptr, statusptr, tend, rank): 130 | calc = None 131 | op = self.findRequest(rank, requestptr) 132 | if op is None: 133 | print("Wait on a request we didn't see before - might be ok if the user initialized it to MPI_REQUEST_NULL, but also might mean request size is set to the wrong constant! -- check the code of the trace app!") 134 | return 135 | calc = self.comm[rank].Calc(0) 136 | calc.requires(op) 137 | # Wait directly modifies self.comm, thus returns None and we need to handle deps from/on last op in here 138 | calc.requires(self.getLastOp(rank)[0]) 139 | self.setLastOp(rank, calc, tend) 140 | return None 141 | 142 | def MPI_Barrier(self, tstart, comm, tend, rank): 143 | return alltoall(datasize=0, comm_size=self.comm.CommSize()) 144 | 145 | def MPI_Wtime(self, tstart, tend, rank): 146 | return None #this does not modify the goal schedule 147 | 148 | def MPI_Allreduce(self, tstart, sendbuf, recvbuf, count, datatype, op, comm, tend, rank): 149 | datasize = self.getDDTSize(datatype) * count 150 | return allreduce(datasize, self.comm.CommSize()) 151 | 152 | def MPI_Finalize(self, tstart, tend, rank): 153 | return None #this does not modify the goal schedule 154 | 155 | def addRequest(self, rank, req, op): 156 | self.requests[rank][int(req)] = op 157 | 158 | def findRequest(self, rank, req): 159 | if int(req) in self.requests[rank]: 160 | op = self.requests[rank][int(req)] 161 | return op 162 | return None 163 | 164 | def deleteRequest(self, rank, req): 165 | if int(req) in self.requests[rank]: 166 | self.requests[rank].pop(int(req)) 167 | 168 | def getDDTSize(self, ddtstr): 
169 | return int(ddtstr.split(",")[1]) 170 | 171 | def parseDir(self, tracepath, nameptrn="pmpi-trace-rank-*.txt", abortonerror=False): 172 | self.tracepath = tracepath 173 | searchpath = os.path.join(tracepath, nameptrn) 174 | files = glob.glob(searchpath) 175 | if len(files) < 1: 176 | raise ValueError("No tracefiles found at path "+str(searchpath)) 177 | self.comm = GoalComm(len(files)) 178 | for rank in range(0, self.comm.CommSize()): 179 | self.requests.append({}) 180 | for rank in range(0, self.comm.CommSize()): 181 | file_name = str(rank).join(nameptrn.split("*")) 182 | fh = open(os.path.join(tracepath, file_name), "r") 183 | while True: 184 | line = fh.readline() 185 | if not line: 186 | if self.verbose: 187 | print("Finished parsing ranks "+str(rank)+" trace.") 188 | break 189 | else: 190 | try: 191 | self.parseLine(rank, line) 192 | except Exception as e: 193 | if abortonerror: 194 | raise e 195 | sys.exit(1) 196 | else: 197 | if self.verbose: 198 | print("There was a problem but we attempt to carry on: "+str(e)) 199 | fh.close() 200 | return self.comm 201 | 202 | 203 | 204 | if __name__ == "__main__": 205 | parser = argparse.ArgumentParser( 206 | prog='Schedgen2 Trace Parser', 207 | description='Reads an MPI trace in liballprof format and outputs a GOAL schedule (or a graphical representation of it).') 208 | parser.add_argument('-v', '--verbose', action='store_true', help="Be more verbose, i.e., print progress info.") 209 | parser.add_argument('-i', '--tracedir', required=True, help="Path to the directory containing the individual traces, each tracefile name follows nameptrn.") 210 | parser.add_argument('-n', '--nameptrn', default="pmpi-trace-rank-*.txt", help="Filename of traces, use * to indicate rank id (in MPI_COMM_WORLD), defaults to pmpi-trace-rank*.txt") 211 | parser.add_argument('-f', '--output-format', default="goal", choices=["goal", "graphviz"], help="Output format, either goal or graphviz, defaults to goal") 212 | parser.add_argument('-o', 
'--outfile', default="-", help="Output file name, use - for stdout (if verbose mode is on progress will be printed to stdout), defaults to -.") 213 | parser.add_argument('-r', '--requestsize', default=8, help="Size of an MPI_REQUEST in bytes, defaults to 8.") 214 | parser.add_argument('-a', '--abortonerror', action='store_true', help="By default we ignore errors such as not implemented MPI functions. Use this flag to abort on such errors.") 215 | args = parser.parse_args() 216 | p = AllprofParser(requestsize=args.requestsize, verbose=args.verbose) 217 | comm = p.parseDir(args.tracedir, nameptrn=args.nameptrn, abortonerror=args.abortonerror) 218 | outfile = sys.stdout 219 | if args.outfile != "-": 220 | outfile = open(args.outfile, "w") 221 | comm.write_goal(fh=outfile, format=args.output_format) 222 | outfile.close() 223 | 224 | -------------------------------------------------------------------------------- /src/liballprof/sync.c: -------------------------------------------------------------------------------- 1 | #include "sync.h" 2 | #include 3 | 4 | #define MAX_DOUBLE 1e100 5 | #define NUMBER_SMALLER 100 6 | static double *diffs=NULL; /* global array of all diffs to all ranks - only 7 | completely valid on rank 0 */ 8 | static double gdiff; 9 | 10 | double sync_peer(int client, int peer, MPI_Comm comm) { 11 | const double ABORT_VAL = 9999999.0; 12 | int notsmaller = 0; /* count number of RTTs that are *not* smaller than 13 | the current smallest one */ 14 | int server=0; 15 | double tstart, /* local start time */ 16 | tend, /* local end time */ 17 | trem, /* remote time */ 18 | tmpdiff, /* temporary difference to remote clock */ 19 | diff; /* difference to remote clock */ 20 | int res, r; 21 | res = PMPI_Comm_rank(comm, &r); 22 | 23 | if(!client) server = 1; 24 | 25 | double smallest = MAX_DOUBLE; /* the current smallest time */ 26 | do { 27 | /* the client sends a ping to the server and waits for a pong (and 28 | * takes the RTT time). 
It repeats this procedure until the last 29 | * NUMBER_SMALLER RTTs have not been smaller than the smallest 30 | * (tries to find the smallest RTT). When the smallest RTT is 31 | * found, it sends a special flag (0d) to the server that it knows 32 | * that the benchmark is finished. The client computes the diff 33 | * with this smallest RTT with the scheme described in the paper. 34 | * */ 35 | if(client) { 36 | tstart = PMPI_Wtime(); 37 | res = PMPI_Send(&tstart, 1, MPI_DOUBLE, peer, 0, comm); 38 | res = PMPI_Recv(&trem, 1, MPI_DOUBLE, peer, 0, comm, MPI_STATUS_IGNORE); 39 | tend = PMPI_Wtime(); 40 | tmpdiff = tstart + (tend-tstart)/2 - trem; 41 | 42 | if(tend-tstart < smallest) { 43 | smallest = tend-tstart; 44 | notsmaller = 0; 45 | diff = tmpdiff; /* save new smallest diff-time */ 46 | } else { 47 | if(++notsmaller == NUMBER_SMALLER) { 48 | /* send abort flag to client */ 49 | trem = ABORT_VAL; 50 | res = PMPI_Send(&trem, 1, MPI_DOUBLE, peer, 0, comm); 51 | /*printf("[%i] diff to %i: %lf\n", r, peer, diff*1e6);*/ 52 | break; 53 | } 54 | } 55 | /*printf("[%i] notsmaller: %i\n", r, notsmaller);*/ 56 | } 57 | 58 | /* The server just replies with the local time to the client 59 | * requests and aborts the benchmark if the abort flag (0d) is 60 | * received in any of the requests. 
*/ 61 | if(server) { 62 | /* printf("[%i] server: waiting for ping from %i\n", r, peer); */ 63 | res = PMPI_Recv(&tstart, 1, MPI_DOUBLE, peer, 0, comm, MPI_STATUS_IGNORE); 64 | if(tstart == ABORT_VAL) {break;} /* this is the signal from the client to stop */ 65 | trem = PMPI_Wtime(); /* fill in local time on server */ 66 | /* printf("[%i] server: got ping from %i (%lf) \n", r, peer, tstart); */ 67 | res = PMPI_Send(&trem, 1, MPI_DOUBLE, peer, 0, comm); 68 | } 69 | /* this loop is only left with a break */ 70 | } while(1); 71 | return diff; 72 | } 73 | 74 | 75 | /* tree-based synchronization mechanism 76 | * - */ 77 | double sync_tree(MPI_Comm comm) { 78 | int p, r, res, dist, round; 79 | int power; /* biggest power of two value that is smaller or equal to p */ 80 | int peer; /* synchronization peer */ 81 | double diff; 82 | 83 | res = PMPI_Comm_rank(comm, &r); 84 | res = PMPI_Comm_size(comm, &p); 85 | 86 | /* reallocate tha diffs array with the right size */ 87 | if(diffs != NULL) free(diffs); 88 | diffs = (double*)calloc(1, p*sizeof(double)); 89 | 90 | /* check if p is power of 2 91 | { int i=1; 92 | while((i = i << 1) < p) {}; 93 | if(i != p) { 94 | printf("communicator size (%i) must be power of 2 (%i)!\n", p, i); 95 | MPI_Abort(MPI_COMM_WORLD, 1); 96 | } 97 | }*/ 98 | 99 | { /* get the maximum power of 2 that is smaller than p */ 100 | int num=1; 101 | do { 102 | num *= 2; 103 | } while(num*2 <= p); 104 | power = num; 105 | } 106 | 107 | /* if I am in the powers-of two group? 
*/ 108 | if(r < power) { 109 | dist = 1; /* this gets left-shifted (<<) every round and is after 110 | $\lceil log_2(p) \rceil$ rounds >= p */ 111 | round = 1; /* fun and printf round counter - not really needed */ 112 | do { 113 | int client, server; 114 | 115 | client = 0; server = 0; 116 | client = ((r % (dist << 1)) == 0); 117 | server = ((r % (dist << 1)) == dist); 118 | 119 | if(server) { 120 | peer = r - dist; 121 | if(peer < 0) server = 0; /* disable yourself if there is no peer*/ 122 | /*if(server) printf("(%i) %i <- %i\n", round, r, peer);*/ 123 | } 124 | if(client) { 125 | peer = r + dist; 126 | if(peer >= p) client = 0; /* disable yourself if there is no peer*/ 127 | /*if(client) printf("(%i) %i -> %i\n", round, peer, r);*/ 128 | } 129 | if(!client && !server) break; /* TODO: leave loop if no peer left - 130 | works only for power of two process 131 | groups */ 132 | 133 | diff = sync_peer(client, peer, comm); 134 | 135 | /* diff is the time difference between client and server. This is 136 | * only valid on the client, and is derived with the following 137 | * formula: diff = tstart + (tend-tstart)/2 - trem; 138 | * example: 139 | * Client Server 140 | * tstart = 100 200 (those are local times, but at the same moment) 141 | * send message (L=10) 142 | * 110 trem = 210 143 | * send message back (L=10) 144 | * tend = 120 220 145 | * 146 | * diff = 100 + (120-100)/2 - 210 147 | * = 100 + 10 - 210 = 100 148 | * 149 | * now, to get the local time on a server on a client: 150 | * t_s = r_c - diff 151 | */ 152 | 153 | /* the client measured the time difference to his peer-server of the 154 | * current round. Since rank 0 is the global synchronization point, 155 | * rank 0's array has to be up to date and the other clients have to 156 | * communicate all their knowledge to rank 0 as described in the 157 | * paper. 
*/ 158 | 159 | if(client) { 160 | /* all clients just measured the time difference to node r + diff 161 | * (=peer) */ 162 | diffs[peer] = diff; 163 | 164 | /* we are a client - we need to receive all the knowledge 165 | * (differences) that the server we just synchronized with holds! 166 | * Our server has been "round-1" times client and measures 167 | * "round-1" diffs */ 168 | if(round > 1) { 169 | double *recvbuf; /* receive the server's data */ 170 | int items, i; 171 | 172 | items = (1 << (round-1))-1; 173 | recvbuf = (double*)malloc(items*sizeof(double)); 174 | 175 | res = PMPI_Recv(recvbuf, items, MPI_DOUBLE, peer, 0, comm, MPI_STATUS_IGNORE); 176 | 177 | /*printf("[%i] round: %i, client merges %i items\n", r, round, items);*/ 178 | /* merge data into my own field */ 179 | for(i=0; i 1) { 194 | int i, tmpdist, tmppeer, items; 195 | double *sendbuf; 196 | 197 | items = (1 << (round-1))-1; 198 | sendbuf = (double*)malloc(items*sizeof(double)); 199 | 200 | /*printf("[%i] round: %i, server sends %i items\n", r, round, items);*/ 201 | 202 | /* fill buffer - every server holds the $2^(round-1)-1$ next 203 | * diffs */ 204 | for(i=0; i power=4 219 | * rank 0..3 are synched at this stage and rank 4 and 5 have to sync 220 | * with 0 and 1 respectively */ 221 | if(r < power) { 222 | /* check if I have a partner in the non power group */ 223 | if(p - power > r) { /* I have a partner */ 224 | peer = power + r; /* that's my partner */ 225 | /*printf("[%i] server for %i\n", r, peer);*/ 226 | sync_peer(0, peer, comm); /* I am the server */ 227 | } 228 | } else { 229 | peer = r - power; /* that's my partner */ 230 | /*printf("[%i] client for %i\n", r, peer);*/ 231 | diff = sync_peer(1, peer, comm); /* I am the client */ 232 | res = PMPI_Send(&diff, 1, MPI_DOUBLE, 0, 1, comm); 233 | } 234 | 235 | if(0 == r) { 236 | int syncpeer; 237 | MPI_Request *reqs; 238 | double *tmpdiffs; 239 | 240 | reqs = (MPI_Request*)malloc((p-power)*sizeof(MPI_Request)); 241 | tmpdiffs = 
class GoalLabeller:
    """Hands out stable, consecutive integer labels for GOAL ops and ids for
    communicators.

    The same op (or comm) always maps to the same number; a new op/comm gets
    the next free number, starting at 1.
    """

    def __init__(self):
        self.next_label = 1  # next unused op label
        self.next_comm = 1   # next unused communicator id
        self.op_dict = {}    # op object -> label
        self.comm_dict = {}  # comm object -> id

    def GetLabel(self, op):
        """Return the label of op, assigning the next free label on first use."""
        # Idiom fix: direct negative membership test instead of
        # `if op in ...: pass / else:`.
        if op not in self.op_dict:
            self.op_dict[op] = self.next_label
            self.next_label += 1
        return self.op_dict[op]

    def GetCommID(self, comm):
        """Return the id of comm, assigning the next free id on first use."""
        if comm not in self.comm_dict:
            self.comm_dict[comm] = self.next_comm
            self.next_comm += 1
        return self.comm_dict[comm]

    def MakeTag(self, tag, comm):
        """Combine the user tag and the comm tag portion.

        NOTE(review): tags collide once a comm id reaches 1000 -- verify the
        number of communicators stays below that.
        """
        return tag * 1000 + comm
class GoalCalc(GoalOp):
    """A local computation of the given size (GOAL 'calc' op)."""

    def __init__(self, size):
        super().__init__()
        self.size = size

    def write_goal(self, labeller, fh, comm, basecomm, format="goal"):
        """Emit this calc op in the requested output format."""
        label = labeller.GetLabel(self)
        if format == "goal":
            fh.write("l{0}: calc {1}\n".format(label, self.size))
        elif format == "graphviz":
            fh.write("\"l{0}\" [label=\"calc {1}\"]\n".format(label, self.size))
        else:
            raise NotImplementedError("Requested output format "+str(format)+" not implemented!")
return op 147 | 148 | def Merge(self, mrank): 149 | self.ops += mrank.ops 150 | 151 | def Append(self, arank, dependOn=None, allOpsDepend=False): 152 | """ Append arank to self. If dependOn is None, all ops in self need to finish before we start executing aranks ops. If dependOn is given we only depend on that. 153 | By default (allOpsDepend) only independent ops in arank depend on self, however if allOpsDepend=True all ops do. """ 154 | if dependOn is None: 155 | c = self.Calc(0) 156 | for l in self.LastOps(): 157 | if l == c: 158 | pass 159 | else: 160 | c.requires(l) 161 | else: 162 | c = dependOn 163 | self.ops += arank.ops 164 | depops = arank.IndepOps() 165 | if allOpsDepend: 166 | depops = arank.ops 167 | for i in depops: 168 | i.requires(c) 169 | 170 | def IndepOps(self): 171 | res = [x for x in self.ops if (len(x.depends_on) == 0)] 172 | return res 173 | 174 | def LastOps(self): 175 | rem = [] 176 | for x in self.ops: 177 | for d in x.depends_on: 178 | rem.append(d) 179 | s = set(rem) 180 | res = [x for x in self.ops if x not in s] 181 | return res 182 | 183 | 184 | def write_goal(self, labeller, fh, rankid=True, basecomm=None, format="goal"): 185 | if basecomm is None: 186 | basecomm = ( 187 | self.comm 188 | ) # stupid python evals default args at method definition, not call time :( 189 | if rankid: 190 | if format == "goal": 191 | fh.write("rank " + str(self.rank) + " {\n") 192 | elif format == "graphviz": 193 | fh.write("subgraph cluster_" + str(self.rank) + " {\n") 194 | fh.write("style=filled; color=lightgrey; node [style=filled,color=white]; label=\"rank "+str(self.rank)+"\";") 195 | for op in self.ops: 196 | op.write_goal(labeller, fh, self.comm, basecomm, format=format) 197 | for op in self.ops: 198 | for req in op.depends_on: 199 | if format == "goal": 200 | fh.write( 201 | "l{label1} requires l{label2}\n".format( 202 | label1=labeller.GetLabel(op), label2=labeller.GetLabel(req) 203 | ) 204 | ) 205 | if format == "graphviz": 206 | # we "invert" 
dependencies in grphviz format, i.e, a->b means a is executed before b. 207 | # Where in goal it would be "b requires a" - but this would make graphs look upside down. 208 | fh.write( 209 | "l{label2} -> l{label1}\n".format( 210 | label1=labeller.GetLabel(op), label2=labeller.GetLabel(req) 211 | ) 212 | ) 213 | for sc in self.comm.subcomms: 214 | sc.write_goal_subcomm(labeller, fh, self.rank, basecomm, format=format) 215 | if rankid: 216 | fh.write("}\n\n") 217 | 218 | 219 | class GoalComm: 220 | def __init__(self, comm_size): 221 | self.base_comm = self 222 | self.comm_size = comm_size 223 | self.subcomms = [] 224 | self.ranks = [GoalRank(comm=self, rank=rank) for rank in range(comm_size)] 225 | 226 | def __getitem__(self, index): 227 | return self.ranks[index] 228 | 229 | def Append(self, comm): 230 | """Append comm to self, such that when all ops in self are finished, those in comm can start.""" 231 | if comm.CommSize() > self.CommSize(): 232 | raise ValueError("Cannot append a larger comm to a smaller one!") 233 | if len(comm.subcomms) > 0: 234 | raise ValueError("Cannot append a comm with subcomms, flatten first?") 235 | for idx, rank in enumerate(self.ranks): 236 | rank.Append(comm[idx]) 237 | 238 | def Merge(self, comm): 239 | """Merge comm into self, such that the ops in both run in parallel.""" 240 | if comm.CommSize() > self.CommSize(): 241 | raise "Cannot merge a larger comm to a smaller one!" 
242 | if len(comm.subcomms) > 0: 243 | raise ValueError("Cannot append a comm with subcomms, flatten first?") 244 | for idx, rank in enumerate(self.ranks): 245 | rank.Merge(comm[idx]) 246 | 247 | def Send(self, src, dst, tag, size): 248 | return self[src].Send(dst, tag, size) 249 | 250 | def Recv(self, dst, src, tag, size): 251 | return self[dst].Recv(src, tag, size) 252 | 253 | def Calc(self, host, size): 254 | return self[host].Calc(size) 255 | 256 | def CommSize(self): 257 | return self.comm_size 258 | 259 | def CommSplit(self, color, key): 260 | if len(list(color)) < self.comm_size or len(list(key)) < self.comm_size: 261 | raise ValueError( 262 | "The length of color and key array must match the communicator size." 263 | ) 264 | newcomms = [] 265 | order = [ 266 | (oldrank, color[oldrank], key[oldrank]) 267 | for oldrank in range(0, self.comm_size) 268 | ] 269 | color_buckets = {} 270 | for o in order: 271 | if o[1] in color_buckets: 272 | color_buckets[o[1]].append(o) 273 | else: 274 | color_buckets[o[1]] = [o] 275 | for c in color_buckets.keys(): 276 | c_list = sorted( 277 | color_buckets[c], key=lambda x: x[2] 278 | ) # sort by key within color 279 | nc = GoalComm(len(c_list)) 280 | nc.base_comm = self 281 | for idx, r in enumerate(nc): 282 | r.base_rank = c_list[idx][ 283 | 0 284 | ] # store the rank the new rank had in the comm it was splitted from 285 | newcomms.append(nc) 286 | self.subcomms += newcomms 287 | return newcomms 288 | 289 | def write_goal(self, labeller=None, fh=sys.stdout, format="goal"): 290 | if format == "goal": 291 | fh.write("num_ranks " + str(len(self.ranks)) + "\n\n") 292 | elif format == "graphviz": 293 | fh.write("digraph G {\n") 294 | if labeller is None: 295 | labeller = GoalLabeller() 296 | for r in self.ranks: 297 | r.write_goal(labeller, fh, rankid=True, basecomm=self, format=format) 298 | if format == "graphviz": 299 | fh.write("}\n") 300 | 301 | 302 | def write_goal_subcomm(self, labeller, fh, rank, basecomm, format="goal"): 
def binomialtree(
    comm_size: int,
    datasize: int,
    tag: int,
    algorithm: str = "reduce",
    compute_time_dependency: int = 0,
    **kwargs,
) -> GoalComm:
    """
    Create a binomial tree communication pattern.

    :param comm_size: number of ranks in the communicator
    :param datasize: size of data to send or receive
    :param tag: tag that is used for all send and receive operations
    :param algorithm: communication algorithm that uses this pattern; default is reduce
    :param compute_time_dependency: compute time dependency for each send operation; if 0 (default), no compute time is added
    :param kwargs: additional arguments that are ignored
    :return: GoalComm object that represents the communication pattern
    """
    assert algorithm in [
        "reduce",
        "bcast",
        "scatter",
    ], "direction must be reduce, bcast, or scatter"
    comm = GoalComm(comm_size)
    num_rounds = ceil(log2(comm_size))
    for rank in range(comm_size):
        send, recv = None, None
        for step in range(num_rounds):
            mask = 1 << step  # same as 2**step
            # "Upper" partner: rank + 2**step, only for ranks below 2**step.
            if rank < mask and rank + mask < comm_size:
                peer = rank + mask
                if algorithm == "reduce":
                    recv = comm.Recv(size=datasize, src=peer, dst=rank, tag=tag)
                elif algorithm in ["bcast", "scatter"]:
                    send = comm.Send(size=datasize, dst=peer, src=rank, tag=tag)
                else:
                    raise ValueError(
                        "direction "
                        + str(algorithm)
                        + " in binomialtree not implemented."
                    )
                # Once this rank both received and sent, order them (optionally
                # with a compute op in between).
                if (send is not None) and (recv is not None):
                    if compute_time_dependency > 0:
                        calc = comm.Calc(host=rank, size=compute_time_dependency)
                        calc.requires(recv)
                        send.requires(calc)
                    else:
                        send.requires(recv)
            # "Lower" partner: rank - 2**step, fires exactly once per rank.
            if mask <= rank < (mask << 1):
                peer = rank - mask
                if algorithm == "reduce":
                    send = comm.Send(size=datasize, dst=peer, src=rank, tag=tag)
                if algorithm in ["bcast", "scatter"]:
                    recv = comm.Recv(size=datasize, src=peer, dst=rank, tag=tag)

    return comm
f"the pattern does not currently support the {algorithm} algorithm" 104 | ) 105 | if dest < comm_size: 106 | send = comm.Send(size=message_size, src=rank, dst=dest, tag=tag + r) 107 | if dependencies[rank] is not None: 108 | send.requires(dependencies[rank]) 109 | dependencies[rank] = comm.Recv( 110 | size=message_size, src=dest, dst=rank, tag=tag + r 111 | ) 112 | if compute_time_dependency > 0: 113 | calc = comm.Calc(host=rank, size=compute_time_dependency) 114 | calc.requires(dependencies[rank]) 115 | dependencies[rank] = calc 116 | return comm 117 | 118 | 119 | def ring( 120 | comm_size: int, 121 | datasize: int, 122 | tag: int, 123 | algorithm: str = "reduce-scatter", 124 | rounds: int = 1, 125 | compute_time_dependency: int = 0, 126 | **kwargs, 127 | ) -> GoalComm: 128 | """ 129 | Create a ring communication pattern. 130 | 131 | :param comm_size: number of ranks in the communicator 132 | :param datasize: size of data to send in each round 133 | :param tag: base tag that is incremented for each round 134 | :param algorithm: communication algorithm that uses this pattern; default is reduce-scatter 135 | :param rounds: number of rounds to send data around the ring 136 | :param compute_time_dependency: compute time dependency for each send operation; if 0 (default), no compute time is added 137 | :param kwargs: additional arguments that are ignored 138 | :return: GoalComm object that represents the communication pattern 139 | """ 140 | comm = GoalComm(comm_size) 141 | dependencies = [None] * comm_size 142 | if algorithm in ["reduce-scatter", "allgather"]: 143 | datasize = datasize // comm_size 144 | for r in range(rounds): 145 | for rank in range(comm_size): 146 | send = comm.Send( 147 | size=datasize, src=rank, dst=(rank + 1) % comm_size, tag=tag + r 148 | ) 149 | if dependencies[rank] is not None: 150 | send.requires(dependencies[rank]) 151 | dependencies[rank] = comm.Recv( 152 | size=datasize, src=(rank - 1) % comm_size, dst=rank, tag=tag + r 153 | ) 154 | if 
def _single_source_or_destination_linear(
    comm: GoalComm,
    anchor: int,
    datasizes: Union[List[int], List[List[int]]],
    tag: int,
    algorithm: str = "bcast",
    parallel: bool = True,
    window_size: int = 0,
    compute_time_dependency: int = 0,
) -> GoalComm:
    """
    Create a single source or destination linear communication pattern.

    :param comm: GoalComm object that contains the ranks
    :param anchor: rank that is the source or destination
    :param datasizes: size(s) of data to send or receive
    :param tag: tag that is used for all send and receive operations
    :param algorithm: communication algorithm that uses this pattern; default is bcast (single source, multiple destinations)
    :param parallel: whether to send multiple messages in parallel; default is True (send messages in parallel)
    :param window_size: number of operations that can be in flight at once; default is 0 (no windowing)
    :param compute_time_dependency: compute time dependency for each send operation; default is 0 (no compute time)
    :return: GoalComm object that represents the communication pattern
    """
    assert algorithm in [
        "bcast",
        "reduce",
        "alltoall",
        "alltoallv",
        "scatter",
        "incast",
        "outcast",
    ], f"the pattern does not currently support the {algorithm} algorithm"
    assert (
        parallel and window_size == 0 and compute_time_dependency == 0
    ) or algorithm not in [
        "reduce",
        "incast",
    ], f"We do not introduce dependencies, windowing, or compute time for linear receives"

    dependency = None
    if window_size > 0:
        window = [None] * window_size
        next_slot = 0
    for rank in range(comm.comm_size):
        if rank == anchor:
            continue  # the anchor never messages itself
        if algorithm in ["bcast", "alltoall", "alltoallv", "scatter", "outcast"]:
            if algorithm in ["alltoall", "alltoallv"]:
                datasize = datasizes[anchor][rank]
            else:
                datasize = datasizes[rank]
            send = comm.Send(src=anchor, dst=rank, size=datasize, tag=tag)
            recv = comm.Recv(src=anchor, dst=rank, size=datasize, tag=tag)
            if not parallel:
                if window_size == 0:
                    # Fully serialized: each send waits for the previous one.
                    if dependency is not None:
                        send.requires(dependency)
                    dependency = send
                    if compute_time_dependency > 0:
                        calc = comm.Calc(host=anchor, size=compute_time_dependency)
                        calc.requires(dependency)
                        dependency = calc
                else:
                    # Windowed: at most window_size sends in flight at once.
                    if window[next_slot] is not None:
                        send.requires(window[next_slot])
                    window[next_slot] = send
                    next_slot = (next_slot + 1) % window_size
                    if compute_time_dependency > 0:
                        send.requires(
                            comm.Calc(host=anchor, size=compute_time_dependency)
                        )
            else:
                if compute_time_dependency > 0:
                    send.requires(comm.Calc(host=anchor, size=compute_time_dependency))
        elif algorithm in ["reduce", "incast"]:
            datasize = datasizes[rank]
            send = comm.Send(src=rank, dst=anchor, size=datasize, tag=tag)
            recv = comm.Recv(src=rank, dst=anchor, size=datasize, tag=tag)
            if compute_time_dependency > 0:
                send.requires(comm.Calc(host=rank, size=compute_time_dependency))
        else:
            raise ValueError(
                f"the pattern does not currently support the {algorithm} algorithm"
            )
    # BUG FIX: the function is annotated and documented to return a GoalComm
    # but previously fell off the end and returned None.
    return comm
260 | 261 | :param comm_size: number of ranks in the communicator 262 | :param datasize: size of data to send 263 | :param tag: tag that is used for all send and receive operations 264 | :param algorithm: communication algorithm that uses this pattern; default is bcast (single source, multiple destinations) 265 | :param parallel: whether to send multiple messages in parallel; default is True (send messages in parallel) 266 | :param randomized_data: whether to randomize the data sent or received; default is False (same size for all messages) 267 | :param window_size: number of operations that can be in flight at once; default is 0 (no windowing) 268 | :param compute_time_dependency: compute time dependency for each send operation; default is 0 (no compute time) 269 | :param kwargs: additional arguments that are ignored 270 | :return: GoalComm object that represents the communication pattern 271 | """ 272 | comm = GoalComm(comm_size) 273 | 274 | assert algorithm in [ 275 | "bcast", 276 | "reduce", 277 | "alltoall", 278 | "alltoallv", 279 | "scatter", 280 | "incast", 281 | "outcast", 282 | ], f"the pattern does not currently support the {algorithm} algorithm" 283 | 284 | if algorithm in ["alltoall", "alltoallv"]: 285 | datasizes = [ 286 | [ 287 | (datasize + int(0.1 * random.randint(-datasize, datasize))) 288 | if randomized_data 289 | else datasize 290 | for _ in range(comm_size) 291 | ] 292 | for _ in range(comm_size) 293 | ] 294 | 295 | for anchor in range(comm_size): 296 | _single_source_or_destination_linear( 297 | comm, 298 | anchor, 299 | datasizes, 300 | tag, 301 | algorithm, 302 | parallel, 303 | window_size, 304 | compute_time_dependency, 305 | ) 306 | else: 307 | datasizes = [ 308 | (datasize + int(0.1 * random.randint(-datasize, datasize))) 309 | if randomized_data 310 | else datasize 311 | for _ in range(comm_size) 312 | ] 313 | _single_source_or_destination_linear( 314 | comm, 315 | 0, 316 | datasizes, 317 | tag, 318 | algorithm, 319 | parallel, 320 | 
window_size, 321 | compute_time_dependency, 322 | ) 323 | 324 | return comm 325 | -------------------------------------------------------------------------------- /src/Drawviz/TimelineDrawing.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 3 | * University Research and Technology 4 | * Corporation. All rights reserved. 5 | * 6 | * Author(s): Torsten Hoefler 7 | * Timo Schneider 8 | * 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "TimelineDrawing.hpp" 22 | 23 | 24 | void TimelineDrawing::init_graph(int numranks, int numcpus, int width = 800, int height = 800, std::string filename = "timeline.ps") { 25 | 26 | this->numranks = numranks; 27 | this->numcpus = numcpus; 28 | this->width = width; 29 | this->height = height; 30 | 31 | this->ranksep = height/(numranks+2); 32 | this->cpusep = (ranksep*0.75) / 5; // this means we assume 4 cpus at max 33 | this->timesep = width/100; 34 | this->fontsize = 10; 35 | this->leftmargin = 50; 36 | 37 | PS_boot(); 38 | this->psdoc = PS_new(); 39 | PS_open_file(this->psdoc, filename.c_str()); 40 | PS_begin_page(this->psdoc, (this->numranks+2)*this->ranksep, (this->numranks+2)*this->ranksep); 41 | this->psfont = PS_findfont(this->psdoc, "Helvetica", "", 0); 42 | PS_setfont(this->psdoc, this->psfont, this->fontsize); 43 | 44 | } 45 | 46 | void TimelineDrawing::close_graph() { 47 | 48 | PS_end_page(this->psdoc); 49 | PS_close(this->psdoc); 50 | PS_delete(this->psdoc); 51 | PS_shutdown(); 52 | } 53 | 54 | void TimelineDrawing::draw_everything(int maxtime) { 55 | 56 | this->timesep = ((double) (this->width - (this->leftmargin * 2))) / (double) maxtime; 57 | 58 | for (unsigned int i=0; ioverheads.size(); i++) { 59 | if (this->overheads.at(i).type == 1) { 60 | draw_osend(this->overheads.at(i).rank, 61 | 
this->overheads.at(i).cpu, 62 | this->overheads.at(i).start, 63 | this->overheads.at(i).end, 64 | this->overheads.at(i).r, 65 | this->overheads.at(i).g, 66 | this->overheads.at(i).b); 67 | } 68 | if (this->overheads.at(i).type == 2) { 69 | draw_orecv(this->overheads.at(i).rank, 70 | this->overheads.at(i).cpu, 71 | this->overheads.at(i).start, 72 | this->overheads.at(i).end, 73 | this->overheads.at(i).r, 74 | this->overheads.at(i).g, 75 | this->overheads.at(i).b 76 | ); 77 | 78 | } 79 | if (this->overheads.at(i).type == 3) { 80 | draw_loclop(this->overheads.at(i).rank, 81 | this->overheads.at(i).cpu, 82 | this->overheads.at(i).start, 83 | this->overheads.at(i).end, 84 | this->overheads.at(i).r, 85 | this->overheads.at(i).g, 86 | this->overheads.at(i).b 87 | ); 88 | 89 | } 90 | if (this->overheads.at(i).type == 4) { 91 | draw_noise(this->overheads.at(i).rank, 92 | this->overheads.at(i).cpu, 93 | this->overheads.at(i).start, 94 | this->overheads.at(i).end, 95 | this->overheads.at(i).r, 96 | this->overheads.at(i).g, 97 | this->overheads.at(i).b 98 | ); 99 | } 100 | 101 | } 102 | for (unsigned int i=0; itransmissions.size(); i++) { 103 | draw_transmission(this->transmissions.at(i).source, 104 | this->transmissions.at(i).dest, 105 | this->transmissions.at(i).starttime, 106 | this->transmissions.at(i).endtime, 107 | this->transmissions.at(i).size, 108 | this->transmissions.at(i).G, 109 | this->transmissions.at(i).r, 110 | this->transmissions.at(i).g, 111 | this->transmissions.at(i).b 112 | ); 113 | } 114 | } 115 | 116 | void TimelineDrawing::draw_ranklines() { 117 | 118 | for (int i=0; ileftmargin, (i+2)*ranksep); 121 | PS_lineto(psdoc, this->width - this->leftmargin , (i+2)*ranksep); 122 | PS_stroke(psdoc); 123 | char textbuffer[128]; 124 | snprintf(textbuffer, 128, "Rank %i", i); 125 | PS_setfont(psdoc, this->psfont, this->fontsize); 126 | PS_show_xy(psdoc, textbuffer, 5, (i+2)*ranksep); 127 | for (int j=1; jpsfont, this->fontsize/1.75); 129 | PS_setlinewidth(psdoc, 
0.05); 130 | PS_moveto(psdoc, this->leftmargin, (i+2)*ranksep - j*cpusep); 131 | PS_lineto(psdoc, this->width - this->leftmargin , (i+2)*ranksep - j*cpusep ); 132 | PS_stroke(psdoc); 133 | PS_setlinewidth(psdoc, 0.2); 134 | snprintf(textbuffer, 128, "CPU %i", j); 135 | PS_show_xy(psdoc, textbuffer, 7, (i+2)*ranksep - j*cpusep); 136 | } 137 | } 138 | 139 | PS_setfont(psdoc, this->psfont, this->fontsize); 140 | PS_show_xy(psdoc, "Time", width * 0.5, ranksep*0.3); 141 | 142 | } 143 | 144 | void TimelineDrawing::draw_seperator(int rank, int cpu, int pos) { 145 | 146 | PS_setlinewidth(psdoc, 0.1); 147 | PS_moveto(psdoc, 148 | this->leftmargin + pos * this->timesep, 149 | (rank+2) * this->ranksep - cpu * this->cpusep - 3 ); 150 | PS_lineto(psdoc, 151 | this->leftmargin + pos * this->timesep, 152 | (rank+2) * this->ranksep - cpu * this->cpusep + 3 ); 153 | PS_stroke(psdoc); 154 | } 155 | 156 | void TimelineDrawing::draw_osend(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b) { 157 | 158 | PS_setcolor(psdoc, "stroke", "rgb", r, g, b, 0.0); 159 | PS_setlinewidth(psdoc, args_info.linethickness_arg+1.0); 160 | PS_moveto(psdoc, 161 | this->leftmargin + start * this->timesep, 162 | (rank+2)*this->ranksep - cpu*this->cpusep); 163 | PS_lineto(psdoc, 164 | this->leftmargin + end * this->timesep, 165 | (rank+2)*this->ranksep - cpu*this->cpusep); 166 | PS_stroke(psdoc); 167 | 168 | this->draw_seperator(rank, cpu, start); 169 | this->draw_seperator(rank, cpu, end); 170 | 171 | if (args_info.descrtext_given) { 172 | PS_setfont(psdoc, this->psfont, this->fontsize/2); 173 | int xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 174 | xpos -= (PS_stringwidth(psdoc, "o", this->psfont, this->fontsize/2) / 2); 175 | PS_show_xy(psdoc, "o", xpos, 176 | (rank+2)*ranksep - cpu*cpusep + ranksep * 0.1 ); 177 | 178 | xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 179 | xpos -= (PS_stringwidth(psdoc, "send", this->psfont, 
this->fontsize/2) / 2); 180 | PS_show_xy(psdoc, "send", xpos, 181 | (rank+2)*ranksep - cpu*cpusep - ranksep * 0.1 ); 182 | } 183 | } 184 | 185 | void TimelineDrawing::draw_orecv(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b) { 186 | 187 | PS_setcolor(psdoc, "stroke", "rgb", r, g, b, 0.0); 188 | PS_setlinewidth(psdoc, args_info.linethickness_arg+1.0); 189 | PS_moveto(psdoc, 190 | this->leftmargin + start * this->timesep, 191 | (rank+2)*this->ranksep - cpu*this->cpusep); 192 | PS_lineto(psdoc, 193 | this->leftmargin + end * this->timesep, 194 | (rank+2)*this->ranksep - cpu*this->cpusep); 195 | PS_stroke(psdoc); 196 | 197 | this->draw_seperator(rank, cpu, start); 198 | this->draw_seperator(rank, cpu, end); 199 | 200 | if (args_info.descrtext_given) { 201 | PS_setfont(psdoc, this->psfont, this->fontsize/2); 202 | int xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 203 | xpos -= (PS_stringwidth(psdoc, "o", this->psfont, this->fontsize/2) / 2); 204 | PS_show_xy(psdoc, "o", xpos, 205 | (rank+2)*ranksep - cpu*cpusep + ranksep * 0.1 ); 206 | 207 | xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 208 | xpos -= (PS_stringwidth(psdoc, "recv", this->psfont, this->fontsize/2) / 2); 209 | PS_show_xy(psdoc, "recv", xpos, 210 | (rank+2)*ranksep - cpu*cpusep - ranksep * 0.1 ); 211 | } 212 | 213 | } 214 | 215 | void TimelineDrawing::draw_loclop(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b) { 216 | 217 | PS_setcolor(psdoc, "stroke", "rgb", r, g, b, 0.0); 218 | 219 | PS_setlinewidth(psdoc, args_info.linethickness_arg+1.0); 220 | PS_moveto(psdoc, 221 | this->leftmargin + start * this->timesep, 222 | (rank+2)*this->ranksep - cpu*this->cpusep); 223 | PS_lineto(psdoc, 224 | this->leftmargin + end * this->timesep, 225 | (rank+2)*this->ranksep - cpu*this->cpusep); 226 | PS_stroke(psdoc); 227 | 228 | this->draw_seperator(rank, cpu, start); 229 | this->draw_seperator(rank, cpu, end); 230 | 231 | 
if (args_info.descrtext_given) { 232 | PS_setfont(psdoc, this->psfont, this->fontsize/2); 233 | int xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 234 | xpos -= (PS_stringwidth(psdoc, "calc", this->psfont, this->fontsize/2) / 2); 235 | PS_show_xy(psdoc, "calc", xpos, 236 | (rank+2)*ranksep - cpu*cpusep - ranksep * 0.1 ); 237 | } 238 | 239 | 240 | } 241 | 242 | void TimelineDrawing::draw_noise(int rank, int cpu, uint64_t start, uint64_t end, float r, float g, float b) { 243 | 244 | PS_setcolor(psdoc, "stroke", "rgb", r, g, b, 0.0); 245 | 246 | PS_setlinewidth(psdoc, args_info.linethickness_arg+1.0); 247 | PS_moveto(psdoc, 248 | this->leftmargin + start * this->timesep, 249 | (rank+2)*this->ranksep - cpu*this->cpusep); 250 | PS_lineto(psdoc, 251 | this->leftmargin + end * this->timesep, 252 | (rank+2)*this->ranksep - cpu*this->cpusep); 253 | PS_stroke(psdoc); 254 | 255 | this->draw_seperator(rank, cpu, start); 256 | this->draw_seperator(rank, cpu, end); 257 | 258 | if (args_info.descrtext_given) { 259 | PS_setfont(psdoc, this->psfont, this->fontsize/2); 260 | int xpos = this->leftmargin + (((end-start)/2 + start) * this->timesep); 261 | xpos -= (PS_stringwidth(psdoc, "calc", this->psfont, this->fontsize/2) / 2); 262 | PS_show_xy(psdoc, "calc", xpos, 263 | (rank+2)*ranksep - cpu*cpusep - ranksep * 0.1 ); 264 | } 265 | 266 | } 267 | 268 | void TimelineDrawing::draw_transmission(int source, int dest, uint64_t starttime, uint64_t endtime, int size, int G, float r, float g, float b) { 269 | 270 | PS_setcolor(psdoc, "stroke", "rgb", r, g, b, 0.0); 271 | PS_setlinewidth(psdoc, args_info.linethickness_arg); 272 | 273 | for (int i = 0; i <= size-1; i++) { 274 | PS_setdash(psdoc, 2.0, 2.0); 275 | PS_moveto(psdoc, this->leftmargin + (starttime + i * G) * this->timesep, (source+2)*ranksep); 276 | 277 | // store coordinates for drawing the arrowheads 278 | int sx = this->leftmargin + (starttime + i * G) * this->timesep; 279 | int sy = (source+2)*ranksep; 280 
| 281 | // the behaviour of the sim changed! oldsin: transmission "ends" with last byte, 282 | // newsim: transmission ends with first, so orecv can start earlier 283 | int L = endtime - starttime;// - (size-1)*G; 284 | //assert(L > 0); 285 | PS_lineto(psdoc, this->leftmargin + ((starttime + i * G) + L) * this->timesep, (dest+2)*ranksep); 286 | 287 | // store coordinates for drawing the arrowheads 288 | int dx = this->leftmargin + ((starttime + i * G) + L) * this->timesep; 289 | int dy = (dest+2)*ranksep; 290 | 291 | PS_stroke(psdoc); 292 | 293 | if (args_info.arrowheads_given) { 294 | // draw arrowhead 295 | int x1, y1, x2, y2; 296 | calc_arrowhead_coords(sx, sy, dx, dy, &x1, &y1, &x2, &y2); 297 | PS_setdash(psdoc, 0.0, 0.0); 298 | PS_moveto(psdoc, dx, dy); 299 | PS_lineto(psdoc, x1, y1); 300 | PS_stroke(psdoc); 301 | PS_moveto(psdoc, dx, dy); 302 | PS_lineto(psdoc, x2, y2); 303 | PS_stroke(psdoc); 304 | } 305 | } 306 | } 307 | 308 | void TimelineDrawing::add_osend(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b) { 309 | 310 | overh os; 311 | os.type = 1; 312 | os.rank = rank; 313 | os.cpu = cpu; 314 | os.start = start; 315 | os.end = end; 316 | os.r = r; 317 | os.g = g; 318 | os.b = b; 319 | 320 | this->overheads.push_back(os); 321 | 322 | } 323 | 324 | void TimelineDrawing::add_orecv(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b) { 325 | 326 | overh orecv; 327 | orecv.type = 2; 328 | orecv.rank = rank; 329 | orecv.cpu = cpu; 330 | orecv.start = start; 331 | orecv.end = end; 332 | orecv.r = r; 333 | orecv.g = g; 334 | orecv.b = b; 335 | 336 | this->overheads.push_back(orecv); 337 | 338 | } 339 | 340 | void TimelineDrawing::add_loclop(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b) { 341 | 342 | overh lop; 343 | lop.type = 3; 344 | lop.rank = rank; 345 | lop.cpu = cpu; 346 | lop.start = start; 347 | lop.end = end; 348 | lop.r = r; 349 | lop.g = g; 350 | lop.b = b; 351 | 352 | 
this->overheads.push_back(lop); 353 | 354 | } 355 | 356 | void TimelineDrawing::add_noise(int rank, uint64_t start, uint64_t end, int cpu, float r, float g, float b) { 357 | 358 | overh noise; 359 | noise.type = 4; 360 | noise.rank = rank; 361 | noise.cpu = cpu; 362 | noise.start = start; 363 | noise.end = end; 364 | noise.r = r; 365 | noise.g = g; 366 | noise.b = b; 367 | 368 | this->overheads.push_back(noise); 369 | 370 | } 371 | 372 | void TimelineDrawing::add_transmission(int source, int dest, uint64_t starttime, uint64_t endtime, int size, int G, float r, float g, float b) { 373 | 374 | trans tm; 375 | tm.source = source; 376 | tm.dest = dest; 377 | tm.starttime = starttime; 378 | tm.endtime = endtime; 379 | tm.size = size; 380 | tm.G = G; 381 | tm.r = r; 382 | tm.g = g; 383 | tm.b = b; 384 | 385 | this->transmissions.push_back(tm); 386 | 387 | std::stringstream os; 388 | os << "transmission " << source << " " << dest << " " << starttime << " "; 389 | os << endtime << " " << G << ";\n"; 390 | this->content.append(os.str()); 391 | } 392 | 393 | void TimelineDrawing::calc_arrowhead_coords(int sx, int sy, int dx, int dy, int *x1, int *y1, int *x2, int *y2) { 394 | 395 | double pi = 3.141592; 396 | double angle = atan2 (dy - sy, dx - sx) + pi; 397 | double arrowlength = 6*args_info.linethickness_arg; 398 | 399 | *x1 = dx + arrowlength * cos(angle - pi/12); 400 | *y1 = dy + arrowlength * sin(angle - pi/12); 401 | *x2 = dx + arrowlength * cos(angle + pi/12); 402 | *y2 = dy + arrowlength * sin(angle + pi/12); 403 | 404 | } 405 | 406 | -------------------------------------------------------------------------------- /src/liballprof2/gensem.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import re 4 | import clang.cindex 5 | import argparse 6 | from collections import defaultdict 7 | import yaml 8 | 9 | class AllprofCodegen: 10 | 11 | def __init__(self, libclang_path): 12 | self.libclang_path=libclang_path 13 | self.nodes = [] 14 | self.semantics = {} 15 | self.types = defaultdict(list) 16 | self.BLACKLISTED_FUNCTIONS = [ 17 | 'MPI_Comm_c2f', # this might be a macro i.e., in mpich 18 | 'MPI_Comm_f2c', # this might be a macro i.e., in mpich 19 | 'MPI_Group_f2c', # this might be a macro i.e., in mpich 20 | 'MPI_Group_c2f', # this might be a macro i.e., in mpich 21 | 'MPI_Win_f2c', # this might be a macro i.e., in mpich 22 | 'MPI_Win_c2f', # this might be a macro i.e., in mpich 23 | 'MPI_Type_f2c', # this might be a macro i.e., in mpich 24 | 'MPI_Type_c2f', # this might be a macro i.e., in mpich 25 | 'MPI_Errhandler_f2c', # this might be a macro i.e., in mpich 26 | 'MPI_Errhandler_c2f', # this might be a macro i.e., in mpich 27 | 'MPI_Request_f2c', # this might be a macro i.e., in mpich 28 | 'MPI_Request_c2f', # this might be a macro i.e., in mpich 29 | 'MPI_File_f2c', # this might be a macro i.e., in mpich 30 | 'MPI_File_c2f', # this might be a macro i.e., in mpich 31 | 'MPI_Info_f2c', # this might be a macro i.e., in mpich 32 | 'MPI_Info_c2f', # this might be a macro i.e., in mpich 33 | 'MPI_Message_f2c', # this might be a macro i.e., in mpich 34 | 'MPI_Message_c2f', # this might be a macro i.e., in mpich 35 | 'MPI_Op_f2c', # this might be a macro i.e., in mpich 36 | 'MPI_Op_c2f', # this might be a macro i.e., in mpich 37 | ] 38 | 39 | def get_count_for_param_in_func(self, param, func): 40 | # TODO minimize this 41 | mapping = {} 42 | GET_NDIMS_CART_COMM = "int ndims; PMPI_Cartdim_get(comm, &ndims);" 43 | GET_COMM_SIZE = "int rank, size; PMPI_Comm_size(comm, &size); PMPI_Comm_rank(comm, &rank);" 44 | GET_NEIGH_GRAPH_COMM = "int ideg, odeg, wted; PMPI_Dist_graph_neighbors_count(comm, &ideg, &odeg, &wted);" 45 | 
mapping[("MPI_Cart_create", "dims")] = "ndims" 46 | mapping[("MPI_Cart_create", "periods")] = "ndims" 47 | mapping[("MPI_Cart_map", "dims")] = "ndims" 48 | mapping[("MPI_Cart_map", "periods")] = "ndims" 49 | mapping[("MPI_Cart_rank", "coords")] = (GET_NDIMS_CART_COMM, "ndims") 50 | mapping[("MPI_Cart_sub", "remain_dims")] = (GET_NDIMS_CART_COMM, "ndims") 51 | mapping[("MPI_Dist_graph_create", "nodes")] = "n" 52 | mapping[("MPI_Dist_graph_create", "degrees")] = "n" 53 | mapping[("MPI_Dist_graph_create", "targets")] = "n" 54 | mapping[("MPI_Dist_graph_create", "weights")] = "n" 55 | mapping[("MPI_Dist_graph_create_adjacent", "sources")] = "indegree" 56 | mapping[("MPI_Dist_graph_create_adjacent", "sourceweights")] = "indegree" 57 | mapping[("MPI_Dist_graph_create_adjacent", "destinations")] = "outdegree" 58 | mapping[("MPI_Dist_graph_create_adjacent", "destweights")] = "outdegree" 59 | mapping[("MPI_Comm_spawn_multiple", "array_of_maxprocs")] = "count" 60 | mapping[("MPI_Graph_create", "index")] = "nnodes" 61 | mapping[("MPI_Graph_create", "edges")] = "index[nnodes-1]" 62 | mapping[("MPI_Graph_map", "index")] = "nnodes" 63 | mapping[("MPI_Graph_map", "edges")] = "index[nnodes-1]" 64 | mapping[("MPI_Group_excl", "ranks")] = "n" 65 | mapping[("MPI_Group_incl", "ranks")] = "n" 66 | mapping[("MPI_Group_translate_ranks", "ranks1")] = "n" 67 | mapping[("MPI_Type_create_darray", "gsize_array")] = "ndims" 68 | mapping[("MPI_Type_create_darray", "distrib_array")] = "ndims" 69 | mapping[("MPI_Type_create_darray", "darg_array")] = "ndims" 70 | mapping[("MPI_Type_create_darray", "psize_array")] = "ndims" 71 | mapping[("MPI_Type_create_hindexed", "array_of_blocklengths")] = "count" 72 | mapping[("MPI_Type_create_indexed_block", "array_of_displacements")] = "count" 73 | mapping[("MPI_Type_create_struct", "array_of_block_lengths")] = "count" 74 | mapping[("MPI_Type_create_subarray", "size_array")] = "ndims" 75 | mapping[("MPI_Type_create_subarray", "subsize_array")] = "ndims" 76 | 
mapping[("MPI_Type_create_subarray", "start_array")] = "ndims" 77 | mapping[("MPI_Type_indexed", "array_of_blocklengths")] = "count" 78 | mapping[("MPI_Type_indexed", "array_of_displacements")] = "count" 79 | mapping[("MPI_Allgatherv", "recvcounts")] = (GET_COMM_SIZE, "size") 80 | mapping[("MPI_Allgatherv", "displs")] = (GET_COMM_SIZE, "size") 81 | mapping[("MPI_Iallgatherv", "recvcounts")] = (GET_COMM_SIZE, "size") 82 | mapping[("MPI_Iallgatherv", "displs")] = (GET_COMM_SIZE, "size") 83 | mapping[("MPI_Alltoallv", "sendcounts")] = (GET_COMM_SIZE, "size") 84 | mapping[("MPI_Alltoallv", "sdispls")] = (GET_COMM_SIZE, "size") 85 | mapping[("MPI_Alltoallv", "recvcounts")] = (GET_COMM_SIZE, "size") 86 | mapping[("MPI_Alltoallv", "rdispls")] = (GET_COMM_SIZE, "size") 87 | mapping[("MPI_Ialltoallv", "sendcounts")] = (GET_COMM_SIZE, "size") 88 | mapping[("MPI_Ialltoallv", "sdispls")] = (GET_COMM_SIZE, "size") 89 | mapping[("MPI_Ialltoallv", "recvcounts")] = (GET_COMM_SIZE, "size") 90 | mapping[("MPI_Ialltoallv", "rdispls")] = (GET_COMM_SIZE, "size") 91 | mapping[("MPI_Alltoallw", "sendcounts")] = (GET_COMM_SIZE, "size") 92 | mapping[("MPI_Alltoallw", "sdispls")] = (GET_COMM_SIZE, "size") 93 | mapping[("MPI_Alltoallw", "sendtypes")] = (GET_COMM_SIZE, "size") 94 | mapping[("MPI_Alltoallw", "recvcounts")] = (GET_COMM_SIZE, "size") 95 | mapping[("MPI_Alltoallw", "rdispls")] = (GET_COMM_SIZE, "size") 96 | mapping[("MPI_Alltoallw", "recvtypes")] = (GET_COMM_SIZE, "size") 97 | mapping[("MPI_Ialltoallw", "sendcounts")] = (GET_COMM_SIZE, "size") 98 | mapping[("MPI_Ialltoallw", "sdispls")] = (GET_COMM_SIZE, "size") 99 | mapping[("MPI_Ialltoallw", "sendtypes")] = (GET_COMM_SIZE, "size") 100 | mapping[("MPI_Ialltoallw", "recvcounts")] = (GET_COMM_SIZE, "size") 101 | mapping[("MPI_Ialltoallw", "rdispls")] = (GET_COMM_SIZE, "size") 102 | mapping[("MPI_Ialltoallw", "recvtypes")] = (GET_COMM_SIZE, "size") 103 | mapping[("MPI_Cart_coords", "coords")] = "maxdims" 104 | 
mapping[("MPI_Cart_get", "dims")] = "maxdims" 105 | mapping[("MPI_Cart_get", "periods")] = "maxdims" 106 | mapping[("MPI_Cart_get", "coords")] = "maxdims" 107 | mapping[("MPI_Dist_graph_neighbors", "sources")] = "maxindegree" 108 | mapping[("MPI_Dist_graph_neighbors", "sourceweights")] = "maxindegree" 109 | mapping[("MPI_Dist_graph_neighbors", "destinations")] = "maxoutdegree" 110 | mapping[("MPI_Dist_graph_neighbors", "destweights")] = "maxoutdegree" 111 | mapping[("MPI_Comm_spawn", "argv")] = None 112 | mapping[("MPI_Comm_spawn", "array_of_errcodes")] = "maxprocs" 113 | mapping[("MPI_Comm_spawn_multiple", "array_of_commands")] = "count" # only at root 114 | mapping[("MPI_Comm_spawn_multiple", "array_of_argv")] = "count" # only at root 115 | mapping[("MPI_Comm_spawn_multiple", "array_of_info")] = "count" # only at root 116 | mapping[("MPI_Comm_spawn_multiple", "array_of_errcodes")] = "count" # only at root 117 | mapping[("MPI_Dims_create", "dims")] = "ndims" 118 | mapping[("MPI_Gatherv", "recvcounts")] = (GET_COMM_SIZE, "size") 119 | mapping[("MPI_Gatherv", "displs")] = (GET_COMM_SIZE, "size") 120 | mapping[("MPI_Igatherv", "recvcounts")] = (GET_COMM_SIZE, "size") 121 | mapping[("MPI_Igatherv", "displs")] = (GET_COMM_SIZE, "size") 122 | mapping[("MPI_Graph_get", "index")] = "maxindex" 123 | mapping[("MPI_Graph_get", "edges")] = "maxedges" 124 | mapping[("MPI_Graph_neighbors", "neighbors")] = "maxneighbors" 125 | mapping[("MPI_Group_range_excl", "ranges")] = "n" 126 | mapping[("MPI_Group_range_incl", "ranges")] = "n" 127 | mapping[("MPI_Group_translate_ranks", "ranks2")] = "n" 128 | mapping[("MPI_Neighbor_allgatherv", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 129 | mapping[("MPI_Neighbor_allgatherv", "displs")] = (GET_NEIGH_GRAPH_COMM, "ideg") 130 | mapping[("MPI_Ineighbor_allgatherv", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 131 | mapping[("MPI_Ineighbor_allgatherv", "displs")] = (GET_NEIGH_GRAPH_COMM, "ideg") 132 | mapping[("MPI_Neighbor_alltoallv", 
"sendcounts")] = (GET_NEIGH_GRAPH_COMM, "odeg") 133 | mapping[("MPI_Neighbor_alltoallv", "sdispls")] = (GET_NEIGH_GRAPH_COMM, "odeg") 134 | mapping[("MPI_Neighbor_alltoallv", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 135 | mapping[("MPI_Neighbor_alltoallv", "rdispls")] = (GET_NEIGH_GRAPH_COMM, "ideg") 136 | mapping[("MPI_Ineighbor_alltoallv", "sendcounts")] = (GET_NEIGH_GRAPH_COMM, "odeg") 137 | mapping[("MPI_Ineighbor_alltoallv", "sdispls")] = (GET_NEIGH_GRAPH_COMM, "odeg") 138 | mapping[("MPI_Ineighbor_alltoallv", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 139 | mapping[("MPI_Ineighbor_alltoallv", "rdispls")] = (GET_NEIGH_GRAPH_COMM, "ideg") 140 | mapping[("MPI_Neighbor_alltoallw", "sendcounts")] = (GET_NEIGH_GRAPH_COMM, "odeg") 141 | mapping[("MPI_Neighbor_alltoallw", "sdispls")] = (GET_NEIGH_GRAPH_COMM, "odeg") 142 | mapping[("MPI_Neighbor_alltoallw", "sendtypes")] = (GET_NEIGH_GRAPH_COMM, "odeg") 143 | mapping[("MPI_Neighbor_alltoallw", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 144 | mapping[("MPI_Neighbor_alltoallw", "rdispls")] = (GET_NEIGH_GRAPH_COMM, "ideg") 145 | mapping[("MPI_Neighbor_alltoallw", "recvtypes")] = (GET_NEIGH_GRAPH_COMM, "ideg") 146 | mapping[("MPI_Ineighbor_alltoallw", "sendcounts")] = (GET_NEIGH_GRAPH_COMM, "odeg") 147 | mapping[("MPI_Ineighbor_alltoallw", "sdispls")] = (GET_NEIGH_GRAPH_COMM, "odeg") 148 | mapping[("MPI_Ineighbor_alltoallw", "sendtypes")] = (GET_NEIGH_GRAPH_COMM, "odeg") 149 | mapping[("MPI_Ineighbor_alltoallw", "recvcounts")] = (GET_NEIGH_GRAPH_COMM, "ideg") 150 | mapping[("MPI_Ineighbor_alltoallw", "rdispls")] = (GET_NEIGH_GRAPH_COMM, "ideg") 151 | mapping[("MPI_Ineighbor_alltoallw", "recvtypes")] = (GET_NEIGH_GRAPH_COMM, "ideg") 152 | mapping[("MPI_Pack_external", "datarep")] = "strlen(datarep)" 153 | mapping[("MPI_Pack_external_size", "datarep")] = "strlen(datarep)" 154 | mapping[("MPI_Reduce_scatter", "recvcounts")] = (GET_COMM_SIZE, "size") 155 | mapping[("MPI_Ireduce_scatter", "recvcounts")] = 
(GET_COMM_SIZE, "size") 156 | mapping[("MPI_Scatterv", "sendcounts")] = (GET_COMM_SIZE, "(rank==root ? size : 0)") 157 | mapping[("MPI_Scatterv", "displs")] = (GET_COMM_SIZE, "(rank==root ? size : 0)") 158 | mapping[("MPI_Iscatterv", "sendcounts")] = (GET_COMM_SIZE, "(rank == root ? size : 0)") 159 | mapping[("MPI_Iscatterv", "displs")] = (GET_COMM_SIZE, "(rank == root ? size : 0)") 160 | mapping[("MPI_Startall", "array_of_requests")] = "count" 161 | mapping[("MPI_Testall", "array_of_requests")] = "count" 162 | mapping[("MPI_Testall", "array_of_statuses")] = "count" 163 | mapping[("MPI_Testany", "array_of_requests")] = "count" 164 | mapping[("MPI_Testsome", "array_of_requests")] = "incount" 165 | mapping[("MPI_Testsome", "array_of_indices")] = "*outcount" 166 | mapping[("MPI_Testsome", "array_of_statuses")] = "*outcount" 167 | mapping[("MPI_Type_create_hindexed_block", "array_of_displacements")] = "count" 168 | mapping[("MPI_Type_create_hindexed", "array_of_displacements")] = "count" 169 | mapping[("MPI_Type_create_struct", "array_of_displacements")] = "count" 170 | mapping[("MPI_Type_create_struct", "array_of_types")] = "count" 171 | mapping[("MPI_Type_get_contents", "array_of_integers")] = "max_integers" 172 | mapping[("MPI_Type_get_contents", "array_of_addresses")] = "max_addresses" 173 | mapping[("MPI_Type_get_contents", "array_of_datatypes")] = "max_datatypes" 174 | mapping[("MPI_Unpack_external", "datarep")] = "strlen(datarep)" 175 | mapping[("MPI_Waitall", "array_of_requests")] = "(array_of_statuses != MPI_STATUSES_IGNORE ? count : 0)" 176 | mapping[("MPI_Waitany", "array_of_requests")] = "count" 177 | mapping[("MPI_Waitsome", "array_of_requests")] = "incount" 178 | mapping[("MPI_Waitsome", "array_of_indices")] = "*outcount" 179 | mapping[("MPI_Waitsome", "array_of_statuses")] = "(array_of_statuses != MPI_STATUSES_IGNORE ? 
*outcount : 0)" 180 | if (func, param) not in mapping: 181 | print(f"Did not find mapping[(\"{func}\", \"{param}\")] = \"\"") 182 | return None 183 | else: 184 | r = mapping[(func, param)] 185 | if type(r) is tuple: 186 | return r 187 | else: 188 | return (None, r) 189 | 190 | def traverse_ast(self, node, depth=0, print_ast=False): 191 | if print_ast: 192 | print(' ' * depth + f'{node.kind} ({node.displayname})') 193 | if node.kind is clang.cindex.CursorKind.FUNCTION_DECL and re.match("MPI_.*", node.displayname) : 194 | self.nodes += [node] 195 | for child in node.get_children(): 196 | self.traverse_ast(child, depth + 1, print_ast) 197 | 198 | 199 | def semnatics_for_func(self, node): 200 | function_name = node.spelling 201 | return_type = node.result_type.spelling 202 | if function_name.startswith("MPI_T_") or (function_name in self.BLACKLISTED_FUNCTIONS): 203 | return 204 | self.semantics[function_name] = {} 205 | self.semantics[function_name]['return_type'] = return_type 206 | self.semantics[function_name]['params'] = [] 207 | 208 | for param_cursor in node.get_children(): 209 | param_dict = {} 210 | if param_cursor.kind != clang.cindex.CursorKind.PARM_DECL: 211 | continue 212 | param_type = param_cursor.type.spelling 213 | param_name = param_cursor.spelling 214 | param_dict['name'] = param_name 215 | param_dict['type'] = param_type 216 | if "[]" in param_type: 217 | prolog, varname = self.get_count_for_param_in_func(param=param_name, func=function_name) 218 | param_dict['elem_count'] = varname 219 | param_dict['prolog_elem_count'] = prolog 220 | param_dict['trace_each_elem'] = True 221 | self.semantics[function_name]['params'].append(param_dict) 222 | 223 | # clang doesn't work with varargs??? 224 | if function_name == "MPI_Pcontrol": 225 | param_dict = {} 226 | param_dict['name'] = "..." 
227 | param_dict['type'] = "" 228 | self.semantics[function_name]['params'].append(param_dict) 229 | 230 | def process_func(self, node, mode): 231 | if mode == 'semantics': 232 | self.semnatics_for_func(node) 233 | 234 | 235 | def process_header(self, filename, mode): 236 | clang.cindex.Config.set_library_path(self.libclang_path) 237 | index = clang.cindex.Index.create() 238 | translation_unit = index.parse(filename) 239 | if not translation_unit: 240 | print("Error parsing the file.") 241 | return 242 | root_cursor = translation_unit.cursor 243 | self.traverse_ast(root_cursor) 244 | for node in self.nodes: 245 | self.process_func(node, mode) 246 | 247 | 248 | 249 | if __name__ == "__main__": 250 | parser = argparse.ArgumentParser( 251 | prog='liballprof_gencode', 252 | description='Generates wrappers for the MPI functions present in the supplied MPI header file. The wrappers output in liballprof2 trace format.', 253 | epilog='') 254 | parser.add_argument('-m', '--mpi-header', default="mpi.h", help="MPI header file to use as input (default: mpi.h)") 255 | parser.add_argument('-s', '--semantics-file', default='mpi_sem.yml', help="Name of the file that specifies the tracer semantics (default: mpi-sem.yml)") 256 | parser.add_argument('-l', '--libclang-path', default="", help="Path to libclang, if empty let clang python module guess. (default=\"\")") 257 | args = parser.parse_args() 258 | 259 | codegen = AllprofCodegen(libclang_path=args.libclang_path) 260 | codegen.outfile = open(args.semantics_file, "w") 261 | codegen.process_header(args.mpi_header, mode='semantics') 262 | codegen.outfile.write(yaml.dump(codegen.semantics)) 263 | codegen.outfile.close() 264 | --------------------------------------------------------------------------------