├── .gitignore ├── LICENSE ├── Makefile.am ├── NOTICE ├── README.md ├── affinity ├── README.md ├── affinity.h ├── cpu.c ├── gpu.cu ├── makefile.mk ├── mpi+omp.c ├── mpi.c └── omp.c ├── aux ├── deepclean.sh └── load_env.sh ├── bootstrap ├── config └── tap-driver.py ├── configure.ac ├── doc ├── README.md └── mpibind.bib ├── etc ├── Makefile.am └── mpibind.pc.in ├── flux ├── Makefile.am ├── README.md ├── mpibind-flux-ex1.lua ├── mpibind-flux-ex2.lua ├── mpibind-flux.lua.in ├── options.md └── plugin.c ├── gpu-tests ├── makefile.mk ├── orig.mk ├── retrieve.cu ├── simple.cpp ├── visdevs-hwloc.cu └── visdevs.cu ├── python ├── Makefile.am ├── README.md ├── mpibind.py.in ├── mpibind_map.py ├── setup.py ├── test-mpi.py └── test-simple.py ├── slurm ├── Makefile.am ├── README.md └── plugin.c ├── src ├── Makefile.am ├── dev_tests.c ├── hwloc_tests.c ├── hwloc_utils.c ├── hwloc_utils.h ├── internals.c ├── main.c ├── manual.mk ├── mpibind-priv.h ├── mpibind.c ├── mpibind.h └── simple.mk ├── test-suite ├── Makefile.am ├── README.md ├── coral-ea.c ├── coral-lassen.c ├── cts1-quartz.c ├── environment.c ├── epyc-corona.c ├── error.c ├── expected │ ├── expected.coral-ea │ ├── expected.coral-lassen │ ├── expected.coral-lassen.v1 │ ├── expected.cts1-quartz │ ├── expected.epyc-corona │ └── expected.epyc-corona.v1 ├── python │ ├── py-coral-ea.py │ ├── py-coral-lassen.py │ ├── py-cts1-quartz.py │ ├── py-epyc-corona.py │ └── test_utils.py ├── test_utils.c └── test_utils.h ├── topo-xml ├── arm64-ulna-hwloc1.xml ├── cascade-lake-ap-snl-hwloc1.xml ├── coral-butte-hwloc1.xml ├── coral-ea-hwloc1.xml ├── coral-lassen-hwloc1.xml ├── coral-lassen.xml ├── coral-rzansel-hwloc1.xml ├── cts1-pascal.xml ├── cts1-quartz-smt1.xml ├── eas-tioga.xml ├── epyc-corona-hwloc1.xml ├── epyc-corona-p2.xml ├── epyc-corona.xml ├── epyc-dual-sock-hwloc1.xml ├── g4dnmetal.xml ├── knl-quad-cache-hwloc1.xml ├── knl-quad-flat-hwloc1.xml ├── knl-snc4-cache-hwloc1.xml └── knl-snc4-flat-hwloc1.xml └── tutorials ├── common └── archs.md ├── cug23 ├── README.md ├── module1.md ├── module2.md └── module3.md ├── cug24 ├── README.md ├── archs.md ├── module1.md ├── module2.md ├── module3.md └── module4.md ├── eurosys25 ├── README.md ├── module1.md ├── module2.md └── module3.md ├── figures ├── aws-architecture.png ├── aws-g4dn-metal.png ├── cache.png ├── computing-architecture.png ├── corona-merge.png ├── corona-no-cache-io-physical.png ├── corona-no-cache-io.png ├── corona-physical.png ├── corona-web.png ├── corona.pdf ├── corona.png ├── hwloc-objects.png ├── lassen-web.png ├── lassen.pdf ├── lassen.png ├── mammoth.pdf ├── mammoth.png ├── module4_sockets.png ├── module4_specifyplacement.png ├── module4_spread.png ├── module4_threadstocores.png ├── module4_threadstocpus.png ├── module4_threadstosockets.png ├── module5_tioga.png ├── numa.png ├── pascal-web.png ├── pascal.pdf ├── pascal.png ├── poodle-web.png ├── ruby.pdf ├── ruby.png ├── rzadams-web.png ├── rzadams │ ├── FigureA.png │ ├── FigureB.png │ ├── FigureC.png │ ├── FigureD.png │ ├── FigureE.png │ ├── FigureF.png │ ├── FigureG.png │ ├── FigureH.png │ ├── FigureI.png │ └── FigureJ.png ├── sierra.pdf ├── sierra.png ├── tioga-web.png ├── tioga.pdf ├── tioga.png └── tioga │ ├── Tioga-Mod1-Ex5.png │ ├── Tioga-Mod1-Ex6.png │ ├── Tioga-Mod1-Ex7.png │ ├── Tioga-Mod1-noprocs.png │ ├── Tioga-Mod2-Ex1.png │ ├── Tioga-Mod2-Ex10.png │ ├── Tioga-Mod2-Ex11.png │ ├── Tioga-Mod2-Ex2.png │ ├── Tioga-Mod2-Ex4.png │ ├── Tioga-Mod2-Ex5.png │ ├── Tioga-Mod2-Ex6.png │ ├── Tioga-Mod2-Ex7.png │ ├── Tioga-Mod2-Ex8.png │ ├── 
Tioga-Mod2-Ex9.png │ ├── Tioga-Mod3-Ex1a.png │ ├── Tioga-Mod3-Ex1b.png │ ├── Tioga-Mod3-Ex2a.png │ ├── Tioga-Mod3-Ex2b.png │ ├── Tioga-Mod3-Ex3a.png │ ├── Tioga-Mod3-Ex3b.png │ ├── Tioga-Mod3-Ex5a.png │ ├── Tioga-Mod3-Ex5b.png │ ├── Tioga-Mod3-Ex6.png │ ├── figureA.png │ ├── figureB.png │ ├── figureC.png │ ├── figureD.png │ ├── figureE.png │ ├── figureF.png │ ├── figureG.png │ ├── figureH.png │ ├── figureI.png │ ├── figureJ.png │ ├── figureK.png │ ├── figureL.png │ ├── figureM.png │ ├── figureN.png │ ├── tioga-merge.png │ ├── tioga-no-cache-io-physical.png │ └── tioga-no-cache-io.png ├── flux ├── README.md ├── module1.md └── module2.md ├── lanl22 └── README.md ├── main ├── README.md ├── module1.md ├── module2.md ├── module3.md ├── module4.md └── module5.md └── tapia22 ├── README.md ├── Sep-09-1045-Supercomputing-Systems-101.pdf └── tapia-setup-instructions.md /.gitignore: -------------------------------------------------------------------------------- 1 | # http://www.gnu.org/software/automake 2 | Makefile.in 3 | # http://www.gnu.org/software/autoconf 4 | autom4te.cache 5 | compile 6 | configure 7 | aclocal.m4 8 | stamp-h1 9 | aclocal.m4 10 | config.guess 11 | config.sub 12 | depcomp 13 | install-sh 14 | ltmain.sh 15 | missing 16 | config.log 17 | config.status 18 | config.h 19 | config.h.in 20 | config.h.in~ 21 | libtool 22 | .deps/ 23 | .libs/ 24 | libltdl/ 25 | 26 | # libtool pull-ins 27 | /config/libtool.m4 28 | /config/ltoptions.m4 29 | /config/ltsugar.m4 30 | /config/ltversion.m4 31 | /config/lt~obsolete.m4 32 | /config/ar-lib 33 | /config/tap-driver.sh 34 | 35 | # docs intermediate files 36 | /doc/man*/*.xml 37 | /doc/_build 38 | 39 | # Object files 40 | *.o 41 | *.ko 42 | *.obj 43 | *.elf 44 | # Libraries 45 | *.lib 46 | *.a 47 | *.la 48 | *.lo 49 | # Shared objects (inc. Windows DLLs) 50 | *.dll 51 | *.so 52 | *.so.* 53 | *.dylib 54 | # Executables 55 | *.exe 56 | *.out 57 | *.app 58 | *.i*86 59 | *.x86_64 60 | *.hex 61 | *.pyc 62 | *.pyo 63 | # gcov output 64 | *.gcno 65 | *.gcda 66 | # Test files 67 | *.t 68 | 69 | # autoconf-preprocessed 70 | Makefile 71 | *.1 72 | *.3 73 | *.5 74 | *.7 75 | *.8 76 | *.spec 77 | *.pc 78 | 79 | # misc 80 | *.swp 81 | *.diff 82 | *.tar.gz 83 | *.orig 84 | *.core 85 | *.tap 86 | .coverage* 87 | *.trs 88 | *.log 89 | .dirstamp 90 | 91 | # ignore mypy generated cache directory 92 | .mypy_cache 93 | 94 | # ignore local, maybe generated tooling files 95 | compile_commands.json 96 | compile_flags.txt 97 | 98 | # local editor config dirs 99 | .vscode 100 | .idea 101 | .clangd 102 | 103 | # Python virtual environments 104 | .venv*/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Lawrence Livermore National Security, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | .NOTPARALLEL: 2 | 3 | SUBDIRS = src test-suite python flux slurm etc 4 | 5 | ACLOCAL_AMFLAGS = -I config 6 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This work was produced under the auspices of the U.S. Department of 2 | Energy by Lawrence Livermore National Laboratory under Contract 3 | DE-AC52-07NA27344. 4 | 5 | This work was prepared as an account of work sponsored by an agency of 6 | the United States Government. Neither the United States Government nor 7 | Lawrence Livermore National Security, LLC, nor any of their employees 8 | makes any warranty, expressed or implied, or assumes any legal liability 9 | or responsibility for the accuracy, completeness, or usefulness of any 10 | information, apparatus, product, or process disclosed, or represents that 11 | its use would not infringe privately owned rights. 12 | 13 | Reference herein to any specific commercial product, process, or service 14 | by trade name, trademark, manufacturer, or otherwise does not necessarily 15 | constitute or imply its endorsement, recommendation, or favoring by the 16 | United States Government or Lawrence Livermore National Security, LLC. 17 | 18 | The views and opinions of authors expressed herein do not necessarily 19 | state or reflect those of the United States Government or Lawrence 20 | Livermore National Security, LLC, and shall not be used for advertising 21 | or product endorsement purposes. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## A Memory-Driven Mapping Algorithm for Heterogeneous Systems 2 | 3 | `mpibind` is a memory-driven algorithm to map parallel hybrid 4 | applications to the underlying hardware resources transparently, 5 | efficiently, and portably. Unlike other mappings, its primary design point 6 | is the memory system, including the cache hierarchy. Compute elements 7 | are selected based on a memory mapping and not vice versa. In 8 | addition, mpibind embodies a global awareness of hybrid programming 9 | abstractions as well as heterogeneous systems with accelerators. 10 | 11 | ### Getting started 12 | 13 | The easiest way to get `mpibind` is using 14 | [spack](https://github.com/spack/spack). 15 | 16 | ``` 17 | spack install mpibind 18 | 19 | # On systems with NVIDIA GPUs 20 | spack install mpibind+cuda 21 | 22 | # On systems with AMD GPUs 23 | spack install mpibind+rocm 24 | 25 | # More details 26 | spack info mpibind 27 | ``` 28 | 29 | Alternatively, one can build the package manually as described below. 30 | 31 | ### Building and installing 32 | 33 | This project uses GNU Autotools. 
34 | 35 | ``` 36 | $ ./bootstrap 37 | 38 | $ ./configure --prefix= 39 | 40 | $ make 41 | 42 | $ make install 43 | ``` 44 | 45 | If building from a release tarball, please specify MPIBIND_VERSION appropriately. For example: 46 | 47 | ``` 48 | $ MPIBIND_VERSION=0.15.1 ./bootstrap 49 | 50 | $ ./configure --prefix= 51 | 52 | $ make 53 | 54 | $ make install 55 | ``` 56 | 57 | 58 | The resulting library is `/lib/libmpibind` and a simple program using it is `src/main.c` 59 | 60 | 61 | ### Test suite 62 | 63 | ``` 64 | $ make check 65 | ``` 66 | 67 | ### Dependencies 68 | 69 | * `GNU Autotools` is the build system. 70 | 71 | * `hwloc` version 2 is required to detect the machine topology. 72 | 73 | Before building mpibind, make sure `hwloc` can be detected with `pkg-config`: 74 | ``` 75 | pkg-config --variable=libdir --modversion hwloc 76 | ``` 77 | If this fails, add hwloc's pkg-config directory to `PKG_CONFIG_PATH`, e.g., 78 | ``` 79 | export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/lib/pkgconfig 80 | ``` 81 | 82 | * `libtap` is required to build the test suite. 83 | 84 | To verify `tap` can be detected with `pkg-config`, follow a 85 | similar procedure as for `hwloc` above. 86 | 87 | 88 | ### Contributing 89 | 90 | Contributions for bug fixes and new features are welcome and follow 91 | the GitHub 92 | [fork and pull model](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-collaborative-development-models). 93 | Contributors develop on a branch of their personal fork and create 94 | pull requests to merge their changes into the main repository. 95 | 96 | The steps are similar to those of the Flux framework: 97 | 98 | 1. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) `mpibind`. 99 | 2. [Clone](https://help.github.com/en/github/getting-started-with-github/fork-a-repo#keep-your-fork-synced) 100 | your fork: `git clone git@github.com:[username]/mpibind.git` 101 | 3. Create a topic branch for your changes: `git checkout -b new_feature` 102 | 4. Create feature or add fix (and add tests if possible) 103 | 5. Make sure everything still passes: `make check` 104 | 6. Push the branch to your GitHub repo: `git push origin new_feature` 105 | 7. Create a pull request against `mpibind` and describe what your 106 | changes do and why you think it should be merged. List any 107 | outstanding *todo* items. 108 | 109 | 110 | ### Authors 111 | 112 | `mpibind` was created by Edgar A. León. 113 | 114 | ### Citing mpibind 115 | 116 | To reference mpibind, please cite one of the 117 | following papers: 118 | 119 | * Edgar A. León and Matthieu Hautreux. *Achieving Transparency Mapping 120 | Parallel Applications: A Memory Hierarchy Affair*. In International 121 | Symposium on Memory Systems, MEMSYS'18, Washington, DC, 122 | October 2018. ACM. 123 | 124 | * Edgar A. León. *mpibind: A Memory-Centric Affinity Algorithm for 125 | Hybrid Applications*. In International Symposium on Memory Systems, 126 | MEMSYS'17, Washington, DC, October 2017. ACM. 127 | 128 | * Edgar A. León, Ian Karlin, and Adam T. Moody. *System Noise 129 | Revisited: Enabling Application Scalability and Reproducibility with 130 | SMT*. In International Parallel & Distributed Processing Symposium, 131 | IPDPS'16, Chicago, IL, May 2016. IEEE. 132 | 133 | Other references: 134 | 135 | * J. P. Dahm, D. F. Richards, A. Black, A. D. Bertsch, L. Grinberg, I. Karlin, S. Kokkila-Schumacher, E. A. León, R. Neely, R. Pankajakshan, and O. Pearce. *Sierra Center of Excellence: Lessons learned*. 
In IBM Journal of Research and Development, vol. 64, no. 3/4, May-July 2020. 136 | 137 | * Edgar A. León. *Cross-Architecture Affinity of Supercomputers*. In International Supercomputing Conference (Research Poster), ISC’19, Frankfurt, Germany, June 2019. 138 | 139 | * Edgar A. León. *Mapping MPI+X Applications to Multi-GPU 140 | Architectures: A Performance-Portable Approach*. In GPU Technology 141 | Conference, GTC'18, San Jose, CA, March 2018. 142 | 143 | 144 | [Bibtex file](doc/mpibind.bib). 145 | 146 | 147 | ### License 148 | 149 | `mpibind` is distributed under the terms of the MIT license. All new 150 | contributions must be made under this license. 151 | 152 | See [LICENSE](LICENSE) and [NOTICE](NOTICE) for details. 153 | 154 | SPDX-License-Identifier: MIT. 155 | 156 | LLNL-CODE-812647. 157 | -------------------------------------------------------------------------------- /affinity/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Report the mapping of workers to the hardware 3 | 4 | These programs report the mapping of CPUs and GPUs for each process 5 | and thread. There are three variants: 6 | 7 | * MPI: `mpi` 8 | * OpenMP: `omp` 9 | * MPI+OpenMP: `mpi+omp` 10 | 11 | #### Running 12 | 13 | Usage is straightforward. Use the `-v` option for verbose GPU output and 14 | the `-h` option for help. 15 | 16 | ``` 17 | $ srun -n4 ./mpi 18 | node173 Task 0/ 4 running on 4 CPUs: 0,3,6,9 19 | Task 0/ 4 has 2 GPUs: 0x63 0x43 20 | node173 Task 1/ 4 running on 4 CPUs: 12,15,18,21 21 | Task 1/ 4 has 2 GPUs: 0x3 0x27 22 | node173 Task 2/ 4 running on 4 CPUs: 24,27,30,33 23 | Task 2/ 4 has 2 GPUs: 0xe3 0xc3 24 | node173 Task 3/ 4 running on 4 CPUs: 36,39,42,45 25 | Task 3/ 4 has 2 GPUs: 0x83 0xa3 26 | ``` 27 | 28 | ``` 29 | $ srun -n4 ./mpi -v 30 | node173 Task 0/ 4 running on 4 CPUs: 0,3,6,9 31 | Task 0/ 4 has 2 GPUs: 0x63 0x43 32 | Default device: 0x63 33 | 0x63: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 34 | 0x43: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 35 | node173 Task 1/ 4 running on 4 CPUs: 12,15,18,21 36 | Task 1/ 4 has 2 GPUs: 0x3 0x27 37 | Default device: 0x3 38 | 0x03: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 39 | 0x27: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 40 | node173 Task 2/ 4 running on 4 CPUs: 24,27,30,33 41 | Task 2/ 4 has 2 GPUs: 0xe3 0xc3 42 | Default device: 0xe3 43 | 0xe3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 44 | 0xc3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 45 | node173 Task 3/ 4 running on 4 CPUs: 36,39,42,45 46 | Task 3/ 4 has 2 GPUs: 0x83 0xa3 47 | Default device: 0x83 48 | 0x83: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 49 | 0xa3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 50 | ``` 51 | 52 | ``` 53 | $ OMP_NUM_THREADS=4 srun -n2 ./omp 54 | Process running on 1 CPUs: 0 55 | Process has 4 GPUs: 0x63 0x43 0x3 0x27 56 | Default device: 0x63 57 | 0x63: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 58 | 0x43: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 59 | 0x03: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 60 | 0x27: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 61 | Thread 0/ 4 running on 1 CPUs: 0 62 | Thread 0/ 4 assigned to GPU: 0x63 63 | Thread 1/ 4 running on 1 CPUs: 6 64 | Thread 1/ 4 assigned to GPU: 0x43 65 | Thread 2/ 4 running on 1 CPUs: 12 66 | Thread 2/ 4 assigned to GPU: 0x3 67 | Thread 3/ 4 running on 1 CPUs: 18 68 | 
Thread 3/ 4 assigned to GPU: 0x27 69 | 70 | Process running on 1 CPUs: 24 71 | Process has 4 GPUs: 0xe3 0xc3 0x83 0xa3 72 | Default device: 0xe3 73 | 0xe3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 74 | 0xc3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 75 | 0x83: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 76 | 0xa3: Vega 20, 31 GB Mem, 60 Multiprocessors, 1.725 GHZ, 9.6 CC 77 | Thread 0/ 4 running on 1 CPUs: 24 78 | Thread 0/ 4 assigned to GPU: 0xe3 79 | Thread 1/ 4 running on 1 CPUs: 30 80 | Thread 1/ 4 assigned to GPU: 0xc3 81 | Thread 2/ 4 running on 1 CPUs: 36 82 | Thread 2/ 4 assigned to GPU: 0x83 83 | Thread 3/ 4 running on 1 CPUs: 42 84 | Thread 3/ 4 assigned to GPU: 0xa3 85 | ``` 86 | 87 | #### Building 88 | 89 | These program are built with a single Makefile. By default typing 90 | `make` will only build the CPU-related programs. To enable GPU 91 | information, the user needs to set an environment variable. 92 | 93 | ``` 94 | # Build with CPU support 95 | $ make -f makefile.mk 96 | 97 | # Build with support for AMD GPUs 98 | $ HAVE_AMD_GPUS=1 make -f makefile.mk 99 | 100 | # Build with support for NVIDIA GPUs 101 | $ HAVE_NVIDIA_GPUS=1 make -f makefile.mk 102 | ``` 103 | 104 | To build with AMD GPU support, the ROCm environment must be 105 | present. Similarly, for NVIDIA support, the CUDA environment must be 106 | present. 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /affinity/affinity.h: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * Edgar A. Leon 3 | * Lawrence Livermore National Laboratory 4 | ***********************************************************/ 5 | 6 | #ifndef AFFINITY_H_INCLUDED 7 | #define AFFINITY_H_INCLUDED 8 | 9 | #define SHORT_STR_SIZE 32 10 | #define LONG_STR_SIZE 4096 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | int get_gpu_count(); 17 | 18 | int get_gpu_pci_id(int dev); 19 | 20 | int get_gpu_affinity(char *buf); 21 | 22 | int get_gpu_info(int dev, char *buf); 23 | 24 | int get_gpu_info_all(char *buf); 25 | 26 | int get_num_cpus(); 27 | 28 | int get_cpu_affinity(char *buf); 29 | 30 | #ifdef __cplusplus 31 | } /* extern "C" */ 32 | #endif 33 | 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /affinity/cpu.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * Edgar A. 
Leon 3 | * Lawrence Livermore National Laboratory 4 | ***********************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | /* __USE_GNU is needed for CPU_ISSET definition */ 11 | #ifndef __USE_GNU 12 | #define __USE_GNU 1 13 | #endif 14 | #include // sched_getaffinity 15 | 16 | 17 | /* 18 | * Convert a non-negative array of ints to a range 19 | */ 20 | int int2range(int *intarr, int size, char *range) 21 | { 22 | int i, curr; 23 | int nc = 0; 24 | int start = -1; 25 | int prev = -2; 26 | 27 | for (i=0; i= 0) 32 | nc += sprintf(range+nc, "-%d", prev); 33 | 34 | /* Record start of range */ 35 | if (prev >= 0) 36 | nc += sprintf(range+nc, ","); 37 | nc += sprintf(range+nc, "%d", curr); 38 | start = curr; 39 | } else 40 | /* The last int is end of range */ 41 | if (i == size-1) 42 | nc += sprintf(range+nc, "-%d", curr); 43 | 44 | prev = curr; 45 | } 46 | 47 | return nc; 48 | } 49 | 50 | 51 | /* 52 | * Get number of processing units (cores or hwthreads) 53 | */ 54 | static 55 | int get_total_num_pus() 56 | { 57 | int pus = sysconf(_SC_NPROCESSORS_ONLN); 58 | 59 | if ( pus < 0 ) 60 | perror("sysconf"); 61 | 62 | return pus; 63 | } 64 | 65 | 66 | 67 | 68 | /* 69 | * Get the affinity. 70 | */ 71 | static 72 | int get_affinity(int *cpus, int *count) 73 | { 74 | int i; 75 | cpu_set_t resmask; 76 | 77 | CPU_ZERO(&resmask); 78 | 79 | int rc = sched_getaffinity(0, sizeof(resmask), &resmask); 80 | if ( rc < 0 ) { 81 | perror("sched_getaffinity"); 82 | return rc; 83 | } 84 | 85 | *count = 0; 86 | int pus = get_total_num_pus(); 87 | for (i=0; i 7 | #include /* Documentation in hip_runtime_api.h */ 8 | #include "affinity.h" /* Do not perform name mangling */ 9 | 10 | 11 | int get_gpu_count() 12 | { 13 | /* 14 | Surprinsingly, I must set 'count' to zero before 15 | passing it to cudaGetDeviceCount(&count) 16 | If CUDA_VISIBLE_DEVICES is set to '', calling 17 | this function will not set a value for count. 18 | Then, count will be used uninitialized and 19 | most likely the program will segfault. 
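     (As a defensive note, checking the error code returned by
     cudaGetDeviceCount would also make this failure explicit,
     rather than relying on the pre-initialized value.)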
20 | */ 21 | int count=0; 22 | 23 | cudaGetDeviceCount(&count); 24 | 25 | return count; 26 | } 27 | 28 | 29 | int get_gpu_pci_id(int dev) 30 | { 31 | int value = -1; 32 | cudaError_t err = cudaDeviceGetAttribute(&value, cudaDevAttrPciBusId, dev); 33 | 34 | if ( err ) 35 | fprintf(stderr, "Could not get PCI ID for GPU %d\n", dev); 36 | 37 | return value; 38 | } 39 | 40 | 41 | int get_gpu_affinity(char *buf) 42 | { 43 | int count=0; 44 | cudaGetDeviceCount(&count); 45 | 46 | int nc=0; 47 | int i; 48 | for (i=0; i> 30); 89 | nc += sprintf(buf+nc, "\tMultiprocessor count: %d\n", prop.multiProcessorCount); 90 | nc += sprintf(buf+nc, "\tClock rate: %.3f Ghz\n", ghz); 91 | nc += sprintf(buf+nc, "\tCompute capability: %d.%d\n", 92 | prop.major, prop.minor); 93 | nc += sprintf(buf+nc, "\tECC enabled: %d\n", prop.ECCEnabled); 94 | #else 95 | nc += sprintf(buf+nc, "\t0x%.2x: %s, %lu GB Mem, " 96 | "%d Multiprocessors, %.3f GHZ, %d.%d CC\n", 97 | prop.pciBusID, prop.name, prop.totalGlobalMem >> 30, 98 | prop.multiProcessorCount, ghz, prop.major, prop.minor); 99 | #endif 100 | 101 | return nc; 102 | } 103 | 104 | 105 | int get_gpu_info_all(char *buf) 106 | { 107 | cudaError_t err; 108 | int i, myid, count=0; 109 | int nc=0; 110 | 111 | cudaGetDeviceCount(&count); 112 | err = cudaGetDevice(&myid); 113 | if ( err ) { 114 | fprintf(stderr, "Could not get default device\n"); 115 | return -1; 116 | } 117 | 118 | char pcibusid[SHORT_STR_SIZE]; 119 | cudaDeviceGetPCIBusId(pcibusid, sizeof(pcibusid), myid); 120 | nc += sprintf(buf+nc, "\tDefault device: %s\n", pcibusid); 121 | 122 | for (i=0; i $@ 76 | 77 | 78 | clean: 79 | rm -f *.o *~ $(PROGS) gpu.cpp 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /affinity/mpi+omp.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * Edgar A. 
Leon 3 | * Lawrence Livermore National Laboratory 4 | ***********************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "affinity.h" 11 | 12 | 13 | static 14 | void usage(char *name) 15 | { 16 | printf("Usage: %s [options]\n", name); 17 | printf("\t -mpi: Show MPI info only (no OpenMP)\n"); 18 | printf("\t-verbose: Show detailed GPU info when -mpi enabled\n"); 19 | printf("\t -help: Show this page\n"); 20 | } 21 | 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | char buf[LONG_STR_SIZE]; 26 | char hostname[MPI_MAX_PROCESSOR_NAME]; 27 | int rank, np, size, i, ngpus, ncpus; 28 | int verbose = 0; 29 | int help = 0; 30 | int mpi = 0; 31 | int nc = 0; 32 | 33 | /* Command-line options */ 34 | if (argc > 1) 35 | for (i=1; i= 0 ) 38 | verbose = 1; 39 | else if ( strcmp(argv[i], "-m") >= 0 ) 40 | mpi = 1; 41 | else if ( strcmp(argv[i], "-h") >= 0 ) 42 | help = 1; 43 | } 44 | 45 | MPI_Init(&argc, &argv); 46 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 47 | MPI_Comm_size(MPI_COMM_WORLD, &np); 48 | MPI_Get_processor_name(hostname, &size); 49 | 50 | if (help) { 51 | if (rank == 0) 52 | usage(argv[0]); 53 | 54 | MPI_Finalize(); 55 | return 0; 56 | } 57 | 58 | if ( mpi ) { 59 | 60 | /* MPI */ 61 | ncpus = get_num_cpus(); 62 | nc += sprintf(buf+nc, "%s Task %2d/%2d with %d cpus: ", 63 | hostname, rank, np, ncpus); 64 | nc += get_cpu_affinity(buf+nc); 65 | #ifdef HAVE_GPUS 66 | ngpus = get_gpu_count(); 67 | nc += sprintf(buf+nc, "%s Task %2d/%2d with %d gpus: ", 68 | hostname, rank, np, ngpus); 69 | nc += get_gpu_affinity(buf+nc); 70 | if (verbose) 71 | nc += get_gpu_info_all(buf+nc); 72 | #endif 73 | 74 | /* Print per-task information */ 75 | printf("%s", buf); 76 | 77 | } else { 78 | 79 | /* MPI+OpenMP */ 80 | #ifdef HAVE_GPUS 81 | ngpus = get_gpu_count(); 82 | #endif 83 | 84 | #pragma omp parallel firstprivate(buf, nc) private(ncpus) shared(rank, np, ngpus, verbose) 85 | { 86 | int tid = omp_get_thread_num(); 87 | int nthreads = omp_get_num_threads(); 88 | ncpus = get_num_cpus(); 89 | 90 | nc += sprintf(buf+nc, "%s Task %3d/%3d Thread %3d/%3d with %2d cpus: ", 91 | hostname, rank, np, tid, nthreads, ncpus); 92 | nc += get_cpu_affinity(buf+nc); 93 | #ifdef HAVE_GPUS 94 | nc += sprintf(buf+nc, "%s Task %3d/%3d Thread %3d/%3d with %2d gpus: ", 95 | hostname, rank, np, tid, nthreads, ngpus); 96 | nc += get_gpu_affinity(buf+nc); 97 | #endif 98 | 99 | /* Print per-worker information */ 100 | printf("%s", buf); 101 | } 102 | 103 | } 104 | 105 | MPI_Finalize(); 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /affinity/mpi.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * Edgar A. Leon 3 | * Lawrence Livermore National Laboratory 4 | ***********************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | #include "affinity.h" 10 | 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | char buf[LONG_STR_SIZE]; 15 | char hostname[MPI_MAX_PROCESSOR_NAME]; 16 | int rank, np, size, i; 17 | int verbose = 0; 18 | int ncpus = get_num_cpus(); 19 | int nc = 0; 20 | 21 | /* Get rid of compiler warning. Ay. 
*/ 22 | (void) verbose; 23 | 24 | /* Command-line options */ 25 | if (argc > 1) 26 | for (i=1; i 7 | #include 8 | #include 9 | #include "affinity.h" 10 | 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | char buf[LONG_STR_SIZE]; 15 | int i; 16 | int ncpus = get_num_cpus(); 17 | int verbose = 0; 18 | int nc = 0; 19 | 20 | /* Get rid of compiler warning. Ay. */ 21 | (void) verbose; 22 | 23 | /* Command-line options */ 24 | if (argc > 1) 25 | for (i=1; i= 2.1], [ 45 | hwloc_dir=`$PKG_CONFIG --variable=libdir hwloc` 46 | AC_MSG_NOTICE([$hwloc_dir]) 47 | ] 48 | ) 49 | 50 | # In newer versions of pkgconf, I could use 51 | # PKG_HAVE_WITH_MODULES and PKG_CHECK_VAR. 52 | # I could also request min-version above with 53 | # PKG_PROG_PKG_CONFIG([MIN-VERSION]) 54 | PKG_CHECK_MODULES([TAP], [tap], [ 55 | libtap=true 56 | libtap_dir=`$PKG_CONFIG --variable=libdir tap` 57 | AC_SUBST(TAP_LIBDIR, $libtap_dir) 58 | AC_REQUIRE_AUX_FILE([tap-driver.sh]) 59 | AC_MSG_NOTICE([$libtap_dir]) 60 | ], 61 | [AC_MSG_NOTICE([C test suite will not be built])] 62 | ) 63 | AM_CONDITIONAL([HAVE_LIBTAP], [test x$libtap = xtrue]) 64 | # PKG_CHECK_VAR([TAP_LIBDIR], [tap], [libdir], 65 | # [libtap_libdir=true], 66 | # [AC_MSG_NOTICE([TAP's libdir not found])] 67 | # ) 68 | 69 | PKG_CHECK_MODULES([FLUX_CORE], [flux-core], [ 70 | flux_core=true 71 | flux_dir=`$PKG_CONFIG --variable=libdir flux-core` 72 | AC_MSG_NOTICE([$flux_dir]) 73 | ], 74 | [AC_MSG_NOTICE([Flux plugin will not be built])] 75 | ) 76 | AM_CONDITIONAL([HAVE_FLUX_CORE], [test x$flux_core = xtrue]) 77 | #PKG_CHECK_MODULES([FLUX_CORE], [flux-core], [ 78 | # fluxcore=true 79 | # flux_shell_plugin_dir=`$PKG_CONFIG --variable=libdir flux-core` 80 | # flux_shell_plugin_dir+=/flux/shell/plugins 81 | # AC_SUBST(FLUX_SHELL_PLUGIN_DIR, $flux_shell_plugin_dir) 82 | # ], 83 | # [AC_MSG_NOTICE([Flux plugin will not be built])] 84 | #) 85 | #AM_CONDITIONAL([HAVE_FLUX_CORE], [test x$fluxcore = xtrue]) 86 | 87 | PKG_CHECK_MODULES([SLURM], [slurm], [ 88 | slurm=true 89 | slurm_dir=`$PKG_CONFIG --variable=includedir slurm` 90 | AC_MSG_NOTICE([$slurm_dir]) 91 | ], 92 | [AC_MSG_NOTICE([Slurm plugin will not be built])] 93 | ) 94 | AM_CONDITIONAL([HAVE_SLURM], [test x$slurm = xtrue]) 95 | 96 | # Notes 97 | # AC_DEFINE([HAVE_LIBTAP], 1, [Define libtap to build the test suite]) 98 | # AC_CHECK_FUNC([hwloc_topology_set_all_types_filter]) 99 | # AC_CHECK_HEADERS([pkgconf/libpkgconf/libpkgconf.h]) 100 | # defines HAVE_LIBTAP and prepends -ltap to LIBS 101 | # AC_CHECK_LIB(tap, plan) 102 | # AC_SEARCH_LIBS([pkgconf_pkg_free], [pkgconf], 103 | # [AC_MSG_NOTICE([Greetings from pkgconf])], 104 | # [AC_MSG_ERROR([unable to find pkgconf_pkg_free()]) 105 | # ]) 106 | # AS_IF(test-1, [run-if-true-1], ..., [run-if-false]) 107 | # AC_SUBST(SLURM_INCDIR, $slurm_dir) 108 | 109 | # Define pkgconfigdir to install mpibind.pc 110 | PKG_INSTALLDIR 111 | 112 | # Path for mpibind modules, e.g., flux plugin 113 | AS_VAR_SET(mpibindmoddir, $libdir/mpibind) 114 | AC_SUBST(mpibindmoddir) 115 | 116 | 117 | ## Dependencies for Python bindings 118 | # AC_CHECK_PYMOD(module, [action-if-found], [action-if-not-found]) 119 | # ---------------------------------------------------------------- 120 | # Didn't use AC_CACHE_CHECK because AC_CHECK_PYMOD 121 | # may be called multiple times, but with different arguments! 
122 | AC_DEFUN([AC_CHECK_PYMOD], 123 | [AC_REQUIRE([AM_PATH_PYTHON]) 124 | AC_MSG_CHECKING([for $1 in python]) 125 | have_pymod=no 126 | prog=" 127 | import sys 128 | try: 129 | import $1 130 | except ImportError: 131 | sys.exit(1) 132 | except: 133 | sys.exit(0) 134 | sys.exit(0)" 135 | ($PYTHON -c "$prog") && have_pymod=yes 136 | AC_MSG_RESULT($have_pymod) 137 | if test "$have_pymod" = yes; then 138 | ifelse([$2], [], true, [$2]) 139 | else 140 | ifelse([$3], [], true, [$3]) 141 | fi]) 142 | 143 | AM_PATH_PYTHON([3],, [:]) 144 | AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :]) 145 | 146 | AC_CHECK_PYMOD(cffi, 147 | have_cffi=true, 148 | AC_MSG_NOTICE([Python bindings and test suite will not be built])) 149 | AM_CONDITIONAL([HAVE_CFFI], [test x$have_cffi = xtrue]) 150 | 151 | AC_CHECK_PYMOD(pycotap, 152 | have_pycotap=true, 153 | AC_MSG_NOTICE([Python test suite will not be built])) 154 | AM_CONDITIONAL([HAVE_PYCOTAP], [test x$have_pycotap = xtrue]) 155 | 156 | #AM_COND_IF(HAVE_CFFI, [echo "cffi yes!"], [echo "cffi no!"]) 157 | #AM_COND_IF(HAVE_PYCOTAP, [echo "pycotap yes!"], [echo "pycotap no!"]) 158 | 159 | 160 | ## Epilogue 161 | AC_CONFIG_FILES([ 162 | Makefile 163 | src/Makefile 164 | test-suite/Makefile 165 | python/Makefile 166 | flux/Makefile 167 | slurm/Makefile 168 | etc/Makefile 169 | etc/mpibind.pc 170 | ]) 171 | 172 | AC_OUTPUT 173 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Mapping Applications to Heterogeneous Systems 2 | 3 | ## mpibind 4 | 5 | * [Slurm Plugin and User Options](https://github.com/LLNL/mpibind/blob/master/slurm/README.md) 6 | * [Slurm Tutorial](https://github.com/LLNL/mpibind/blob/master/tutorials/main/module2.md) 7 | * [Flux Plugin](https://github.com/LLNL/mpibind/blob/master/flux/README.md) 8 | * [Flux User Options](https://github.com/LLNL/mpibind/blob/master/flux/options.md) 9 | * [Flux Tutorial](https://github.com/LLNL/mpibind/blob/master/tutorials/flux/module2.md) 10 | 11 | 12 | ## Conference Tutorials 13 | 14 | [Bridging Applications and Hardware](https://github.com/LLNL/mpibind/blob/master/tutorials/eurosys25/README.md) 15 | * 4th Tutorial on Mapping and Affinity (MAP) 16 | * ASPLOS and EuroSys 17 | * March 2025, Rotterdam, The Netherlands 18 | 19 | [Supercomputer Affinity on HPE Systems](https://github.com/LLNL/mpibind/blob/master/tutorials/cug24/README.md) 20 | * 3rd Tutorial on Mapping and Affinity (MAP) 21 | * Cray User Group 2024 22 | * May 2024, Perth, Australia 23 | 24 | [Supercomputer Affinity on HPE Systems](https://github.com/LLNL/mpibind/blob/master/tutorials/cug23/README.md) 25 | * 2nd Tutorial on Mapping and Affinity (MAP) 26 | * Cray User Group 2023 27 | * May 2023, Helsinki, Finland 28 | 29 | [Supercomputing Systems 101](https://github.com/LLNL/mpibind/tree/master/tutorials/tapia22/README.md) 30 | * 2022 CMD-IT/ACM Richard Tapia Celebration of Diversity in Computing Conference 31 | * September 2022, Washington, D.C. 
32 | 33 | Supercomputer Affinity 34 | * 1st Tutorial on Mapping and Affinity (MAP) 35 | * CEA/EDF/Inria 2022 Summer School on Informatics: Hybrid and Asynchronous High-Performance Programming 36 | * July 2022, Reims, France 37 | 38 | [Machine Topology and Binding](https://github.com/LLNL/mpibind/blob/master/tutorials/lanl22/README.md) 39 | * Los Alamos Parallel Computing Summer Lecture Series 40 | * June 2022, Virtual 41 | 42 | 43 | -------------------------------------------------------------------------------- /doc/mpibind.bib: -------------------------------------------------------------------------------- 1 | 2 | ## mpibind for two-level memory systems 3 | @InProceedings{ leon.memsys18, 4 | author = {Edgar A. Le{\'o}n and Matthieu Hautreux}, 5 | title = {Achieving Transparency Mapping Parallel Applications: A Memory Hierarchy Affair}, 6 | booktitle = {International Symposium on Memory Systems}, 7 | series = {MEMSYS'18}, 8 | publisher = {ACM}, 9 | address = {Washington, DC}, 10 | year = {2018}, 11 | month = oct 12 | } 13 | 14 | ## mpibind for multi-GPU systems 15 | @InProceedings{ leon.gtc18, 16 | author = {Edgar A. Le{\'o}n}, 17 | title = {Mapping {MPI+X} Applications to Multi-{GPU} Architectures: A Performance-Portable Approach}, 18 | booktitle = {GPU Technology Conference}, 19 | series = {GTC'18}, 20 | address = {San Jose, CA}, 21 | year = {2018}, 22 | month = mar 23 | } 24 | 25 | ## The initial mpibind algorithm 26 | @InProceedings{ leon.memsys17, 27 | author = {Edgar A. Le{\'o}n}, 28 | title = {{mpibind}: A Memory-Centric Affinity Algorithm for Hybrid Applications}, 29 | booktitle = {International Symposium on Memory Systems}, 30 | series = {MEMSYS'17}, 31 | publisher = {ACM}, 32 | address = {Washington, DC}, 33 | year = {2017}, 34 | month = oct 35 | } 36 | 37 | ## Using mpibind to reduce system noise through thread specialization 38 | @Inproceedings{ leon.ipdps16, 39 | author = {Edgar A. Le{\'o}n and Ian Karlin and Adam T. Moody}, 40 | title = {System Noise Revisited: Enabling Application Scalability and Reproducibility with {SMT}}, 41 | booktitle = {International Parallel \& Distributed Processing Symposium}, 42 | series = {IPDPS'16}, 43 | publisher = {IEEE}, 44 | address = {Chicago, IL}, 45 | year = {2016}, 46 | month = may 47 | } 48 | 49 | ## mpibind on IBM Spectrum LSF 50 | @Article{ dahm.ea:ibm20 51 | author = {J. P. Dahm and D. F. Richards and A. Black and A. D. 52 | Bertsch and L. Grinberg and I. Karlin and S. 53 | Kokkila-Schumacher and \Edgar and R. Neely and R. 54 | Pankajakshan and O. Pearce}, 55 | journal = {IBM Journal of Research and Development}, 56 | title = {{Sierra Center of Excellence}: Lessons Learned}, 57 | volume = {64}, 58 | number = {3/4}, 59 | month = may, 60 | year = {2020}, 61 | pages = {2:1--2:14}, 62 | doi = {10.1147/JRD.2019.2961069}, 63 | issn = {0018-8646} 64 | } 65 | 66 | ## Three case studies for mpibind 67 | @InProceedings{ :19:cross-architecture, 68 | author = {Edgar A. 
Le{\'o}n}, 69 | title = {Cross-Architecture Affinity of Supercomputers}, 70 | booktitle = {International Supercomputing Conference; Research Poster}, 71 | series = {ISC'19}, 72 | address = {Frankfurt, Germany}, 73 | year = 2019, 74 | month = jun 75 | } 76 | 77 | -------------------------------------------------------------------------------- /etc/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | #if WITH_PKG_CONFIG 3 | pkgconfig_DATA = mpibind.pc 4 | #endif 5 | -------------------------------------------------------------------------------- /etc/mpibind.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | plugindir=@libdir@/mpibind 6 | 7 | Name: mpibind 8 | Description: A memory-driven mapping algorithm for heterogeneous systems 9 | URL: https://github.com/LLNL/mpibind 10 | Version: @PACKAGE_VERSION@ 11 | Requires: hwloc >= 2.1 12 | Cflags: -I${includedir} 13 | Libs: -L${libdir} -lmpibind -------------------------------------------------------------------------------- /flux/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | ####################################################### 3 | # libmpibind_flux 4 | ####################################################### 5 | 6 | #if HAVE_FLUX_CORE 7 | #mpibindmod_LTLIBRARIES = mpibind_flux.la 8 | # Not installing the lua file anymore 9 | #mpibindmod_SCRIPTS = mpibind_flux.lua 10 | 11 | # The mpibind plugin for Flux depends on a Flux header. 12 | # Therefore, when using Spack, Flux is a dependency 13 | # of mpibind. To install mpibind and Flux, one would use: 14 | # spack install mpibind+flux 15 | # There's an issue though: mpibind may not be able 16 | # to install the plugin into the flux plugins directory, 17 | # e.g., when Flux is a system-wide distribution. 18 | # To work around this Flux has created an environment 19 | # variable that allows loading a plugin without writing 20 | # it to Flux's installation directory. 21 | # 22 | # Install mpibind_flux.so into the flux shell 23 | # plugin path so it is loaded by default with 24 | # 'plugin.load("*.so")'. 25 | # plugin_name = ${mpibindmod_LTLIBRARIES:la=so} 26 | # install-exec-hook: 27 | # $(AM_V_at)echo Installing the mpibind flux plugin... 28 | # $(MKDIR_P) $(FLUX_SHELL_PLUGIN_DIR) && \ 29 | # $(INSTALL) $(builddir)/.libs/$(plugin_name) $(FLUX_SHELL_PLUGIN_DIR)/ 30 | 31 | # Install using a symbolic link 32 | # install-exec-hook: 33 | # $(AM_V_at)echo Installing the mpibind flux plugin... 34 | # cd /g/g99/leon/firefall && \ 35 | # $(MKDIR_P) $(FLUX_SHELL_PLUGIN_DIR) && \ 36 | # cd $(FLUX_SHELL_PLUGIN_DIR) && \ 37 | # (test -e $(plugin_name) && rm $(plugin_name)) && \ 38 | # $(LN_S) $(mpibindmoddir)/$(plugin_name) . 
39 | #endif 40 | 41 | # The build directory of the plugin 42 | plugin_int_dir = $(abs_top_srcdir)/flux/.libs 43 | 44 | if HAVE_FLUX_CORE 45 | ## The Flux plugin 46 | mpibindmod_LTLIBRARIES = mpibind_flux.la 47 | 48 | ## Script to load the plugin--to be used by Flux 49 | pkgdata_SCRIPTS = mpibind-flux.lua 50 | CLEANFILES = $(pkgdata_SCRIPTS) 51 | 52 | # Script to load the Flux plugin 53 | # from the installation directory 54 | install-data-hook: 55 | sed -i.tmp 's|$(plugin_int_dir)|$(libdir)/mpibind|g' \ 56 | $(pkgdatadir)/mpibind-flux.lua && \ 57 | rm $(pkgdatadir)/mpibind-flux.lua.tmp 58 | endif 59 | 60 | # The Flux plugin 61 | mpibind_flux_la_SOURCES = plugin.c 62 | mpibind_flux_la_CFLAGS = -Wall -Werror -I$(top_srcdir)/src 63 | mpibind_flux_la_CFLAGS += $(HWLOC_CFLAGS) $(FLUX_CORE_CFLAGS) 64 | mpibind_flux_la_LIBADD = $(top_builddir)/src/libmpibind.la 65 | mpibind_flux_la_LDFLAGS = -module 66 | 67 | # Script to load the Flux plugin 68 | # from the '.libs' directory 69 | mpibind-flux.lua: mpibind-flux.lua.in Makefile 70 | sed 's|[@]fluxplugindir[@]|$(plugin_int_dir)|g' \ 71 | mpibind-flux.lua.in > $@ 72 | 73 | # If mpibind was already installed: 74 | #mpibind_flux_la_CFLAGS = -Wall -Werror $(MPIBIND_CFLAGS) 75 | #mpibind_flux_la_LIBADD = $(MPIBIND_LIBS) 76 | -------------------------------------------------------------------------------- /flux/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## The mpibind Flux Plugin 3 | 4 | The `mpibind_flux.so` plugin enables the use of the mpibind algorithm 5 | in Flux to map parallel codes to the hardware. It replaces Flux's 6 | cpu-affinity and gpu-affinity modules. 7 | 8 | 20 | 21 | ### Installing the plugin in Flux 22 | 23 | The `mpibind_flux.so` plugin is installed here: 24 | ``` 25 | /lib/mpibind/ 26 | 27 | # It can be obtained with the command 28 | pkg-config --variable=plugindir mpibind 29 | ``` 30 | 31 | There are many ways to load a plugin into Flux. Here, I outline three. 32 | 1. Extend the Flux plugin search path. 33 | ``` 34 | export FLUX_SHELL_RC_PATH=/share/mpibind 35 | ``` 36 | 2. Add to the Flux shell plugins directory. 37 | ``` 38 | # Copy or link mpibind_flux.so to the Flux shell plugins directory 39 | cp mpibind_flux.so /lib/flux/shell/plugins/ 40 | 41 | # The plugins directory can be obtained as follows 42 | pkg-config --variable=fluxshellpluginpath flux-core 43 | ``` 44 | This method assumes write access to the Flux shell plugins directory, i.e., one owns the Flux installation. 45 | 46 | 3. Load the plugin explicitly at runtime. 47 | 48 | One can create a job shell `initrc` file (e.g., mpibind-flux.lua) that will load the mpibind plugin: 49 | ``` 50 | -- mpibind-flux.lua 51 | 52 | plugin.load { file="/flux/.libs/mpibind_flux.so" } 53 | ``` 54 | Load the plugin explicitly every time a program is run, e.g., 55 | ``` 56 | flux run -n2 -o initrc=mpibind-flux.lua hostname 57 | ``` 58 | Make sure to specify the path of `mpibind-flux.lua` and, within the lua 59 | script, make sure the location of `mpibind_flux.so` is accurate. 60 | 61 | To verify the mpibind flux plugin was loaded successfully, one can use the Flux verbose option: 62 | ``` 63 | flux run -n2 -o initrc=mpibind-flux.lua -o verbose=1 hostname 64 | ``` 65 | 66 | ### Usage 67 | 68 | Using the mpibind plugin should be transparent to the user, i.e., no additional parameters to `flux run` should be needed to execute the plugin. 
To verify that indeed the plugin has been loaded one can run the following: 69 | 70 | ``` 71 | flux run -n2 -o mpibind=verbose:1 hostname 72 | ``` 73 | 74 | To disable the plugin and enable Flux's cpu-affinity module: 75 | 76 | ``` 77 | flux run -n2 -o mpibind=off -o cpu-affinity=on hostname 78 | ``` 79 | 80 | The options of mpibind are documented [here](options.md). A [tutorial](../tutorials/flux/README.md) is also available. 81 | 82 | 83 | ### Other details about Flux 84 | 85 | You need at least `v0.17.0` of `flux-core` built with `hwloc v2.1` or 86 | above. 87 | 88 | Verify hwloc's installation and version: 89 | ``` 90 | pkg-config --variable=libdir --modversion hwloc 91 | ``` 92 | If this fails, add hwloc's pkgconf directory to `PKG_CONFIG_PATH`, e.g., 93 | ``` 94 | export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/lib/pkgconfig 95 | ``` 96 | 97 | Configure and build `flux-core` against `hwloc v2.1+` and install into 98 | `flux-install-dir`. 99 | ``` 100 | flux-core$ ./configure --prefix= 101 | 102 | flux-core$ make -j 24 103 | ``` 104 | 105 | Ensure Flux was built with `hwloc v2.1+`: 106 | ``` 107 | flux-core$ src/cmd/flux version 108 | commands: 0.18.0-120-g96b3edc 109 | libflux-core: 0.18.0-120-g96b3edc 110 | build-options: +hwloc==2.1.0 111 | ``` 112 | 113 | Then install into the prefix path: 114 | ``` 115 | flux-core$ make install 116 | ``` 117 | 118 | (Optional) Build and install `flux-sched` to the same installation 119 | path. 120 | 121 | Add Flux to `pkg-config`: 122 | ``` 123 | export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/lib/pkgconfig 124 | ``` 125 | 126 | 127 | 128 | 159 | 160 | -------------------------------------------------------------------------------- /flux/mpibind-flux-ex1.lua: -------------------------------------------------------------------------------- 1 | 2 | -- While it requires less user intervention, as it tries to figure 3 | -- locations and paths for the user, this script may be overly 4 | -- complex. This file (.in) was intended to be read by autoconf 5 | -- to generate the final lua file. 6 | 7 | 8 | -- initrc file to load the mpibind plugin into flux. 9 | -- To use, add the following to to the flux mini run command 10 | -- '-o initrc=mpibind_flux.lua' 11 | 12 | 13 | -- Disable Flux's 'cpu- and gpu-affinity' 14 | --shell.options['cpu-affinity'] = "off" 15 | --shell.options['gpu-affinity'] = "off" 16 | 17 | 18 | -- This construct allows a site to set default mpibind parameters. 19 | -- Note that by default when the plugin is loaded, mpibind is on 20 | -- even when '-o mpibind' is not used. 21 | --if not shell.options.mpibind then 22 | -- shell.options.mpibind = on 23 | --end 24 | 25 | 26 | -- Load the system initrc.lua to get the system plugins 27 | -- Todo: In the future, this won't be necessary: 28 | -- Use '-o userrc=mpibind.lua' instead of '-o initrc=mpibind.lua. 29 | -- https://github.com/flux-framework/flux-core/pull/3132 30 | source_if_exists(os.getenv("FLUX_DIR").."/etc/flux/shell/initrc.lua") 31 | 32 | 33 | -- Load the mpibind plugin into flux-shell 34 | -- Can use shell.log() or shell.debug() for output 35 | shell.debug("Flux plugin search path: "..plugin.searchpath) 36 | 37 | -- Look for the mpibind flux plugin in the mpibind installation 38 | --f = assert(io.popen("pkg-config --variable=libdir mpibind")) 39 | --s = f:read("*l") 40 | --if s then 41 | -- sofile = s .. 
"/mpibind/mpibind_flux.so" 42 | -- if not io.open(sofile) then 43 | -- sofile = nil 44 | -- end 45 | --end 46 | sofile = /g/g99/leon/firefall/nick/install/lib/mpibind/mpibind_flux.so 47 | if not io.open(sofile) then 48 | sofile = nil 49 | end 50 | 51 | -- If not found, look for the MPIBIND_FLUX_PLUGIN env var 52 | varname = "MPIBIND_FLUX_PLUGIN" 53 | if sofile == nil then 54 | sofile = os.getenv(varname) 55 | if sofile and not io.open(sofile) then 56 | sofile = nil 57 | end 58 | end 59 | 60 | if sofile == nil then 61 | shell.log("Could not find mpibind flux plugin.\n" .. 62 | "\tMake sure /g/g99/leon/firefall/nick/install/lib/mpibind/mpibind_flux.so exists or\n" .. 63 | "\texport " .. varname .. "=/mpibind_flux.so") 64 | else 65 | shell.log("Loading plugin: "..sofile) 66 | plugin.load { file = sofile } 67 | --plugin.load { file = "./.libs/mpibind_flux.so", conf = {} } 68 | end 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /flux/mpibind-flux-ex2.lua: -------------------------------------------------------------------------------- 1 | -- initrc file to load the mpibind plugin into flux. 2 | -- To use, add the following to to the flux mini run command 3 | -- '-o initrc=mpibind_flux.lua' 4 | 5 | 6 | -- This construct allows a site to set default mpibind parameters. 7 | -- Note that by default when the plugin is loaded, mpibind is on 8 | -- even when '-o mpibind' is not used. 9 | --if not shell.options.mpibind then 10 | -- shell.options.mpibind = on 11 | --end 12 | 13 | 14 | -- Load the system initrc.lua to get the system plugins 15 | -- Todo: In the future, this won't be necessary: 16 | -- Use '-o userrc=mpibind.lua' instead of '-o initrc=mpibind.lua. 17 | -- https://github.com/flux-framework/flux-core/pull/3132 18 | source_if_exists(os.getenv("FLUX_DIR").."/etc/flux/shell/initrc.lua") 19 | 20 | 21 | -- Load the mpibind plugin into flux-shell 22 | shell.debug("Flux plugin search path: "..plugin.searchpath) 23 | -- sofile = /lib/mpibind/mpibind_flux.so 24 | sofile = "./.libs/mpibind_flux.so" 25 | shell.debug("Loading plugin: " .. sofile) 26 | plugin.load { file = sofile, conf = {} } 27 | 28 | -------------------------------------------------------------------------------- /flux/mpibind-flux.lua.in: -------------------------------------------------------------------------------- 1 | 2 | -- Load the mpibind plugin for Flux 3 | plugin.load ("@fluxplugindir@/mpibind_flux.so") 4 | -------------------------------------------------------------------------------- /flux/options.md: -------------------------------------------------------------------------------- 1 | ## mpibind options 2 | 3 | These are the options and environment variables to control mpibind: 4 | 5 | ### Options 6 | 7 | ``` 8 | -o mpibind=off|on 9 | -o mpibind=verbose 10 | -o mpibind=smt: 11 | -o mpibind=greedy:0|1 12 | -o mpibind=gpu_optim:0|1 13 | -o mpibind=omp_places|omp_proc_bind|visible_devices 14 | ``` 15 | 16 | When setting more than one option, use commas to separate them, e.g., `-o mpibind=smt:2,verbose`. 
17 | 18 | ### Environment variables 19 | 20 | ``` 21 | MPIBIND_RESTRICT_TYPE=cpu|mem 22 | MPIBIND_RESTRICT= 23 | MPIBIND_TOPOFILE= 24 | FLUX_MPIBIND_USE_TOPOFILE= 25 | ``` 26 | 27 | --- 28 | 29 | ### Turn mpibind on or off 30 | 31 | If mpibind is disabled by default, turn it on with `-o mpibind=on` 32 | 33 | If mpibind is enabled by default, turn it off with `-o mpibind=off` 34 | 35 | ### Enable verbosity 36 | 37 | To display the mapping of tasks to CPUs and GPUs, use `-o mpibind=verbose` 38 | 39 | ### Specify an SMT level 40 | 41 | To specify how many hardware threads per core to use for the application, use `-o mpibind=smt:`, where `n` ranges between 1 and the number of hardware threads per core. 42 | 43 | To use two hardware threads per core on an SMT-4 architecture, for instance, use `-o mpibind=smt:2` 44 | 45 | By default mpibind uses one hardware thread per core. 46 | 47 | ### Turn greedy on or off 48 | 49 | To minimize remote memory accesses, mpibind nominally assigns one NUMA domain per task. When launching less tasks than NUMA domains, this can significantly limit the resources available to the application. 50 | 51 | To use all of the resources of a node when using less tasks than NUMA domains, use `-o mpibind=greedy:1` 52 | 53 | To assign a single NUMA domain to every task even when using less tasks than NUMA domains, use `-o mpibind=greedy:0` 54 | 55 | By default greedy mode is on. 56 | 57 | ### Enable GPU optimized mappings 58 | 59 | On some heterogeneous architectures the best mapping depends on the type of resource the application will use the most. 60 | 61 | To fine-tune the mapping provided by mpibind for CPU usage, use `-o mpibind=gpu_optim:0` 62 | 63 | To fine-tune the mapping provided by mpibind for GPU usage, use `-o mpibind=gpu_optim:1` 64 | 65 | On systems with GPUs, GPU-optimized mapping is on by default. 66 | 67 | ### Enable core or thread specialization to mitigate system noise 68 | 69 | On systems with significant noise generated by system processes, hardware resources can be dedicated for running these processes, e.g., system cores. On such systems user jobs should not be scheduled on these resources. 70 | 71 | One can tell mpibind to schedule application work on a specific subset of the compute node to, for example, avoid using system resources. 72 | 73 | One can specify the application resources in a memory-driven or compute-driven fashion: When MPIBIND_RESTRICT_TYPE is set to `cpu` one specifies a set of Linux CPUs and when this variable is set to `mem` one specifies a list of (NUMA) memory domains. When specifying a NUMA domain all of the compute resources local to that domain are included in the set. By default MPIBIND_RESTRICT_TYPE is set to `cpu`. 74 | 75 | The MPIBIND_RESTRICT variable is then used to specify the IDs of the resources to use for application work. 76 | 77 | For example, to restrict the application resources to the first and third NUMA domains (and their local resources) one would set `MPIBIND_RESTRICT_TYPE=mem` and `MPIBIND_RESTRICT=0,2`; and to restrict the application to CPUs 12-24 one would set `MPIBIND_RESTRICT_TYPE=cpu` and `MPIBIND_RESTRICT=12-24`. 78 | 79 | On machines where these variables are set by default (presumably to mitigate system noise), one can unset these variables to regain access to the full node, but one has to be cognizant of the potential implications of running on the same resources as other system processes. 
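As a concrete sketch of the first case (with `./my_app` standing in for the real application), the restriction is exported before launching the job:

```
# Restrict application work to NUMA domains 0 and 2 (and their local resources)
export MPIBIND_RESTRICT_TYPE=mem
export MPIBIND_RESTRICT=0,2
flux run -n4 ./my_app
```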
80 | 81 | 82 | ### Disable OpenMP affinity or GPU affinity 83 | 84 | To enable OpenMP affinity and GPU affinity, mpibind sets certain environment variables. One can instruct mpibind not to set them as follows. 85 | 86 | To not set OMP_PROC_BIND and OMP_PLACES, use `-o mpibind=omp_proc_bind` and `-o mpibind=omp_places`, respectively. To not set both, use `-o mpibind=omp_proc_bind,omp_places` 87 | 88 | To not set GPU affinity, use `-o mpibind=visible_devices`. This setting applies to AMD and NVIDIA GPUs. 89 | 90 | By default OpenMP affinity and GPU affinity are enabled. 91 | 92 | ### Read in the machine topology 93 | 94 | Discovering the node topology can be an expensive operation. When running under Flux, mpibind gets the topology specification from Flux rather than querying the topology once again. 95 | 96 | Alternatively, one can tell mpibind to read the topology (1) from a static hwloc file or (2) dynamically. The former can be accomplished by setting `FLUX_MPIBIND_USE_TOPOFILE` to any non-empty value and `MPIBIND_TOPOFILE` to the hwloc-xml-file. The latter can be accomplished by setting `FLUX_MPIBIND_USE_TOPOFILE` only. 97 | -------------------------------------------------------------------------------- /gpu-tests/makefile.mk: -------------------------------------------------------------------------------- 1 | 2 | HIP_PLATFORM = $(shell hipconfig --platform) 3 | HWLOC_CFLAGS = $(shell pkg-config --cflags hwloc) 4 | HWLOC_LDLIBS = $(shell pkg-config --libs hwloc) 5 | 6 | PROGS = retrieve visdevs visdevs-hwloc 7 | 8 | all: $(PROGS) 9 | 10 | 11 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 12 | retrieve: retrieve.cpp 13 | hipcc -Wall -Werror -DHAVE_AMD_GPUS $< -o $@ 14 | else 15 | retrieve: retrieve.cu 16 | nvcc --Werror all-warnings -x cu $< -o $@ 17 | endif 18 | 19 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 20 | visdevs: visdevs.cpp 21 | hipcc -Wall -Werror -DHAVE_AMD_GPUS $< -o $@ 22 | else 23 | visdevs: visdevs.cu 24 | nvcc --Werror all-warnings -x cu $< -o $@ 25 | endif 26 | 27 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 28 | visdevs-hwloc: visdevs-hwloc.cpp 29 | hipcc -Wall -Werror -DHAVE_AMD_GPUS $(HWLOC_CFLAGS) $< -o $@ $(HWLOC_LDLIBS) 30 | else 31 | visdevs-hwloc: visdevs-hwloc.cu 32 | nvcc --Werror all-warnings $(HWLOC_CFLAGS) -x cu $< -o $@ $(HWLOC_LDLIBS) 33 | endif 34 | 35 | 36 | retrieve.cpp: retrieve.cu 37 | hipify-perl $< > $@ 38 | 39 | visdevs.cpp: visdevs.cu 40 | hipify-perl $< > $@ 41 | 42 | visdevs-hwloc.cpp: visdevs-hwloc.cu 43 | hipify-perl $< > $@ 44 | 45 | clean: 46 | rm -f *.o $(PROGS) $(PROGS:=.cpp) 47 | -------------------------------------------------------------------------------- /gpu-tests/orig.mk: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # Edgar A. 
Leon 3 | # Lawrence Livermore National Laboratory 4 | ############################################################## 5 | 6 | 7 | # Check if we have AMD GPUs 8 | #HAVE_AMD_GPUS = $(shell rocm-smi --showbus 2>/dev/null | grep GPU) 9 | #HAVE_NVIDIA_GPUS = 1 10 | #HAVE_AMD_GPUS = 1 11 | 12 | CFLAGS = -Wall -Werror 13 | HIP_LDFLAGS = -L$(shell hipconfig --path)/lib -lamdhip64 14 | 15 | OBJS = cpu.o 16 | ifneq ($(strip $(or $(HAVE_AMD_GPUS),$(HAVE_NVIDIA_GPUS))),) 17 | GPU_FLAGS = -DHAVE_GPUS 18 | OBJS += gpu.o 19 | endif 20 | 21 | 22 | # Get system configuration with 'hipconfig' 23 | # hipconfig --platform 24 | # hipconfig --version 25 | # hipconfig --compiler 26 | # hipconfig --runtime 27 | 28 | ############################################################## 29 | # Build a HIP program with nvcc (for NVIDIA hardware) 30 | ############################################################## 31 | # nvcc -I$(HIP_ROOT)/include $(MPI_CFLAGS) -Xcompiler -DCUDA_ENABLE_DEPRECATED -x cu $< -Xlinker -lcuda -Xlinker "$(MPI_LIBS)" 32 | # nvcc -I$(HIP_ROOT)/include -Xcompiler -DCUDA_ENABLE_DEPRECATED -x cu -ccbin mpicc $< -Xlinker -lcuda 33 | 34 | ############################################################## 35 | # Build a HIP program with hipcc (for NVIDIA hardware) 36 | # To start with a CUDA program, hipify first, e.g., 37 | # hipify-perl square.cu > square.cpp 38 | # Note: hipcc takes .cpp programs (not .c for example) 39 | ############################################################## 40 | # Export the following environment variables 41 | # HIP_PLATFORM=nvcc 42 | # HIP_COMPILER=nvcc 43 | # HIPCC_VERBOSE=1 44 | # hipcc -Xcompiler -DCUDA_ENABLE_DEPRECATED $(MPI_CFLAGS) $< $(MPI_LIBS) -o $@ 45 | # Could use HIP_PLATFORM to determine the flags to use 46 | #ifeq (${HIP_PLATFORM}, nvcc) 47 | # HIPCC_FLAGS = -Xcompiler -DCUDA_ENABLE_DEPRECATED 48 | # HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 49 | #endif 50 | 51 | ############################################################## 52 | # Build an MPI program with hipcc 53 | ############################################################## 54 | # MPI_ROOT = /usr/tce/packages/mvapich2/mvapich2-2.3-intel-19.0.4 55 | # MPI_CFLAGS = -I$(MPI_ROOT)/include 56 | # MPI_LIBS = -L$(MPI_ROOT)/lib -lmpi 57 | # ifneq ($(strip $(HAVE_AMD_GPUS)),) 58 | # simple: simple.cpp 59 | # hipcc $(MPI_CFLAGS) $^ $(MPI_LIBS) -o $@ 60 | # endif 61 | 62 | 63 | ############################################################## 64 | # Link an OpenMP program with hipcc 65 | ############################################################## 66 | # Find the OpenMP lib 67 | # HIP_CLANG_LIB = $(shell hipconfig --hipclangpath)/../lib 68 | # omp: omp.o gpu.o 69 | # hipcc -fopenmp -Xlinker -rpath=$(HIP_CLANG_LIB) $^ -o $@ 70 | 71 | 72 | ## I could have chosen to build GPU programs with hipcc 73 | ## for both AMD and NVIDIA devices, but the hipcc options 74 | ## for NVIDIA are almost like calling nvcc directly... 75 | ## I might as well call nvcc directly and no need 76 | ## for HIP on NVIDIA architectures! 
77 | 78 | 79 | PROGS = mpi omp mpi+omp 80 | 81 | 82 | all: $(PROGS) 83 | 84 | 85 | mpi: mpi.o $(OBJS) 86 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 87 | mpicc $^ -o $@ $(HIP_LDFLAGS) 88 | else ifneq ($(strip $(HAVE_NVIDIA_GPUS)),) 89 | nvcc -ccbin mpicc -Xlinker -lcuda $^ -o $@ 90 | else 91 | mpicc $^ -o $@ 92 | endif 93 | 94 | omp: omp.o $(OBJS) 95 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 96 | $(CC) -fopenmp $^ -o $@ $(HIP_LDFLAGS) 97 | else ifneq ($(strip $(HAVE_NVIDIA_GPUS)),) 98 | nvcc $^ -Xcompiler -fopenmp -o $@ 99 | else 100 | $(CC) -fopenmp $^ -o $@ 101 | endif 102 | 103 | mpi+omp: mpi+omp.o $(OBJS) 104 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 105 | mpicc -fopenmp $^ -o $@ $(HIP_LDFLAGS) 106 | else ifneq ($(strip $(HAVE_NVIDIA_GPUS)),) 107 | nvcc -ccbin mpicc -Xcompiler -fopenmp -Xlinker -lcuda $^ -o $@ 108 | else 109 | mpicc -fopenmp $^ -o $@ 110 | endif 111 | 112 | 113 | ifneq ($(strip $(HAVE_AMD_GPUS)),) 114 | gpu.o: gpu.cpp affinity.h 115 | hipcc -c $< 116 | else 117 | gpu.o: gpu.cu affinity.h 118 | nvcc --Werror all-warnings -x cu -c $< 119 | endif 120 | 121 | omp.o: omp.c affinity.h 122 | $(CC) $(CFLAGS) $(GPU_FLAGS) -fopenmp -c $< 123 | 124 | mpi.o: mpi.c affinity.h 125 | mpicc $(CFLAGS) $(GPU_FLAGS) -c $< 126 | 127 | mpi+omp.o: mpi+omp.c affinity.h 128 | mpicc $(CFLAGS) $(GPU_FLAGS) -fopenmp -c $< 129 | 130 | cpu.o: cpu.c 131 | $(CC) $(CFLAGS) -c $< 132 | 133 | gpu.cpp: gpu.cu 134 | hipify-perl $< > $@ 135 | 136 | 137 | clean: 138 | rm -f *.o *~ $(PROGS) 139 | 140 | 141 | 142 | # gpu-hip.o: gpu-hip.cpp affinity.h 143 | # ifneq ($(strip $(HAVE_AMD_GPUS)),) 144 | # hipcc -g -c -o $@ $< 145 | # else 146 | # nvcc -I$(HIP_ROOT)/include -Xcompiler -DCUDA_ENABLE_DEPRECATED -c -o $@ $< 147 | # endif 148 | 149 | #/usr/tce/packages/cuda/cuda-10.1.243/nvidia/bin/nvcc -I/usr/tce/packages/hip/hip-3.0.0/include -Xcompiler -DCUDA_ENABLE_DEPRECATED -Xcompiler -DHIP_VERSION_MAJOR=3 -Xcompiler -DHIP_VERSION_MINOR=0 -Xcompiler -DHIP_VERSION_PATCH=0 -x cu square.hipref.cpp -Xlinker '"-rpath=/usr/tce/packages/cuda/cuda-10.1.243/nvidia/lib64:/usr/tce/packages/cuda/cuda-10.1.243"' 150 | 151 | -------------------------------------------------------------------------------- /gpu-tests/retrieve.cu: -------------------------------------------------------------------------------- 1 | /****************************************************** 2 | * Edgar A. 
Leon 3 | * Lawrence Livermore National Laboratory 4 | ******************************************************/ 5 | 6 | #include 7 | #ifdef HAVE_AMD_GPUS 8 | #include "hip/hip_runtime.h" 9 | #endif 10 | 11 | #define MAX_PCI_LEN 20 12 | 13 | void chooseDevPartial(int dev) 14 | { 15 | int odev=-1; 16 | int busId=-1, deviceId=-1, domainId=-1; 17 | char pci[MAX_PCI_LEN]; 18 | cudaDeviceProp prop; 19 | 20 | // Get selected device properties 21 | cudaDeviceGetPCIBusId(pci, MAX_PCI_LEN, dev); 22 | sscanf(pci, "%04x:%02x:%02x", &domainId, &busId, &deviceId); 23 | 24 | // Partially fill device properties and match 25 | memset(&prop, 0, sizeof(cudaDeviceProp)); 26 | prop.pciDomainID = domainId; 27 | prop.pciBusID = busId; 28 | prop.pciDeviceID = deviceId; 29 | 30 | cudaChooseDevice(&odev, &prop); 31 | printf("Partial match of device %d: device %d\n", dev, odev); 32 | printf("\tInput: DomainID=0x%x BusId=0x%x DeviceId=0x%x\n", 33 | domainId, busId, deviceId); 34 | if (dev != odev) 35 | printf("\tError: ChooseDevice did not match the correct device\n"); 36 | } 37 | 38 | void chooseDevFull(int dev) 39 | { 40 | int odev=-1; 41 | cudaDeviceProp prop; 42 | 43 | // Get all device properties 44 | cudaGetDeviceProperties(&prop, dev); 45 | 46 | cudaChooseDevice(&odev, &prop); 47 | printf("Full match of device %d: device %d\n", dev, odev); 48 | printf("\tInput: DomainID=0x%x BusId=0x%x DeviceId=0x%x\n", 49 | prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); 50 | #ifndef HAVE_AMD_GPUS 51 | // HIP does not have a uuid field! 52 | printf("\t UUID=0x%x\n", prop.uuid); 53 | #endif 54 | 55 | if (dev != odev) 56 | printf("\tError: ChooseDevice did not match the correct device\n"); 57 | } 58 | 59 | void getDevByPCI(int dev, char *pci) 60 | { 61 | int pciBusID=-1, pciDeviceID=-1, pciDomainID=-1; 62 | int odev=-1; 63 | 64 | sscanf(pci, "%04x:%02x:%02x", &pciDomainID, &pciBusID, &pciDeviceID); 65 | 66 | // PCI ID: String in one of the following forms: 67 | // [domain]:[bus]:[device].[function] 68 | // [domain]:[bus]:[device] 69 | // [bus]:[device].[function] 70 | // where domain, bus, device, and function are all hex values 71 | cudaDeviceGetByPCIBusId(&odev, pci); 72 | 73 | printf("GetbyPCI match of device %d: device %d\n", dev, odev); 74 | printf("\tInput: DomainID=0x%x BusId=0x%x DeviceId=0x%x\n", 75 | pciDomainID, pciBusID, pciDeviceID); 76 | if (odev != dev) 77 | printf("Error: GetByPCI did not match the correct device\n"); 78 | } 79 | 80 | 81 | int main(int argc, char *argv[]) 82 | { 83 | int dev, ndevs; 84 | char pci[MAX_PCI_LEN]; 85 | 86 | 87 | cudaGetDeviceCount(&ndevs); 88 | if (ndevs <= 0) { 89 | printf("No devices found\n"); 90 | return 0; 91 | } 92 | 93 | // Select input device 94 | // Avoid choosing device 0, if possible, to enhance testing 95 | // dev = 1; 96 | dev = ndevs-1; 97 | 98 | cudaDeviceGetPCIBusId(pci, MAX_PCI_LEN, dev); 99 | printf("PCI ID of device %d = %s\n", dev, pci); 100 | 101 | getDevByPCI(dev, pci); 102 | 103 | chooseDevPartial(dev); 104 | 105 | chooseDevFull(dev); 106 | 107 | cudaSetDevice(dev); 108 | 109 | return 0; 110 | } 111 | 112 | -------------------------------------------------------------------------------- /gpu-tests/simple.cpp: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * Edgar A. 
Leon 3 | * Lawrence Livermore National Laboratory 4 | ***********************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #define STR_SIZE 100 11 | 12 | void check_devices(char *buf) 13 | { 14 | hipDevice_t mydev; 15 | hipDeviceProp_t devProp; 16 | int i, ndevs, myid; 17 | char pciBusId[STR_SIZE] = ""; 18 | int nc = 0; 19 | 20 | hipGetDeviceCount(&ndevs); 21 | nc += sprintf(buf+nc, "Num devices: %d\n", ndevs); 22 | 23 | hipGetDevice(&myid); 24 | hipDeviceGet(&mydev, myid); 25 | hipDeviceGetPCIBusId(pciBusId, STR_SIZE, mydev); 26 | nc += sprintf(buf+nc, "Default device: %s\n", pciBusId); 27 | 28 | for (i=0; i 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #ifdef HAVE_AMD_GPUS 14 | #include "hip/hip_runtime.h" 15 | #endif 16 | 17 | #define MAX_PCI_LEN 20 18 | #define MAX_STR_LEN 512 19 | 20 | 21 | int obj_attr_snprintf(char *str, size_t size, hwloc_obj_t obj, 22 | int verbose) 23 | { 24 | int nc=0; 25 | 26 | if (obj->type == HWLOC_OBJ_OS_DEVICE) 27 | switch (obj->attr->osdev.type) { 28 | case HWLOC_OBJ_OSDEV_COPROC : 29 | nc += hwloc_obj_type_snprintf(str+nc, size-nc, obj, 1); 30 | nc += snprintf(str+nc, size-nc, ": name=%s ", obj->name); 31 | nc += snprintf(str+nc, size-nc, "subtype=%s ", obj->subtype); 32 | nc += snprintf(str+nc, size-nc, "GPUModel=%s ", 33 | hwloc_obj_get_info_by_name(obj, "GPUModel")); 34 | nc += snprintf(str+nc, size-nc, " "); 35 | /* Get obj->infos in one shot */ 36 | nc += hwloc_obj_attr_snprintf(str+nc, size-nc, obj, " ", verbose); 37 | break; 38 | 39 | default: 40 | break; 41 | } 42 | 43 | 44 | return nc; 45 | } 46 | 47 | 48 | 49 | void set_vis_devs(char *str) 50 | { 51 | // Don't invoke any GPU calls before resetting the environment! 52 | // Otherwise, there's no effect of setting VISIBLE_DEVICES. 53 | //cudaGetDeviceCount(&ndevs); 54 | //printf("Initial num. 
devices %d\n", ndevs); 55 | 56 | printf("Resetting environment to devices %s\n", str); 57 | unsetenv("ROCR_VISIBLE_DEVICES"); 58 | unsetenv("HIP_VISIBLE_DEVICES"); 59 | unsetenv("CUDA_VISIBLE_DEVICES"); 60 | #ifdef HAVE_AMD_GPUS 61 | setenv("ROCR_VISIBLE_DEVICES", str, 1); 62 | #else 63 | setenv("CUDA_VISIBLE_DEVICES", str, 1); 64 | #endif 65 | } 66 | 67 | 68 | void print_devices(hwloc_topology_t topo) 69 | { 70 | char str[MAX_STR_LEN]; 71 | hwloc_obj_t obj = NULL; 72 | while ( (obj=hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, obj)) != NULL ) 73 | if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_COPROC) { 74 | str[0] = '\0'; 75 | obj_attr_snprintf(str, MAX_STR_LEN, obj, 0); 76 | printf("%s\n", str); 77 | } 78 | } 79 | 80 | int get_list_len(char *lst) 81 | { 82 | // Copy VISDEVS string since strtok modifies the input string 83 | char tmp[strlen(lst)]; 84 | strcpy(tmp, lst); 85 | 86 | /* Get list size */ 87 | int idevs = 0; 88 | char *token = strtok(tmp, ","); 89 | while( token != NULL ) { 90 | idevs++; 91 | token = strtok(NULL, ","); 92 | } 93 | 94 | return idevs; 95 | } 96 | 97 | 98 | void test_wdup(char *visdevs, hwloc_topology_t topo) 99 | { 100 | set_vis_devs(visdevs); 101 | 102 | hwloc_topology_t topo2; 103 | printf("Duplicating the topology\n"); 104 | hwloc_topology_dup(&topo2, topo); 105 | 106 | set_vis_devs(visdevs); 107 | 108 | print_devices(topo2); 109 | hwloc_topology_destroy(topo2); 110 | } 111 | 112 | void test_wfork(char *vds) 113 | { 114 | set_vis_devs(vds); 115 | pid_t cpid = fork(); 116 | 117 | if (cpid == 0) { 118 | unsetenv("ROCR_VISIBLE_DEVICES"); 119 | unsetenv("HIP_VISIBLE_DEVICES"); 120 | printf("Child:\n"); 121 | set_vis_devs(vds); 122 | 123 | hwloc_topology_t topo; 124 | hwloc_topology_init(&topo); 125 | hwloc_topology_set_io_types_filter(topo, 126 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 127 | hwloc_topology_load(topo); 128 | print_devices(topo); 129 | hwloc_topology_destroy(topo); 130 | 131 | exit(0); 132 | } else if (cpid > 0) { 133 | printf("Parent: Nothing to do but wait...\n"); 134 | wait(NULL); 135 | } else { 136 | printf("fork() failed\n"); 137 | } 138 | } 139 | 140 | void test_wnew_topo(char *vds) 141 | { 142 | set_vis_devs(vds); 143 | 144 | hwloc_topology_t topo; 145 | hwloc_topology_init(&topo); 146 | hwloc_topology_set_io_types_filter(topo, 147 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 148 | hwloc_topology_load(topo); 149 | print_devices(topo); 150 | hwloc_topology_destroy(topo); 151 | } 152 | 153 | 154 | void test_wdev_api(char *vds) 155 | { 156 | int i, odevs=-1; 157 | /* Cannot call the device driver before settting 158 | VISIBLE DEVICES. Otherwise, the devices are set 159 | and cannot be changed */ 160 | //cudaGetDeviceCount(&odevs); 161 | //printf("Modified num. devices %d\n", odevs); 162 | 163 | set_vis_devs(vds); 164 | cudaGetDeviceCount(&odevs); 165 | printf("Modified num. devices %d\n", odevs); 166 | 167 | /* Get device PCI ID */ 168 | char pci[MAX_PCI_LEN]; 169 | for (i=0; i 0) { 208 | printf("Parent: Nothing to do but wait...\n"); 209 | wait(NULL); 210 | } else { 211 | printf("fork() failed\n"); 212 | } 213 | } 214 | 215 | 216 | 217 | /* Lessons learned: 218 | 1. Setting VISIBLE DEVICES in the context of hwloc: 219 | The environmnet variables must be set before the 220 | first time the topology is loaded. 221 | 2. Setting VISIBLE DEVICES in the context of device API calls: 222 | The environment variables must be called before the 223 | first invocation of a device function. 224 | 3. 
Using fork does not really allows to overwrite the points 225 | above. 226 | 4. hwloc loading a topology has the same effect as calling 227 | a device function, i.e., after this setting VISIBLE 228 | DEVICES is too late. 229 | */ 230 | 231 | int main(int argc, char *argv[]) 232 | { 233 | char vds[] = "1"; 234 | //int idevs = get_list_len(vds); 235 | 236 | hwloc_topology_t topo; 237 | hwloc_topology_init(&topo); 238 | /* OS devices are filtered by default, enable to see GPUs */ 239 | hwloc_topology_set_type_filter(topo, HWLOC_OBJ_OS_DEVICE, 240 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 241 | /* Include PCI devices to determine whether two GPUs 242 | are the same device, i.e., opencl1d1 and cuda1 */ 243 | hwloc_topology_set_type_filter(topo, HWLOC_OBJ_PCI_DEVICE, 244 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 245 | 246 | /* Setting visible devices must be done before 247 | loading the topology the first time! */ 248 | set_vis_devs(vds); 249 | 250 | /* If testing whether VISIBLE DEVICES work with 251 | the device API functions, don't load the topology 252 | because this set the devices and can't be changed later */ 253 | hwloc_topology_load(topo); 254 | //print_devices(topo); 255 | 256 | 257 | #if 1 258 | test_wnew_topo(vds); 259 | #endif 260 | #if 0 261 | test_wdup(vds, topo); 262 | #endif 263 | #if 0 264 | test_wfork(vds); 265 | #endif 266 | #if 0 267 | test_wdev_api(vds); 268 | #endif 269 | #if 0 270 | test_wfork_api(vds); 271 | #endif 272 | 273 | hwloc_topology_destroy(topo); 274 | 275 | return 0; 276 | } -------------------------------------------------------------------------------- /gpu-tests/visdevs.cu: -------------------------------------------------------------------------------- 1 | /****************************************************** 2 | * Edgar A. Leon 3 | * Lawrence Livermore National Laboratory 4 | ******************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef HAVE_AMD_GPUS 11 | #include "hip/hip_runtime.h" 12 | #endif 13 | 14 | #define MAX_PCI_LEN 20 15 | 16 | int main(int argc, char *argv[]) 17 | { 18 | char str[] = "1,7"; 19 | 20 | // Copy VISDEVS string since strtok modifies the input string 21 | char tmp[strlen(str)]; 22 | strcpy(tmp, str); 23 | 24 | /* Get list size */ 25 | int i, idevs = 0; 26 | char *token = strtok(tmp, ","); 27 | while( token != NULL ) { 28 | idevs++; 29 | token = strtok(NULL, ","); 30 | } 31 | 32 | /* Convert VISDEVS list into ints */ 33 | //int i=0, visdevs[idevs]; 34 | //strcpy(tmp, str); 35 | //token = strtok(tmp, ","); 36 | //while( token != NULL ) { 37 | // visdevs[i++] = atoi(token); 38 | // token = strtok(NULL, ","); 39 | //} 40 | 41 | // Don't invoke any GPU calls before resetting the environment! 42 | // Otherwise, there's no effect of setting VISIBLE_DEVICES. 43 | //cudaGetDeviceCount(&ndevs); 44 | //printf("Initial num. devices %d\n", ndevs); 45 | 46 | printf("Resetting environment to devices %s\n", str); 47 | unsetenv("ROCR_VISIBLE_DEVICES"); 48 | unsetenv("HIP_VISIBLE_DEVICES"); 49 | unsetenv("CUDA_VISIBLE_DEVICES"); 50 | #ifdef HAVE_AMD_GPUS 51 | setenv("ROCR_VISIBLE_DEVICES", str, 1); 52 | #else 53 | setenv("CUDA_VISIBLE_DEVICES", str, 1); 54 | #endif 55 | 56 | int odevs=-1; 57 | cudaGetDeviceCount(&odevs); 58 | printf("Modified num. 
devices %d\n", odevs); 59 | 60 | /* Get device PCI ID */ 61 | char pci[MAX_PCI_LEN]; 62 | for (i=0; i $@ 25 | 26 | # After installation modify mpibind.py to point to libmpibind in the install tree 27 | install-data-hook: 28 | sed -i.tmp 's|$(abs_top_srcdir)/src/.libs/$(mpibind_lib_name)|$(libdir)/$(mpibind_lib_name)|g' \ 29 | $(datadir)/mpibind.py && \ 30 | rm $(datadir)/mpibind.py.tmp 31 | 32 | CLEANFILES = mpibind.py 33 | 34 | endif 35 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # The Python interface 2 | 3 | `mpibind` for Python enables the use of the mpibind algorithm in 4 | arbitrary Python programs. 5 | 6 | ## Building and installing 7 | 8 | ### Spack 9 | 10 | The easiest way to build and install the Python interface is through 11 | `spack`: 12 | ``` 13 | spack install mpibind+python 14 | spack load mpibind 15 | ``` 16 | 17 | ### Autotools 18 | 19 | Otherwise, use the Autotools process described at the top 20 | directory. Basically, the Python bindings are built provided 21 | that Python 3 and CFFI are present at `configure` time. Let's assume 22 | that mpibind's installation directory is `install_dir`. 23 | 24 | Options available to install the Python interface: 25 | 26 | * Add the bindings to the Python path 27 | ``` 28 | export PYTHONPATH=$PYTHONPATH:install_dir/share 29 | ``` 30 | * Use `setup.py` 31 | ``` 32 | cd install_dir/share 33 | python setup.py install 34 | ``` 35 | 36 | ### Dependencies 37 | 38 | * Python 3 39 | * The C Foreign Function Interface for Python 40 | ([CFFI](https://cffi.readthedocs.io/en/latest/)) 41 | * [Pycotap](https://pypi.org/project/pycotap/) (for unit testing) 42 | 43 | 44 | ## Usage 45 | 46 | Here is a simple [program](test-simple.py) that demonstrates the Python 47 | interface. 48 | 49 | ```python 50 | import os 51 | import mpibind 52 | 53 | # This simple example does not use MPI, thus 54 | # specify my rank and total number of tasks 55 | rank = 2 56 | ntasks_per_node = 4 57 | 58 | # Is sched_getaffinity supported? 59 | affinity = True if hasattr(os, 'sched_getaffinity') else False 60 | 61 | if affinity: 62 | cpus = sorted(os.sched_getaffinity(0)) 63 | affstr = "\n>Before\n" 64 | affstr += "Running on {:2d} cpus: {}\n".format(len(cpus), cpus) 65 | 66 | # Create a handle 67 | # Num tasks is a required parameter 68 | handle = mpibind.MpibindHandle(ntasks=ntasks_per_node) 69 | 70 | # Create the mapping 71 | handle.mpibind() 72 | 73 | # Print the mapping 74 | handle.mapping_print() 75 | 76 | # Apply the mapping as if I am worker 'rank' 77 | # This function is not supported on some platforms 78 | if affinity: 79 | handle.apply(rank) 80 | cpus = sorted(os.sched_getaffinity(0)) 81 | print(affstr + ">After\n" + 82 | "Running on {:2d} cpus: {}".format(len(cpus), cpus)) 83 | ``` 84 | 85 | Running it on a dual-socket system with 18x2 SMT-2 cores results in the 86 | output below. Note that the resulting mapping uses only the first socket 87 | because `mpibind` optimizes placement for GPUs by default (configurable 88 | parameter) and both GPUs are located on the first socket.
89 | 90 | ```bash 91 | $ python3 test-simple.py 92 | mpibind: task 0 nths 4 gpus 0 cpus 0-4 93 | mpibind: task 1 nths 4 gpus 0 cpus 5-9 94 | mpibind: task 2 nths 4 gpus 1 cpus 10-13 95 | mpibind: task 3 nths 4 gpus 1 cpus 14-17 96 | 97 | >Before 98 | Task 2: Running on 72 cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71] 99 | >After 100 | Task 2: Running on 4 cpus: [10, 11, 12, 13] 101 | ``` 102 | 103 | A more realistic example that uses MPI is provided in 104 | [test-mpi.py](test-mpi.py). This program uses `mpi4py` so make sure it is 105 | installed on your system, e.g., `pip install mpi4py`. It can be run as follows: 106 | 107 | ```bash 108 | $ srun -N2 -n8 python3 test-mpi.py 109 | pascal7 task 0/8: lrank 0/4 nths 4 gpus ['0'] cpus [0, 1, 2, 3, 4] 110 | pascal7 task 1/8: lrank 1/4 nths 4 gpus ['0'] cpus [5, 6, 7, 8, 9] 111 | pascal7 task 2/8: lrank 2/4 nths 4 gpus ['1'] cpus [10, 11, 12, 13] 112 | pascal7 task 3/8: lrank 3/4 nths 4 gpus ['1'] cpus [14, 15, 16, 17] 113 | pascal8 task 4/8: lrank 0/4 nths 4 gpus ['0'] cpus [0, 1, 2, 3, 4] 114 | pascal8 task 5/8: lrank 1/4 nths 4 gpus ['0'] cpus [5, 6, 7, 8, 9] 115 | pascal8 task 6/8: lrank 2/4 nths 4 gpus ['1'] cpus [10, 11, 12, 13] 116 | pascal8 task 7/8: lrank 3/4 nths 4 gpus ['1'] cpus [14, 15, 16, 17] 117 | ``` 118 | 119 | ## Unit tests 120 | 121 | Unit tests are located in [test-suite/python](../test-suite/python) and can be 122 | launched from the top directory with `make check`. We use `pycotap` to emit the 123 | Test Anything Protocol (TAP) from the Python tests. Make sure `pycotap` is 124 | installed, e.g., `pip install pycotap` before running `configure` from the top 125 | directory. 126 | 127 | Two modifications are required to add a Python test. 128 | 129 | 1. Create a new test file under [test-suite/python](../test-suite/python) 130 | 2. Add the new test file to the `PYTHON_TESTS` variable in 131 | [test-suite/Makefile.am](../test-suite/Makefile.am) 132 | 133 | 134 | ## Development 135 | 136 | We use CFFI to build mpibind for Python. 137 | CFFI is a Python library that allows building Python wrappers for C 138 | code. CFFI allows for several modes of interaction between C and 139 | Python: API vs ABI and out-of-line vs in-line. For mpibind, we use 140 | CFFI in ABI, in-line mode. 141 | 142 | Exposing an mpibind C function to Python requires two modifications to 143 | [mpibind.py.in](mpibind.py.in) 144 | 145 | 1. Add the C function definition to the `cdef` argument 146 | 2. Add a wrapper for the function to the class `MpibindHandle` 147 | 148 | -------------------------------------------------------------------------------- /python/mpibind_map.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | # Edgar A. Leon 3 | # Lawrence Livermore National Laboratory 4 | # 5 | # This is a wrapper of mpibind functions to easily 6 | # get an application's mapping to the hardware 7 | # in the context of MPI. 8 | # 9 | # This wrapper calls mpibind once per compute node 10 | # so that the hardware topology is discovered once 11 | # rather than n times, where n is the number of 12 | # processes per node. 
13 | # 14 | ################################################### 15 | 16 | 17 | ## Todo: Add a variable number of parameters to this 18 | ## function and pass them to MpibindHandle(). 19 | def mpibind_get_mapping(verbose=False): 20 | '''Get the mpibind mapping of an MPI program. 21 | The return value is a dictionary with the keys 22 | nthreads, cpus, and gpus.''' 23 | from mpi4py import MPI 24 | import mpibind 25 | import re 26 | 27 | comm = MPI.COMM_WORLD 28 | size = comm.Get_size() 29 | rank = comm.Get_rank() 30 | name = MPI.Get_processor_name() 31 | 32 | ## Get a leader for each compute node 33 | match = re.search('\d+', name) 34 | if not match: 35 | print("mpibind: Could not determine node id") 36 | return None 37 | 38 | nodeid = int(match.group()) 39 | node_comm = comm.Split(color=nodeid, key=rank) 40 | node_rank = node_comm.Get_rank() 41 | node_size = node_comm.Get_size() 42 | 43 | ## One task per node calculates the mapping. 44 | ## This is not a hard requirement, but it is 45 | ## more efficient than every process discovering 46 | ## the topology of the compute node. 47 | if node_rank == 0: 48 | # Create an mpibind handle, 'ntasks' is a required parameter 49 | # See 'help(mpibind.MpibindHandle)' for detailed usage 50 | handle = mpibind.MpibindHandle(ntasks=node_size) 51 | 52 | # Create the mapping 53 | handle.mpibind() 54 | #handle.mapping_print() 55 | 56 | # Distribute the mapping 57 | nthreads = handle.nthreads 58 | cpus = handle.get_cpus_ptask(0) 59 | gpus = handle.get_gpus_ptask(0) 60 | for i in range(1, node_size): 61 | node_comm.send(handle.get_cpus_ptask(i), dest=i) 62 | node_comm.send(handle.get_gpus_ptask(i), dest=i) 63 | else: 64 | nthreads = None 65 | cpus = node_comm.recv(source=0) 66 | gpus = node_comm.recv(source=0) 67 | 68 | nthreads = node_comm.scatter(nthreads, root=0) 69 | 70 | if verbose: 71 | print('{} task {}/{}: lrank {}/{} nths {} gpus {} cpus {}'\ 72 | .format(name, rank, size, node_rank, node_size, 73 | nthreads, gpus, cpus)) 74 | 75 | return {"nthreads": nthreads, "cpus": cpus, "gpus": gpus} 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | # https://packaging.python.org/guides/distributing-packages-using-setuptools/#setup-py 4 | 5 | longtext = """Python bindings for mpibind, a memory-driven algorithm to map parallel hybrid 6 | applications to the underlying hardware resources transparently, 7 | efficiently, and portably.""" 8 | 9 | setup( 10 | name='mpibind', 11 | version='0.5.0', 12 | author='LLNL', 13 | url='https://github.com/LLNL/mpibind', 14 | description='Memory-First Affinity Scheduler', 15 | long_description=longtext, 16 | keywords='affinity, NUMA, hybrid applications, heterogeneous systems', 17 | py_modules=['mpibind'], 18 | # install_requires=['cffi'], 19 | platforms=['posix'], 20 | license='MIT', 21 | classifiers=[ 22 | 'Development Status :: 4 - Beta', 23 | 'License :: OSI Approved :: MIT License', 24 | 'Operating System :: POSIX :: Linux', 25 | 'Programming Language :: Python :: 3', 26 | ], 27 | ) 28 | -------------------------------------------------------------------------------- /python/test-mpi.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | # Edgar A. 
Leon 3 | # Lawrence Livermore National Laboratory 4 | ################################################### 5 | 6 | # There is an issue when mpi4py implicitly calls 7 | # MPI_Finalize and loading the mpibind module, 8 | # which leads to a segmentation fault. To avoid this, 9 | # make sure Finalize is not called automatically. 10 | import mpi4py 11 | #mpi4py.rc.threads = True # thread support 12 | #mpi4py.rc.thread_level = "funneled" # thread support level 13 | #mpi4py.rc.initialize = False # do not initialize MPI automatically 14 | mpi4py.rc.finalize = False # do not finalize MPI automatically 15 | from mpi4py import MPI 16 | 17 | from mpibind_map import mpibind_get_mapping 18 | 19 | 20 | # The search path for Python modules 21 | #import sys 22 | #print("sys.path:") 23 | #print(sys.path) 24 | 25 | # Path to the MPI module 26 | #print("MPI module:") 27 | #print(MPI.__file__) 28 | 29 | 30 | # Print either the mapping from mpibind (true) 31 | # or the actual mapping from the runtime system (false) 32 | mpibind_verbose = True 33 | 34 | 35 | ## Is sched_getaffinity supported? 36 | import os 37 | affinity = True if hasattr(os, 'sched_getaffinity') else False 38 | 39 | 40 | comm = MPI.COMM_WORLD 41 | size = comm.Get_size() 42 | rank = comm.Get_rank() 43 | # if rank == 0: 44 | # (version, subversion) = MPI.Get_version() 45 | # print("Using MPI {}.{}".format(version, subversion)) 46 | 47 | 48 | # Get the mapping 49 | # mapping["nthreads"]: The number of threads this process can launch 50 | # mapping["cpus"]: The CPUs assigned to this process 51 | # mapping["gpus"]: The GPUs assigned to this process 52 | mapping = mpibind_get_mapping(mpibind_verbose) 53 | 54 | 55 | ## Apply the CPU mapping 56 | if affinity: 57 | pid = 0 58 | cpus = sorted(os.sched_getaffinity(pid)) 59 | affstr = "{:2d}/{:2d} was running on {:2d} cpus: {}\n"\ 60 | .format(rank, size, len(cpus), cpus) 61 | 62 | os.sched_setaffinity(pid, mapping["cpus"]) 63 | 64 | cpus = sorted(os.sched_getaffinity(pid)) 65 | if not mpibind_verbose: 66 | print(affstr + " now running on {:2d} cpus: {}"\ 67 | .format(len(cpus), cpus)) 68 | 69 | 70 | ## Use mapping["gpus"] to launch work on GPUs 71 | ## ... 72 | -------------------------------------------------------------------------------- /python/test-simple.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | # Edgar A. Leon 3 | # Lawrence Livermore National Laboratory 4 | ################################################### 5 | 6 | import os 7 | import mpibind 8 | 9 | # This simple example does not use MPI, thus 10 | # specify my rank and total number of tasks 11 | rank = 2 12 | ntasks_per_node = 4 13 | 14 | # Is sched_getaffinity supported? 
15 | affinity = True if hasattr(os, 'sched_getaffinity') else False 16 | 17 | if affinity: 18 | cpus = sorted(os.sched_getaffinity(0)) 19 | affstr = "\n>Before\n" 20 | affstr += "{}: Running on {:2d} cpus: {}\n"\ 21 | .format(rank, len(cpus), cpus) 22 | 23 | # Create a handle 24 | # Num tasks is a required parameter 25 | handle = mpibind.MpibindHandle(ntasks=ntasks_per_node) 26 | 27 | # Create the mapping 28 | handle.mpibind() 29 | 30 | # Print the mapping 31 | handle.mapping_print() 32 | 33 | # Apply the mapping as if I am worker 'rank' 34 | # This function is not supported on some platforms 35 | if affinity: 36 | handle.apply(rank) 37 | cpus = sorted(os.sched_getaffinity(0)) 38 | print(affstr + ">After\n" + 39 | "{}: Running on {:2d} cpus: {}"\ 40 | .format(rank, len(cpus), cpus)) 41 | 42 | -------------------------------------------------------------------------------- /slurm/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | ####################################################### 3 | # Slurm plugin 4 | ####################################################### 5 | 6 | if HAVE_SLURM 7 | mpibindmod_LTLIBRARIES = mpibind_slurm.la 8 | endif 9 | 10 | mpibind_slurm_la_SOURCES = plugin.c 11 | mpibind_slurm_la_CFLAGS = -Wall -Werror -I$(top_srcdir)/src 12 | mpibind_slurm_la_CFLAGS += $(HWLOC_CFLAGS) $(SLURM_CFLAGS) 13 | # Need slurm/spank.h 14 | # SLURM_CFLAGS is empty because 'pkg-config --cflags slurm' 15 | # is empty: default paths, e.g., /usr/include, are not returned! 16 | # Because the Slurm headers/libs are on standard locations, 17 | # e.g., /usr/include, everything works. 18 | # mpibind_slurm_la_CFLAGS += -I$(SLURM_INCDIR) $(HWLOC_CFLAGS) 19 | mpibind_slurm_la_LIBADD = $(top_builddir)/src/libmpibind.la 20 | mpibind_slurm_la_LDFLAGS = -module 21 | -------------------------------------------------------------------------------- /slurm/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## The mpibind Slurm Plugin 3 | 4 | The `mpibind_slurm.so` library is a SPANK plugin that enables using 5 | mpibind in Slurm to map parallel codes to the hardware. 6 | 7 | ### Requirements 8 | 9 | The file `slurm/spank.h` is necessary to build the plugin. This file is distributed with Slurm. 10 | 11 | ### Building and installing 12 | 13 | The building system looks for a Slurm installation using `pkg-config` and, if 14 | found, the plugin is built and installed here: 15 | ``` 16 | /lib/mpibind/ 17 | # which can be obtained with the command 18 | pkg-config --variable=plugindir mpibind 19 | ``` 20 | 21 | To install the plugin into your Slurm installation, add the following 22 | line to the `plugstack.conf` file: 23 | ``` 24 | required /lib/mpibind/mpibind_slurm.so 25 | ``` 26 | The plugin configuration options are below. Separate multiple options with commas. 27 | ``` 28 | # Disable the plugin by default 29 | # To use mpibind add --mpibind=on to srun 30 | default_off 31 | 32 | # By default, mpibind is enabled only on full-node allocations 33 | # This option enables mpibind on partial-node allocations as well 34 | exclusive_only_off 35 | ``` 36 | For example: 37 | ``` 38 | required /lib/mpibind/mpibind_slurm.so default_off 39 | ``` 40 | ### Usage 41 | 42 | mpibind can be used with the `srun` command as follows. 
43 | 44 | ``` 45 | Automatically map tasks/threads/GPU kernels to heterogeneous hardware 46 | 47 | Usage: --mpibind=[args] 48 | 49 | where args is a comma-separated list of one or more of the following: 50 | gpu[:0|1] Enable(1)/disable(0) GPU-optimized mappings 51 | greedy[:0|1] Allow(1)/disallow(0) multiple NUMAs per task 52 | help Display this message 53 | off Disable mpibind 54 | on Enable mpibind 55 | smt:<n> Enable worker use of SMT-<n> 56 | v[erbose] Print affinity for each task 57 | ``` 58 | 59 | For example: 60 | 61 | ``` 62 | # Turn off mpibind to check the mapping without it 63 | $ srun -N1 -n8 --mpibind=off ./mpi 64 | mutt29 Task 0/ 8 running on 224 CPUs: 0-223 65 | mutt29 Task 1/ 8 running on 224 CPUs: 0-223 66 | mutt29 Task 2/ 8 running on 224 CPUs: 0-223 67 | mutt29 Task 3/ 8 running on 224 CPUs: 0-223 68 | mutt29 Task 4/ 8 running on 224 CPUs: 0-223 69 | mutt29 Task 5/ 8 running on 224 CPUs: 0-223 70 | mutt29 Task 6/ 8 running on 224 CPUs: 0-223 71 | mutt29 Task 7/ 8 running on 224 CPUs: 0-223 72 | 73 | # mpibind should be on by default, but can be enabled explicitly 74 | $ srun -N1 -n8 --mpibind=on ./mpi 75 | mutt29 Task 0/ 8 running on 14 CPUs: 0-13 76 | mutt29 Task 1/ 8 running on 14 CPUs: 14-27 77 | mutt29 Task 2/ 8 running on 14 CPUs: 28-41 78 | mutt29 Task 3/ 8 running on 14 CPUs: 42-55 79 | mutt29 Task 4/ 8 running on 14 CPUs: 56-69 80 | mutt29 Task 5/ 8 running on 14 CPUs: 70-83 81 | mutt29 Task 6/ 8 running on 14 CPUs: 84-97 82 | mutt29 Task 7/ 8 running on 14 CPUs: 98-111 83 | 84 | # Get the mapping from mpibind itself 85 | $ srun -N1 -n8 --mpibind=v /bin/true 86 | mpibind: 0 GPUs on this node 87 | mpibind: task 0 nths 14 gpus cpus 0-13 88 | mpibind: task 1 nths 14 gpus cpus 14-27 89 | mpibind: task 2 nths 14 gpus cpus 28-41 90 | mpibind: task 3 nths 14 gpus cpus 42-55 91 | mpibind: task 4 nths 14 gpus cpus 56-69 92 | mpibind: task 5 nths 14 gpus cpus 70-83 93 | mpibind: task 6 nths 14 gpus cpus 84-97 94 | mpibind: task 7 nths 14 gpus cpus 98-111 95 | ``` 96 | 97 | ### Environment variables 98 | 99 | ``` 100 | # The type of resource to restrict mpibind to 101 | MPIBIND_RESTRICT_TYPE=<cpu|mem> 102 | 103 | # Restrict mpibind to a list of CPUs or NUMA domains 104 | MPIBIND_RESTRICT=<ids> 105 | 106 | # The hwloc topology file, in XML format, matching the cluster's topology 107 | MPIBIND_TOPOFILE=<file> 108 | ``` 109 | 110 | To restrict mpibind to a subset of the node resources, MPIBIND_RESTRICT must be defined with the resource IDs. Optionally, MPIBIND_RESTRICT_TYPE can be specified with the type of resource: CPUs or NUMA memory (the default is CPUs). 111 | 112 | 113 | For example, restrict mpibind to the third and fourth NUMA domains: 114 | 115 | ``` 116 | $ export MPIBIND_RESTRICT_TYPE=mem 117 | 118 | $ export MPIBIND_RESTRICT=2,3 119 | 120 | $ srun --mpibind=on -N1 -n4 ./mpi 121 | mutt124 Task 0/ 4 running on 7 CPUs: 28-34 122 | mutt124 Task 1/ 4 running on 7 CPUs: 35-41 123 | mutt124 Task 2/ 4 running on 7 CPUs: 42-48 124 | mutt124 Task 3/ 4 running on 7 CPUs: 49-55 125 | ``` 126 | 127 | To have mpibind use a topology file defining the node architecture, one can use MPIBIND_TOPOFILE. This can speed up launch time since the topology does not need to be discovered for every srun command. 128 | 129 | Notes: 130 | * This variable may already be defined in the user's environment. To check, use `printenv MPIBIND_TOPOFILE` 131 | * The topology file *must* match the node architecture where mpibind is run. Otherwise, the job may fail due to invalid mapping assignments.
132 | * To generate a topology file, run `hwloc` on a compute node as follows `lstopo .xml` 133 | 134 | For example: 135 | 136 | ``` 137 | # Generate the topology file using hwloc 138 | $ lstopo mutt.xml 139 | 140 | # Run mpibind on the same node architecture as where the file was created 141 | $ export MPIBIND_TOPOFILE=mutt.xml 142 | 143 | $ srun -N1 -n8 --mpibind=v /bin/true 144 | mpibind: 0 GPUs on this node 145 | mpibind: task 0 nths 14 gpus cpus 0-13 146 | mpibind: task 1 nths 14 gpus cpus 14-27 147 | mpibind: task 2 nths 14 gpus cpus 28-41 148 | mpibind: task 3 nths 14 gpus cpus 42-55 149 | mpibind: task 4 nths 14 gpus cpus 56-69 150 | mpibind: task 5 nths 14 gpus cpus 70-83 151 | mpibind: task 6 nths 14 gpus cpus 84-97 152 | mpibind: task 7 nths 14 gpus cpus 98-111 153 | ``` 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | AM_CPPFLAGS = -Wall -Werror $(HWLOC_CFLAGS) 3 | 4 | ####################################################### 5 | # libmpibind 6 | ####################################################### 7 | lib_LTLIBRARIES = libmpibind.la 8 | 9 | libmpibind_la_SOURCES = \ 10 | mpibind.c mpibind-priv.h \ 11 | internals.c \ 12 | hwloc_utils.c hwloc_utils.h 13 | 14 | include_HEADERS = mpibind.h 15 | libmpibind_la_LIBADD = $(HWLOC_LIBS) 16 | #libmpibind_la_CPPFLAGS = -Wall -Werror $(HWLOC_CFLAGS) 17 | 18 | ####################################################### 19 | # Program using libmpibind and other auxiliary progs 20 | ####################################################### 21 | 22 | noinst_PROGRAMS = main hwloc_tests 23 | 24 | hwloc_tests_SOURCES = hwloc_tests.c hwloc_utils.c hwloc_utils.h 25 | hwloc_tests_LDADD = $(HWLOC_LIBS) 26 | # Rename hwloc_utils object file since it is used by both 27 | # a libtool library and a non-libtool target. 28 | hwloc_tests_CFLAGS = $(AM_CFLAGS) 29 | #hwloc_tests_CPPFLAGS = -Wall -Werror $(HWLOC_CFLAGS) 30 | 31 | main_SOURCES = main.c mpibind.h 32 | main_LDADD = libmpibind.la $(HWLOC_LIBS) 33 | 34 | -------------------------------------------------------------------------------- /src/hwloc_tests.c: -------------------------------------------------------------------------------- 1 | /****************************************************** 2 | * Edgar A. 
Leon 3 | * Lawrence Livermore National Laboratory 4 | ******************************************************/ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "hwloc_utils.h" 12 | 13 | 14 | 15 | 16 | void get_cpuset_of_nobjs(hwloc_topology_t topo, 17 | int nobjs, hwloc_obj_type_t type, 18 | hwloc_bitmap_t cpuset) 19 | { 20 | int i; 21 | hwloc_obj_t obj; 22 | 23 | hwloc_bitmap_zero(cpuset); 24 | 25 | for (i=0; icpuset); 28 | } 29 | } 30 | 31 | void test_distrib(hwloc_topology_t topo, int wks) 32 | { 33 | hwloc_obj_t root = hwloc_get_root_obj(topo); 34 | int i, n_roots = 1, flags = 0; 35 | int until = INT_MAX; 36 | hwloc_bitmap_t set[wks]; 37 | char str[128]; 38 | 39 | for (i=0; idepth, obj->gp_index); 93 | 94 | if (hwloc_obj_type_is_normal(obj->type)) { 95 | nc += snprintf(str+nc, size-nc, "\n "); 96 | nc += snprintf(str+nc, size-nc, "os_idx=%d ", obj->os_index); 97 | nc += snprintf(str+nc, size-nc, "l_idx=%d ", obj->logical_index); 98 | nc += snprintf(str+nc, size-nc, "cpuset="); 99 | nc += hwloc_bitmap_list_snprintf(str+nc, size-nc, obj->cpuset); 100 | nc += snprintf(str+nc, size-nc, " nodeset="); 101 | nc += hwloc_bitmap_list_snprintf(str+nc, size-nc, obj->nodeset); 102 | nc += snprintf(str+nc, size-nc, " arity=%d ", obj->arity); 103 | nc += snprintf(str+nc, size-nc, "amem=%d ", obj->memory_arity); 104 | nc += snprintf(str+nc, size-nc, "aio=%d ", obj->io_arity); 105 | } 106 | 107 | if (obj->type == HWLOC_OBJ_OS_DEVICE) 108 | switch (obj->attr->osdev.type) { 109 | case HWLOC_OBJ_OSDEV_COPROC : 110 | nc += snprintf(str+nc, size-nc, "subtype=%s ", 111 | obj->subtype); 112 | case HWLOC_OBJ_OSDEV_GPU : 113 | nc += snprintf(str+nc, size-nc, "\n uuid="); 114 | nc += gpu_uuid_snprintf(str+nc, size-nc, obj); 115 | case HWLOC_OBJ_OSDEV_OPENFABRICS : 116 | nc += snprintf(str+nc, size-nc, " busid="); 117 | nc += pci_busid_snprintf(str+nc, size-nc, obj); 118 | nc += snprintf(str+nc, size-nc, " name=%s ", 119 | obj->name); 120 | if (verbose > 0) { 121 | nc += snprintf(str+nc, size-nc, "\n "); 122 | /* Get obj->infos in one shot */ 123 | nc += hwloc_obj_attr_snprintf(str+nc, size-nc, obj, " ", 1); 124 | } 125 | default: 126 | break; 127 | } 128 | 129 | if (obj->type == HWLOC_OBJ_PCI_DEVICE) { 130 | nc += snprintf(str+nc, size-nc, "\n "); 131 | /* Get the obj->infos attributes */ 132 | nc += hwloc_obj_attr_snprintf(str+nc, size-nc, obj, " ", 1); 133 | } 134 | 135 | //hwloc_pci_class_string(obj->attr->pcidev.class_id) 136 | 137 | return nc; 138 | } 139 | 140 | 141 | void check_topo_filters(hwloc_topology_t topo) 142 | { 143 | enum hwloc_type_filter_e f1, f2; 144 | hwloc_topology_get_type_filter(topo, 145 | HWLOC_OBJ_PCI_DEVICE, &f1); 146 | hwloc_topology_get_type_filter(topo, 147 | HWLOC_OBJ_MISC, &f2); 148 | 149 | if (f1 == HWLOC_TYPE_FILTER_KEEP_IMPORTANT || 150 | f1 == HWLOC_TYPE_FILTER_KEEP_ALL) 151 | printf("PCI devices enabled\n"); 152 | if (f2 == HWLOC_TYPE_FILTER_KEEP_IMPORTANT || 153 | f2 == HWLOC_TYPE_FILTER_KEEP_ALL) 154 | printf("Misc objects enabled\n"); 155 | } 156 | 157 | 158 | 159 | int main(int argc, char *argv[]) 160 | { 161 | hwloc_topology_t topology; 162 | 163 | printf("hwloc: API version=0x%x, HWLOC_API_VERSION=0x%x\n", 164 | hwloc_get_api_version(), HWLOC_API_VERSION); 165 | 166 | hwloc_topology_init(&topology); 167 | /* OS devices are filtered by default, enable to see GPUs */ 168 | hwloc_topology_set_type_filter(topology, HWLOC_OBJ_OS_DEVICE, 169 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 170 | /* Include PCI devices to determine whether two GPUs 171 
| are the same device, i.e., opencl1d1 and cuda1 */ 172 | hwloc_topology_set_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, 173 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 174 | hwloc_topology_load(topology); 175 | 176 | 177 | printf("=====Begin brief topology\n"); 178 | print_topo_brief(topology); 179 | printf("=====End brief topology\n"); 180 | 181 | printf("=====Begin I/O topology\n"); 182 | print_topo_io(topology); 183 | printf("=====End I/O topology\n"); 184 | 185 | printf("=====Begin flat list of devices\n"); 186 | print_devices(topology, HWLOC_OBJ_GROUP); 187 | print_devices(topology, HWLOC_OBJ_OS_DEVICE); 188 | printf("=====End flat list of devices\n"); 189 | 190 | printf("=====Begin filter type\n"); 191 | check_topo_filters(topology); 192 | printf("=====End filter type\n"); 193 | 194 | #if 0 195 | /* I haven't been able to use VISIBLE_DEVICES 196 | within a process to restrict the GPU set */ 197 | printf("=====Begin ENV\n"); 198 | print_devices(topology, HWLOC_OBJ_OS_DEVICE); 199 | 200 | int rc = putenv("CUDA_VISIBLE_DEVICES=1"); 201 | printf("===CUDA_VISIBLE_DEVICES=1 rc=%d===\n", rc); 202 | 203 | hwloc_topology_t topo2; 204 | hwloc_topology_init(&topo2); 205 | hwloc_topology_set_io_types_filter(topo2, 206 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 207 | hwloc_topology_load(topo2); 208 | print_devices(topo2, HWLOC_OBJ_OS_DEVICE); 209 | hwloc_topology_destroy(topo2); 210 | printf("=====End ENV\n"); 211 | #endif 212 | 213 | printf("=====Begin root\n"); 214 | print_obj(hwloc_get_root_obj(topology), 0); 215 | printf("=====End root\n"); 216 | 217 | printf("=====Begin hwloc_restrict\n"); 218 | restr_topo_to_n_cores(topology, 4); 219 | print_topo_brief(topology); 220 | printf("=====End hwloc_restrict\n"); 221 | 222 | printf("=====Begin hwloc_distrib\n"); 223 | test_distrib(topology, 3); 224 | printf("=====End hwloc_distrib\n"); 225 | 226 | hwloc_topology_destroy(topology); 227 | 228 | return 0; 229 | } 230 | -------------------------------------------------------------------------------- /src/hwloc_utils.h: -------------------------------------------------------------------------------- 1 | /****************************************************** 2 | * Edgar A Leon 3 | * Lawrence Livermore National Laboratory 4 | ******************************************************/ 5 | #ifndef HWLOC_UTILS_H_INCLUDED 6 | #define HWLOC_UTILS_H_INCLUDED 7 | 8 | #include 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | int pci_busid_snprintf(char *buf, size_t size, hwloc_obj_t io); 15 | 16 | int gpu_uuid_snprintf(char *buf, size_t size, hwloc_obj_t dev); 17 | 18 | void tree_walk_io(void (*apply)(hwloc_obj_t, void*, int), 19 | hwloc_obj_t root, void *args, int depth); 20 | 21 | void print_obj_info(hwloc_obj_t obj); 22 | 23 | void print_obj(hwloc_obj_t obj, int indent); 24 | 25 | void print_devices(hwloc_topology_t topo, hwloc_obj_type_t type); 26 | 27 | void print_topo_brief(hwloc_topology_t topo); 28 | 29 | void print_topo_io(hwloc_topology_t topo); 30 | 31 | #ifdef __cplusplus 32 | } /* extern "C" */ 33 | #endif 34 | 35 | #endif // HWLOC_UTILS_H_INCLUDED -------------------------------------------------------------------------------- /src/main.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Edgar A. Leon 3 | * Lawrence Livermore National Laboratory 4 | */ 5 | #include "mpibind.h" 6 | 7 | 8 | /* 9 | * Show the GPU mapping using various ID types. 
10 | */ 11 | void howto_gpu_ids(mpibind_t *handle) 12 | { 13 | char **gpu_str; 14 | char str[128]; 15 | int i, j, k, ngpus; 16 | int ids[] = {MPIBIND_ID_PCIBUS, MPIBIND_ID_UNIV, 17 | MPIBIND_ID_SMI, MPIBIND_ID_NAME}; 18 | char *desc[] = { "PCI", "UUID", "VISDEVS", "NAME"}; 19 | int ntasks = mpibind_get_ntasks(handle); 20 | 21 | for (k=0; k 0) { 29 | printf("\tTask %d: ", i); 30 | for (j=0; j 0) 43 | printf("\tTask %d: %s\n", i, str); 44 | } 45 | 46 | 47 | /* 48 | * Show how to extract and use certain environment 49 | * variables related to affinity. 50 | */ 51 | void howto_env_vars(mpibind_t *handle) 52 | { 53 | /* Set the environment variables first */ 54 | mpibind_set_env_vars(handle); 55 | 56 | #if 0 57 | /* Take a comprehensive look */ 58 | mpibind_env_vars_print(handle); 59 | #else 60 | int i, nvars; 61 | char **names; 62 | char **values; 63 | int ntasks = mpibind_get_ntasks(handle); 64 | 65 | /* Get the names of the environment variables */ 66 | names = mpibind_get_env_var_names(handle, &nvars); 67 | printf("Environment variables:\n"); 68 | for (i=0; i 1) 87 | ntasks = atoi(argv[1]); 88 | 89 | mpibind_t *handle; 90 | mpibind_init(&handle); 91 | 92 | /* User input */ 93 | mpibind_set_ntasks(handle, ntasks); 94 | //mpibind_set_nthreads(handle, 3); 95 | //mpibind_set_greedy(handle, 0); 96 | mpibind_set_gpu_optim(handle, 0); 97 | //mpibind_set_smt(handle, 2); 98 | //params.restr_type = MEM; 99 | //mpibind_set_restrict_type(handle, MPIBIND_RESTRICT_CPU); 100 | //params.restr_set = "24-29,72-77,36-41,84-89"; 101 | //params.restr_set = "24-35,72-83"; 102 | //params.restr_set = "4-6"; 103 | //params.restr_set = "8"; 104 | //mpibind_set_restrict_ids(handle, "24-35,72-83"); 105 | //mpibind_set_restrict_ids(handle, "0-11,24-47"); 106 | 107 | 108 | /* Use an hwloc topology file */ 109 | #if 0 110 | char xml[] = "../../hwloc/flash-v100.xml"; 111 | hwloc_topology_t etopo; 112 | if (hwloc_topology_init(&etopo) < 0) { 113 | fprintf(stderr, "hwloc_topology_init failed\n"); 114 | return 0; 115 | } 116 | if (hwloc_topology_set_xml(etopo, xml) < 0) { 117 | fprintf(stderr, "hwloc_topology_set_xml(%s) failed\n", xml); 118 | return 0; 119 | } 120 | if (mpibind_filter_topology(etopo) < 0) { 121 | fprintf(stderr, "mpibind_filter_topology failed\n"); 122 | return 0; 123 | } 124 | if (hwloc_topology_load(etopo) < 0) { 125 | fprintf(stderr, "hwloc_topology_load failed"); 126 | return 0; 127 | } 128 | mpibind_set_topology(handle, etopo); 129 | #endif 130 | 131 | 132 | /* Get the mapping */ 133 | if ( mpibind(handle) ) 134 | return 1; 135 | 136 | /* Get the hwloc topology to parse the hwloc_bitmaps */ 137 | hwloc_topology_t topo; 138 | topo = mpibind_get_topology(handle); 139 | 140 | /* Verbose mapping */ 141 | //Specify the type of GPU IDs to use 142 | mpibind_set_gpu_ids(handle, MPIBIND_ID_SMI); 143 | mpibind_mapping_print(handle); 144 | 145 | /* Test popping CPUs/cores */ 146 | //mpibind_pop_cpus_ptask(handle, 2, 4); 147 | //mpibind_pop_cores_ptask(handle, 1, 3); 148 | //mpibind_mapping_print(handle); 149 | 150 | int ngpus = mpibind_get_num_gpus(handle); 151 | printf("There are %d GPUs\n", ngpus); 152 | if (ngpus > 0) 153 | /* Example using various GPU IDs */ 154 | howto_gpu_ids(handle); 155 | 156 | /* Example using affinity environment variables */ 157 | howto_env_vars(handle); 158 | 159 | /* Clean up */ 160 | mpibind_finalize(handle); 161 | 162 | /* Last clean up activity: destroy the topology */ 163 | hwloc_topology_destroy(topo); 164 | 165 | return 0; 166 | } 167 | 
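The `main.c` driver above exercises most of the mpibind C API. For quick reference, the sketch below distills the minimal call sequence used there; it is not a file in the repository, the compile line is approximate, and the authoritative signatures are the ones declared in `mpibind.h`.

```c
/* Minimal mpibind C API sketch (distilled from src/main.c; not part of the repo).
 * Build roughly as: cc sketch.c -lmpibind $(pkg-config --cflags --libs hwloc) */
#include "mpibind.h"

int main(void)
{
  mpibind_t *handle;

  /* Create a handle; the number of tasks is the only required input */
  mpibind_init(&handle);
  mpibind_set_ntasks(handle, 4);

  /* Compute the mapping; a non-zero return indicates failure */
  if (mpibind(handle))
    return 1;

  /* Print the per-task CPU/GPU/thread assignments and export the
     affinity environment variables (OMP_*, *_VISIBLE_DEVICES) */
  mpibind_mapping_print(handle);
  mpibind_set_env_vars(handle);

  /* Clean up: grab the topology first, since it is destroyed last */
  hwloc_topology_t topo = mpibind_get_topology(handle);
  mpibind_finalize(handle);
  hwloc_topology_destroy(topo);

  return 0;
}
```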
-------------------------------------------------------------------------------- /src/manual.mk: -------------------------------------------------------------------------------- 1 | 2 | UNAME = $(shell uname) 3 | BASIC_CFLAGS = -Wall -Werror 4 | 5 | HWLOC_CFLAGS = $(shell pkg-config --cflags hwloc) 6 | HWLOC_LDLIBS = $(shell pkg-config --libs hwloc) 7 | 8 | # Building libmpibind 9 | VER = 1 10 | MIN = 0 11 | REL = 1 12 | MPIBIND_LIB = libmpibind.so 13 | MPIBIND_SONAME = $(MPIBIND_LIB).$(VER) 14 | MPIBIND_FNAME = $(MPIBIND_SONAME).$(MIN).$(REL) 15 | 16 | # Using libmpibind 17 | MPIBIND_DIR = $(shell pwd) 18 | MPIBIND_CFLAGS = -I$(MPIBIND_DIR) $(HWLOC_CFLAGS) 19 | MPIBIND_LDLIBS = -L$(MPIBIND_DIR) -lmpibind $(HWLOC_LDLIBS) 20 | ifeq ($(UNAME),Linux) 21 | MPIBIND_LDLIBS += -Wl,-rpath=$(MPIBIND_DIR) 22 | endif 23 | 24 | 25 | PROGS = main 26 | 27 | ## HWLOC_XML_VERBOSE=1 28 | ## HWLOC_XMLFILE=../topo-xml/coral-lassen.xml ./main 29 | 30 | all: $(PROGS) $(MPIBIND_SONAME) $(MPIBIND_LIB) 31 | 32 | hwloc_tests: hwloc_tests.c 33 | $(CC) $(BASIC_CFLAGS) $(HWLOC_CFLAGS) $@.c $(HWLOC_LDLIBS) -o $@ 34 | 35 | dev_tests: dev_tests.c mpibind.h 36 | $(CC) $(BASIC_CFLAGS) $(HWLOC_CFLAGS) $@.c $(HWLOC_LDLIBS) -o $@ 37 | 38 | main: main.c $(MPIBIND_LIB) 39 | $(CC) $@.c $(BASIC_CFLAGS) $(MPIBIND_CFLAGS) -o $@ $(MPIBIND_LDLIBS) 40 | 41 | # Todo 42 | #check: 43 | #install: 44 | 45 | $(MPIBIND_SONAME): $(MPIBIND_FNAME) 46 | ln -s -f $< $@ 47 | 48 | $(MPIBIND_LIB): $(MPIBIND_FNAME) 49 | ln -s -f $< $@ 50 | 51 | $(MPIBIND_FNAME): mpibind.o 52 | ifeq ($(UNAME),Linux) 53 | $(CC) -shared -Wl,-soname,$(MPIBIND_SONAME) -o $@ $< $(HWLOC_LDLIBS) 54 | else 55 | $(CC) -shared -o $@ $< $(HWLOC_LDLIBS) 56 | endif 57 | 58 | mpibind.o: mpibind.c mpibind.h mpibind-priv.h 59 | $(CC) -fPIC $(BASIC_CFLAGS) $(HWLOC_CFLAGS) -c mpibind.c 60 | 61 | 62 | clean: 63 | rm -f $(PROGS) $(MPIBIND_LIB) $(MPIBIND_SONAME) $(MPIBIND_FNAME) *.dSYM *.o *~ 64 | -------------------------------------------------------------------------------- /src/mpibind-priv.h: -------------------------------------------------------------------------------- 1 | /****************************************************** 2 | * Edgar A Leon 3 | * Lawrence Livermore National Laboratory 4 | ******************************************************/ 5 | #ifndef MPIBIND_PRIV_H_INCLUDED 6 | #define MPIBIND_PRIV_H_INCLUDED 7 | 8 | #define SHORT_STR_SIZE 32 9 | #define LONG_STR_SIZE 1024 10 | 11 | #define PCI_BUSID_LEN 16 12 | #define UUID_LEN 64 13 | #define MAX_IO_DEVICES 128 14 | #define MAX_CPUS_PER_TASK 1024 15 | 16 | #define VERBOSE 0 17 | #define DEBUG 0 18 | #define OUT_STREAM stderr 19 | 20 | #define ERR_MSG(whatstr) \ 21 | do { \ 22 | fprintf(stderr, "%s failed: %s.\n", __func__, (whatstr)); \ 23 | } while (0) 24 | 25 | #define PRINT(...) fprintf(stderr, __VA_ARGS__) 26 | 27 | 28 | /* 29 | * An environment variable with one value per task 30 | */ 31 | typedef struct { 32 | int size; 33 | char *name; 34 | char **values; 35 | } mpibind_env_var; 36 | 37 | /* 38 | * The type of I/O devices 39 | */ 40 | enum { 41 | DEV_GPU, 42 | DEV_NIC, 43 | }; 44 | 45 | /* 46 | * The various I/O device IDs. 47 | * GPU devices are different from other I/O devices 48 | * by having visdevs (and optionally smi) set 49 | * to a non-negative integer. 
50 | */ 51 | struct device { 52 | char name[SHORT_STR_SIZE]; // Device name 53 | char pci[PCI_BUSID_LEN]; // PCI bus ID 54 | char univ[UUID_LEN]; // Universally unique ID 55 | hwloc_obj_t ancestor; // First (smallest) non-I/O ancestor object 56 | int type; // Type of I/O device, e.g., DEV_GPU 57 | int vendor_id; // Device vendor 58 | /* GPU specific */ 59 | int smi; // System management ID (RSMI and NVML) 60 | char vendor[SHORT_STR_SIZE]; // Vendor of GPU/COPROC devices 61 | char model[SHORT_STR_SIZE]; // Model of GPU/COPROC devices 62 | }; 63 | 64 | /* 65 | * The mpibind handle 66 | */ 67 | struct mpibind_t { 68 | /* Input parameters */ 69 | int ntasks; 70 | int in_nthreads; 71 | int greedy; 72 | int gpu_optim; 73 | int smt; 74 | char *restr_set; 75 | int restr_type; 76 | 77 | /* Input/Output parameters */ 78 | hwloc_topology_t topo; 79 | 80 | /* Output parameters */ 81 | int *nthreads; 82 | hwloc_bitmap_t *cpus; 83 | hwloc_bitmap_t *gpus; 84 | char ***gpus_usr; 85 | int **cpus_usr; 86 | 87 | /* Environment variables */ 88 | int nvars; 89 | char **names; 90 | mpibind_env_var *env_vars; 91 | 92 | /* IDs of I/O devices */ 93 | int ndevs; 94 | struct device **devs; 95 | }; 96 | 97 | 98 | #endif // MPIBIND_PRIV_H_INCLUDED 99 | -------------------------------------------------------------------------------- /src/simple.mk: -------------------------------------------------------------------------------- 1 | 2 | CFLAGS += -Wall -Werror 3 | CFLAGS += $(shell pkg-config --cflags hwloc) 4 | LDLIBS += $(shell pkg-config --libs hwloc) 5 | 6 | PROGS = dev_tests main 7 | 8 | OBJS = mpibind_v2.1.o 9 | 10 | ## HWLOC_XML_VERBOSE=1 11 | ## HWLOC_XMLFILE=topo-xml/lassen-hw2.xml 12 | ## ./mpibind_v0.14.1 13 | 14 | all: $(PROGS) 15 | 16 | main: $(OBJS) 17 | 18 | $(OBJS): mpibind.h 19 | 20 | clean: 21 | rm -f $(PROGS) *.o *~ 22 | -------------------------------------------------------------------------------- /test-suite/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS = -Wall -Werror -I$(top_srcdir)/src $(HWLOC_CFLAGS) $(TAP_CFLAGS) 2 | AM_LDFLAGS = -rpath $(TAP_LIBDIR) 3 | LDADD = $(top_srcdir)/src/libmpibind.la $(TAP_LIBS) $(HWLOC_LIBS) 4 | 5 | AM_TESTS_ENVIRONMENT = \ 6 | export PYTHONPATH=:"$(top_srcdir)/python:$(PYTHON_SITE_PKG):$$PYTHONPATH"; 7 | 8 | TEST_EXTENSIONS = .t .py 9 | 10 | T_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ 11 | $(top_srcdir)/config/tap-driver.sh 12 | 13 | PY_LOG_DRIVER = $(PYTHON) $(top_srcdir)/config/tap-driver.py 14 | 15 | coral_lassen_t_SOURCES = coral-lassen.c test_utils.c test_utils.h 16 | epyc_corona_t_SOURCES = epyc-corona.c test_utils.c test_utils.h 17 | coral_ea_t_SOURCES = coral-ea.c test_utils.c test_utils.h 18 | cts1_quartz_t_SOURCES = cts1-quartz.c test_utils.c test_utils.h 19 | error_t_SOURCES = error.c test_utils.c test_utils.h 20 | environment_t_SOURCES = environment.c test_utils.c test_utils.h 21 | 22 | # Fix to make tests work on macOS: 23 | # The tap library path is not set correctly in the executable. 24 | # Statement must terminate with a semicolon. 25 | if HAVE_DARWIN_OS 26 | AM_TESTS_ENVIRONMENT += \ 27 | if [[ ! 
$@ =~ python ]]; then \ 28 | echo "Executing dylib fix for `echo $@ | sed 's/\.log/\.t/'`"; \ 29 | install_name_tool -change libtap.dylib \ 30 | $(TAP_LIBDIR)/libtap.dylib .libs/`echo $@ | sed 's/\.log/\.t/'`; fi; 31 | endif 32 | 33 | C_TESTS = \ 34 | error.t \ 35 | environment.t \ 36 | coral_lassen.t \ 37 | epyc_corona.t \ 38 | coral_ea.t \ 39 | cts1_quartz.t 40 | 41 | PYTHON_TESTS = \ 42 | python/py-coral-ea.py \ 43 | python/py-coral-lassen.py \ 44 | python/py-cts1-quartz.py \ 45 | python/py-epyc-corona.py 46 | 47 | if HAVE_LIBTAP 48 | TESTS = $(C_TESTS) 49 | check_PROGRAMS = $(C_TESTS) 50 | endif 51 | 52 | if HAVE_CFFI 53 | if HAVE_PYCOTAP 54 | check_SCRIPTS = $(PYTHON_TESTS) 55 | if HAVE_LIBTAP 56 | TESTS += $(PYTHON_TESTS) 57 | else 58 | TESTS = $(PYTHON_TESTS) 59 | endif 60 | endif 61 | endif 62 | -------------------------------------------------------------------------------- /test-suite/README.md: -------------------------------------------------------------------------------- 1 | # mpibind tests 2 | 3 | The current iteration of the test suite is designed to generate a set of tests 4 | based on a given topology, then compare the resultant mappings to a file that 5 | defines expected output (see `expected` directory). Generating the tests 6 | involves gathering basic information about a topology, and using that 7 | information to tweak each test to be suitable for the topology. 8 | 9 | An example of the answers file is below: 10 | 11 | ``` 12 | # Lines that start with a pound are comments! 13 | # Each answer description consists of 6 lines: 14 | # 1. A comment with the test number 15 | # 2. A comment describing the parameters used for the test in JSON format 16 | # 3. The test description 17 | # 4. The thread mapping 18 | # 5. The cpu mapping 19 | # 6. The gpu mapping 20 | # The mapping for each task is separated by a defined character. 21 | # This separator can be changed in test_utils.c::parse_answer() 22 | 23 | # 1: 24 | # {"params": {"ntasks": 40, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 25 | Map one task to every core 26 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 27 | "8;12;16;20;24;28;32;36;40;44;48;52;56;60;64;68;72;76;80;84;96;100;104;108;112;116;120;124;128;132;136;140;144;148;152;156;160;164;168;172" 28 | "0;0;0;0;0;0;0;0;0;0;1;1;1;1;1;1;1;1;1;1;2;2;2;2;2;2;2;2;2;2;3;3;3;3;3;3;3;3;3;3" 29 | 30 | ``` 31 | 32 | The Python tests parse the params comment to initialize mpibind handles correctly. 33 | 34 | ## Test details 35 | 36 | 1. Valid mpibind configurations 37 | * Map one task to every core 38 | * Map one task greedily 39 | * Map two tasks greedily 40 | * Mapping such that ntasks < #NUMA nodes but nworkers > #NUMA nodes (this makes sure mpibind accounts for the number of threads as well) 41 | * Restrict x tasks to a single core (x == the machine's SMT level) 42 | * Map two tasks at smt 1 43 | * Map two tasks at smt (max_smt - 1) 44 | * Map two tasks, but restrict them to a single NUMA domain 45 | * Map number_numas tasks without GPU optimization 46 | * Map number_numas tasks with GPU optimization 47 | * Map 8 tasks to a single PU 48 | 2. Error checking 49 | * Passing NULL in place of the handle to all of the setter and getter functions. 50 | * Trying to run mpibind with an invalid number of threads (e.g. -1) 51 | * Trying to run mpibind with an invalid number of tasks (e.g. -1) 52 | * Trying to run mpibind with an invalid SMT level (e.g. -1 or 8 on a machine with SMT-4) 53 | 3.
Environment Varibles 54 | * Check that AMD and NVIDIA gpus can be properly detected 55 | * Check that the OMP_PLACES variable is formatted correctly 56 | 57 | ## Debugging 58 | 59 | The tests are fired off by running `make check` from the top directory. One can 60 | use `V=1` to show the verbose compilation lines and `VERBOSE=1` to show any 61 | libtap error(s). 62 | ``` 63 | make V=1 VERBOSE=1 check 64 | ``` 65 | 66 | The .t files are libtool scripts that call the `.libs/.h` binaries. 67 | 68 | The expected mappings are in the `expected` directory. -------------------------------------------------------------------------------- /test-suite/coral-ea.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | int main(int argc, char **argv) { 4 | plan(NO_PLAN); 5 | 6 | char* topology_file = "../topo-xml/coral-ea-hwloc1.xml"; 7 | char* answer_file = "./expected/expected.coral-ea"; 8 | unit_test_topology(topology_file, answer_file); 9 | 10 | done_testing(); 11 | return (0); 12 | } 13 | -------------------------------------------------------------------------------- /test-suite/coral-lassen.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | int main(int argc, char **argv) { 4 | plan(NO_PLAN); 5 | 6 | char* topology_file = "../topo-xml/coral-lassen.xml"; 7 | char* answer_file = "./expected/expected.coral-lassen"; 8 | unit_test_topology(topology_file, answer_file); 9 | 10 | done_testing(); 11 | return (0); 12 | } 13 | -------------------------------------------------------------------------------- /test-suite/cts1-quartz.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | int main(int argc, char **argv) { 4 | plan(NO_PLAN); 5 | 6 | char* topology_file = "../topo-xml/cts1-quartz-smt1.xml"; 7 | char* answer_file = "./expected/expected.cts1-quartz"; 8 | unit_test_topology(topology_file, answer_file); 9 | 10 | done_testing(); 11 | return (0); 12 | } 13 | -------------------------------------------------------------------------------- /test-suite/environment.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | static void check_amd_env() { 4 | mpibind_t* handle; 5 | hwloc_topology_t topo; 6 | 7 | load_topology(&topo, "../topo-xml/epyc-corona.xml"); 8 | 9 | mpibind_init(&handle); 10 | mpibind_set_topology(handle, topo); 11 | mpibind_set_ntasks(handle, 1); 12 | mpibind_set_gpu_optim(handle, 1); 13 | mpibind_set_greedy(handle, 1); 14 | 15 | mpibind(handle); 16 | 17 | mpibind_set_env_vars(handle); 18 | //ok(mpibind_get_gpu_type(handle) == MPIBIND_GPU_AMD, 19 | // "mpibind correctly identifies AMD gpus"); 20 | 21 | int num, i, idx = -1; 22 | char** env_var_names = mpibind_get_env_var_names(handle, &num); 23 | 24 | for (i = 0; i < num; i++) { 25 | if (!strcmp(env_var_names[i], "ROCR_VISIBLE_DEVICES")) { 26 | idx = i; 27 | } 28 | } 29 | 30 | ok(idx >= 0, "GPU variable is correct"); 31 | 32 | mpibind_finalize(handle); 33 | hwloc_topology_destroy(topo); 34 | } 35 | 36 | static void check_omp_places() { 37 | mpibind_t* handle; 38 | hwloc_topology_t topo; 39 | 40 | load_topology(&topo, "../topo-xml/epyc-corona.xml"); 41 | 42 | mpibind_init(&handle); 43 | mpibind_set_topology(handle, topo); 44 | mpibind_set_ntasks(handle, 1); 45 | mpibind_set_gpu_optim(handle, 0); 46 | mpibind_set_smt(handle, 1); 47 | mpibind_set_greedy(handle, 1); 48 | 
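  /* Restricting the handle to CPU ids 6-11 (MPIBIND_RESTRICT_CPU) at SMT-1
     means the single task should be given exactly those six PUs, so the
     OMP_PLACES value checked below must be {6},{7},{8},{9},{10},{11}. */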
mpibind_set_restrict_type(handle, MPIBIND_RESTRICT_CPU); 49 | mpibind_set_restrict_ids(handle, "6-11"); 50 | 51 | mpibind(handle); 52 | 53 | mpibind_set_env_vars(handle); 54 | 55 | int num, i, rc = -1; 56 | char** env_var_names = mpibind_get_env_var_names(handle, &num); 57 | char** env_var_values; 58 | 59 | for (i = 0; i < num; i++) { 60 | if (!strcmp(env_var_names[i], "OMP_PLACES")) { 61 | env_var_values = mpibind_get_env_var_values(handle, "OMP_PLACES"); 62 | rc = is(env_var_values[0], "{6},{7},{8},{9},{10},{11}", 63 | "Checking OMP_PLACES mapping"); 64 | break; 65 | } 66 | } 67 | 68 | if (rc == -1) { 69 | fail("failed to find OMP_PLACES variable"); 70 | } 71 | 72 | mpibind_finalize(handle); 73 | hwloc_topology_destroy(topo); 74 | } 75 | 76 | static void check_nvidia_env() { 77 | mpibind_t* handle; 78 | hwloc_topology_t topo; 79 | 80 | load_topology(&topo, "../topo-xml/coral-lassen.xml"); 81 | 82 | mpibind_init(&handle); 83 | mpibind_set_topology(handle, topo); 84 | mpibind_set_ntasks(handle, 1); 85 | mpibind_set_gpu_optim(handle, 1); 86 | mpibind_set_greedy(handle, 1); 87 | 88 | mpibind(handle); 89 | 90 | mpibind_set_env_vars(handle); 91 | //ok(mpibind_get_gpu_type(handle) == MPIBIND_GPU_NVIDIA, 92 | // "mpibind correctly identifies NVIDIA gpus"); 93 | 94 | int num, i, idx = -1; 95 | char** env_var_names = mpibind_get_env_var_names(handle, &num); 96 | for (i = 0; i < num; i++) { 97 | if (!strcmp(env_var_names[i], "CUDA_VISIBLE_DEVICES")) { 98 | idx = i; 99 | } 100 | } 101 | ok(idx >= 0, "GPU variable is correct"); 102 | 103 | mpibind_finalize(handle); 104 | hwloc_topology_destroy(topo); 105 | } 106 | 107 | int main(int argc, char** argv) { 108 | plan(NO_PLAN); 109 | 110 | check_amd_env(); 111 | check_nvidia_env(); 112 | check_omp_places(); 113 | 114 | done_testing(); 115 | return (0); 116 | } 117 | -------------------------------------------------------------------------------- /test-suite/epyc-corona.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | int main(int argc, char **argv) { 4 | plan(NO_PLAN); 5 | 6 | char* topology_file = "../topo-xml/epyc-corona.xml"; 7 | char* answer_file = "./expected/expected.epyc-corona"; 8 | unit_test_topology(topology_file, answer_file); 9 | 10 | done_testing(); 11 | return (0); 12 | } 13 | -------------------------------------------------------------------------------- /test-suite/error.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | #define XML_PATH "../topo-xml/coral-lassen.xml" 3 | 4 | /**Test passing null to all setters and getter functions**/ 5 | int test_null_handle() { 6 | diag("Testing passing a null handle to setters and getters"); 7 | mpibind_t *handle = NULL; 8 | int count; //for mpibind_get_env_var_names 9 | 10 | ok(mpibind_set_ntasks(handle, 4) == 1, 11 | "mpibind_set_ntasks fails when handle == NULL"); 12 | ok(mpibind_set_nthreads(handle, 4) == 1, 13 | "mpibind_set_nthreads fails when handle == NULL"); 14 | ok(mpibind_set_greedy(handle, 1) == 1, 15 | "mpibind_set_greedy fails when handle == NULL"); 16 | ok(mpibind_set_gpu_optim(handle, 1) == 1, 17 | "mpibind_set_gpu_optim fails when handle == NULL"); 18 | ok(mpibind_set_smt(handle, 1) == 1, 19 | "mpibind_set_smt fails when handle == NULL"); 20 | ok(mpibind_set_restrict_ids(handle, NULL) == 1, 21 | "mpibind_set_restrict_ids fails when handle == NULL"); 22 | ok(mpibind_set_restrict_type(handle, 1) == 1, 23 | "mpibind_set_restrict_type fails when handle == 
NULL"); 24 | ok(mpibind_set_topology(handle, NULL) == 1, 25 | "mpibind_set_topology fails when handle == NULL"); 26 | ok(mpibind_set_env_vars(handle) == 1, 27 | "mpibind_set_end_vars fails when handle == NULL"); 28 | 29 | ok(mpibind_get_ntasks(handle) == -1, 30 | "mpibind_get_ntasks return -1 when handle == NULL"); 31 | ok(mpibind_get_greedy(handle) == -1, 32 | "mpibind_get_greedy return -1 when handle == NULL"); 33 | ok(mpibind_get_gpu_optim(handle) == -1, 34 | "mpibind_get_gpu_optim return -1 when handle == NULL"); 35 | ok(mpibind_get_smt(handle) == -1, 36 | "mpibind_get_smt return -1 when handle == NULL"); 37 | ok(mpibind_get_restrict_ids(handle) == NULL, 38 | "mpibind_get_ntasks return NULL when handle == NULL"); 39 | ok(mpibind_get_restrict_type(handle) == -1, 40 | "mpibind_get_restrict_type return -1 when handle == NULL"); 41 | 42 | ok(mpibind_get_nthreads(handle) == NULL, 43 | "mpibind_get_nthreads returns NULL when handle == NULL"); 44 | ok(mpibind_get_cpus(handle) == NULL, 45 | "mpibind_get_cpus returns NULL when handle == NULL"); 46 | ok(mpibind_get_gpus(handle) == NULL, 47 | "mpibind_get_gpus returns NULL when handle == NULL"); 48 | //ok(mpibind_get_gpu_type(handle) == -1, 49 | // "mpibind_get_gpu_type returns NULL when handle == NULL"); 50 | ok(mpibind_get_topology(handle) == NULL, 51 | "mpibind_get_topology returns NULL when handle == NULL"); 52 | ok(mpibind_get_env_var_values(handle, NULL) == NULL, 53 | "mpibind_get_env_var_values returns NULL when handle == NULL"); 54 | ok(mpibind_get_env_var_names(handle, &count) == NULL, 55 | "mpibind_get_env_var_names returns NULL when handle == NULL"); 56 | ok(mpibind_finalize(handle) == 1, 57 | "mpibind_finalize fails when handle == NULL"); 58 | 59 | return 0; 60 | } 61 | 62 | int test_mpibind_errors() { 63 | mpibind_t *handle; 64 | hwloc_topology_t topo; 65 | 66 | // setup topology 67 | hwloc_topology_init(&topo); 68 | hwloc_topology_set_xml(topo, XML_PATH); 69 | hwloc_topology_set_all_types_filter(topo, HWLOC_TYPE_FILTER_KEEP_STRUCTURE); 70 | hwloc_topology_set_type_filter(topo, HWLOC_OBJ_OS_DEVICE, 71 | HWLOC_TYPE_FILTER_KEEP_IMPORTANT); 72 | hwloc_topology_load(topo); 73 | 74 | mpibind_init(&handle); 75 | 76 | int ntasks = 5; 77 | mpibind_set_ntasks(handle, ntasks); 78 | mpibind_set_nthreads(handle, 4); 79 | mpibind_set_greedy(handle, 0); 80 | mpibind_set_gpu_optim(handle, 0); 81 | mpibind_set_smt(handle, 2); 82 | 83 | diag("Testing error handling in mpibind()"); 84 | 85 | mpibind_set_nthreads(handle, -4); 86 | ok(mpibind(handle) == 1, "Mapping fails if nthreads is invalid"); 87 | 88 | mpibind_set_nthreads(handle, 4); 89 | mpibind_set_ntasks(handle, -1); 90 | ok(mpibind(handle) == 1, "Mapping fails if ntasks is invalid"); 91 | 92 | mpibind_set_ntasks(handle, 4); 93 | mpibind_set_smt(handle, -1); 94 | ok(mpibind(handle) == 1, "Mapping fails if smt is invalid"); 95 | 96 | mpibind_set_smt(handle, 16); 97 | ok(mpibind(handle) == 1, "Mapping fails if smt is valid but too high"); 98 | 99 | // TODO: ERROR CODES RELATED TO RESTRICT SETS 100 | todo("Error codes related to restrict sets"); 101 | return 0; 102 | } 103 | 104 | int main(int argc, char **argv) { 105 | plan(NO_PLAN); 106 | test_null_handle(); 107 | test_mpibind_errors(); 108 | done_testing(); 109 | return (0); 110 | } 111 | -------------------------------------------------------------------------------- /test-suite/expected/expected.coral-ea: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 
2 | # Each answer description consist of 6 lines: 3 | # 1. A comment with the test number 4 | # 2. A comment describing the parameters used for the test in JSON format 5 | # 3. The test description 6 | # 4. the thread mapping 7 | # 5. the cpu mapping 8 | # 6. the gpu_mapping 9 | # The mapping for each task is separated by a defined character. 10 | # This separator can be changed in test_utils.c::parse_answer() 11 | 12 | 13 | # 1: 14 | # {"params": {"ntasks": 20, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 15 | Map one task to every core 16 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 17 | "0;8;16;24;32;40;48;56;64;72;80;88;96;104;112;120;128;136;144;152" 18 | ";;;;;;;;;;;;;;;;;;;" 19 | 20 | # 2: 21 | # {"params": {"ntasks": 1, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 22 | Map one task greedily 23 | "160" 24 | "0-159" 25 | "" 26 | 27 | 28 | # 3: 29 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 30 | Map two tasks greedily 31 | "10;10" 32 | "0,8,16,24,32,40,48,56,64,72;80,88,96,104,112,120,128,136,144,152" 33 | ";" 34 | 35 | # 4: 36 | # {"params": {"ntasks": 1, "in_nthreads": 4, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 37 | Map such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 38 | "160" 39 | "0-159" 40 | "" 41 | 42 | 43 | # 5: 44 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0-7", "restrict_type": 0}} 45 | Restrict x tasks a single core (x == machine's smt level) 46 | "1;1;1;1;1;1;1;1" 47 | "0;1;2;3;4;5;6;7" 48 | ";;;;;;;" 49 | 50 | 51 | # 6: 52 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 53 | Map two tasks at SMT 1 54 | "10;10" 55 | "0,8,16,24,32,40,48,56,64,72;80,88,96,104,112,120,128,136,144,152" 56 | ";" 57 | 58 | 59 | # 7: 60 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 8, "restr_set": null, "restrict_type": 0}} 61 | Map two tasks at max smt (across all cores) 62 | "80;80" 63 | "0-79;80-159" 64 | ";" 65 | 66 | 67 | # 8: 68 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 7, "restr_set": null, "restrict_type": 0}} 69 | Map two tasks at max smt-1 70 | "70;70" 71 | "0-6,8-14,16-22,24-30,32-38,40-46,48-54,56-62,64-70,72-78;80-86,88-94,96-102,104-110,112-118,120-126,128-134,136-142,144-150,152-158" 72 | ";" 73 | 74 | 75 | # 9: 76 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0", "restrict_type": 1}} 77 | Map two tasks, but restrict them to a single NUMA domain 78 | "5;5" 79 | "0,8,16,24,32;40,48,56,64,72" 80 | ";" 81 | 82 | 83 | # 10: 84 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": null, "restrict_type": 0}} 85 | Map num_numa tasks without GPU optimization 86 | "10;10" 87 | "0,8,16,24,32,40,48,56,64,72;80,88,96,104,112,120,128,136,144,152" 88 | ";" 89 | 90 | 91 | # 11: 92 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 93 | Map num_numa tasks with GPU optimization 94 | "10;10" 95 | "0,8,16,24,32,40,48,56,64,72;80,88,96,104,112,120,128,136,144,152" 96 | ";" 97 | 98 | 99 | # 12: 100 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, 
"restr_set": "0", "restrict_type": 0}} 101 | Map eight tasks to a single pu 102 | "1;1;1;1;1;1;1;1" 103 | "0;0;0;0;0;0;0;0" 104 | ";;;;;;;" 105 | -------------------------------------------------------------------------------- /test-suite/expected/expected.coral-lassen: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 2 | # Each answer description consist of 6 lines: 3 | # 1. A comment with the test number 4 | # 2. A comment describing the parameters used for the test in JSON format 5 | # 3. The test description 6 | # 4. the thread mapping 7 | # 5. the cpu mapping 8 | # 6. the gpu_mapping 9 | # The mapping for each task is separated by a defined character. 10 | # This separator can be changed in test_utils.c::parse_answer() 11 | 12 | 13 | 14 | # 1: 15 | # {"params": {"ntasks": 40, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 16 | Map one task to every core 17 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 18 | "8;12;16;20;24;28;32;36;40;44;48;52;56;60;64;68;72;76;80;84;96;100;104;108;112;116;120;124;128;132;136;140;144;148;152;156;160;164;168;172" 19 | "0;0;0;0;0;0;0;0;0;0;1;1;1;1;1;1;1;1;1;1;2;2;2;2;2;2;2;2;2;2;3;3;3;3;3;3;3;3;3;3" 20 | 21 | 22 | # 2: 23 | # {"params": {"ntasks": 1, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 24 | Map one task greedily 25 | "160" 26 | "8-87,96-175" 27 | "0,1,2,3" 28 | 29 | 30 | # 3: 31 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 32 | Map two tasks greedily 33 | "20;20" 34 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 35 | "0,1;2,3" 36 | 37 | 38 | # 4: 39 | # {"params": {"ntasks": 1, "in_nthreads": 4, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 40 | Map such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 41 | "160" 42 | "8-87,96-175" 43 | "0,1,2,3" 44 | 45 | 46 | # 5: 47 | # {"params": {"ntasks": 4, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "8-11", "restrict_type": 0}} 48 | Restrict x tasks a single core (x == machine's smt level) 49 | "1;1;1;1" 50 | "8;9;10;11" 51 | "0;0;1;1" 52 | 53 | 54 | # 6: 55 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 56 | Map two tasks at SMT 1 57 | "20;20" 58 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 59 | "0,1;2,3" 60 | 61 | 62 | # 7: 63 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 4, "restr_set": null, "restrict_type": 0}} 64 | Map two tasks at max smt (across all cores) 65 | "80;80" 66 | "8-87;96-175" 67 | "0,1;2,3" 68 | 69 | 70 | # 8: 71 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 3, "restr_set": null, "restrict_type": 0}} 72 | Map two tasks at max smt-1 73 | "60;60" 74 | "8-10,12-14,16-18,20-22,24-26,28-30,32-34,36-38,40-42,44-46,48-50,52-54,56-58,60-62,64-66,68-70,72-74,76-78,80-82,84-86;96-98,100-102,104-106,108-110,112-114,116-118,120-122,124-126,128-130,132-134,136-138,140-142,144-146,148-150,152-154,156-158,160-162,164-166,168-170,172-174" 75 | "0,1;2,3" 76 | 77 | 78 | # 9: 79 | # {"params": {"ntasks": 2, 
"in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0", "restrict_type": 1}} 80 | Map two tasks, but restrict them to a single NUMA domain 81 | "10;10" 82 | "8,12,16,20,24,28,32,36,40,44;48,52,56,60,64,68,72,76,80,84" 83 | "0;1" 84 | 85 | 86 | # 10: 87 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": null, "restrict_type": 0}} 88 | Map num_numa tasks without GPU optimization 89 | "20;20" 90 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 91 | "0,1;2,3" 92 | 93 | 94 | # 11: 95 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 96 | Map num_numa tasks with GPU optimization 97 | "20;20" 98 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 99 | "0,1;2,3" 100 | 101 | 102 | # 12: 103 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": "8", "restrict_type": 0}} 104 | Map eight tasks to a single pu 105 | "1;1;1;1;1;1;1;1" 106 | "8;8;8;8;8;8;8;8" 107 | "0;0;0;0;1;1;1;1" 108 | -------------------------------------------------------------------------------- /test-suite/expected/expected.coral-lassen.v1: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 2 | # The first non-commented line should be the number of tests. 3 | # After the number of tests, each answer description consist of 4 lines: 4 | # The test description, the thread mapping, the cpu mapping, and the 5 | # gpu_mapping.The mapping for each task is separated by a defined character. 
6 | # This separator can be changed in test_utils.c::parse_answer() 7 | 8 | 9 | # 1: Map one task to every core 10 | Map one task to every core 11 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 12 | "8;12;16;20;24;28;32;36;40;44;48;52;56;60;64;68;72;76;80;84;96;100;104;108;112;116;120;124;128;132;136;140;144;148;152;156;160;164;168;172" 13 | "0;0;0;0;0;0;0;0;0;0;1;1;1;1;1;1;1;1;1;1;2;2;2;2;2;2;2;2;2;2;3;3;3;3;3;3;3;3;3;3" 14 | 15 | 16 | # 2: Map 1 task greedily 17 | Map 1 task greedily 18 | "160" 19 | "8-87,96-175" 20 | "0-3" 21 | 22 | 23 | # 3: Map two tasks greedily 24 | Map two tasks greedily 25 | "20;20" 26 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 27 | "0-1;2-3" 28 | 29 | # 4: 30 | Mapping such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 31 | "160" 32 | "8-87,96-175" 33 | "0-3" 34 | 35 | 36 | # 5: 37 | Restrict x tasks a single core (x == machine's smt level) 38 | "1;1;1;1" 39 | "8;9;10;11" 40 | "0;0;1;1" 41 | 42 | 43 | # 6: 44 | Map two tasks at SMT 1 45 | "20;20" 46 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 47 | "0-1;2-3" 48 | 49 | 50 | # 7: 51 | Map 2 tasks at max smt (across all cores) 52 | "80;80" 53 | "8-87;96-175" 54 | "0-1;2-3" 55 | 56 | 57 | # 8: 58 | Map tasks at max smt-1 59 | "60;60" 60 | "8-10,12-14,16-18,20-22,24-26,28-30,32-34,36-38,40-42,44-46,48-50,52-54,56-58,60-62,64-66,68-70,72-74,76-78,80-82,84-86;96-98,100-102,104-106,108-110,112-114,116-118,120-122,124-126,128-130,132-134,136-138,140-142,144-146,148-150,152-154,156-158,160-162,164-166,168-170,172-174" 61 | "0-1;2-3" 62 | 63 | 64 | # 9: 65 | Map two tasks, but restrict them to a single NUMA domain 66 | "10;10" 67 | "8,12,16,20,24,28,32,36,40,44;48,52,56,60,64,68,72,76,80,84" 68 | "0;1" 69 | 70 | 71 | # 10: 72 | Map num_numa tasks without GPU optimization 73 | "20;20" 74 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 75 | "0-1;2-3" 76 | 77 | 78 | # 11: 79 | Map num_numa tasks with GPU optimization 80 | "20;20" 81 | "8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84;96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172" 82 | "0-1;2-3" 83 | 84 | 85 | # 12: 86 | Map 8 tasks to a single pu 87 | "1;1;1;1;1;1;1;1" 88 | "8;8;8;8;8;8;8;8" 89 | "0;0;0;0;1;1;1;1" 90 | -------------------------------------------------------------------------------- /test-suite/expected/expected.cts1-quartz: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 2 | # Each answer description consist of 6 lines: 3 | # 1. A comment with the test number 4 | # 2. A comment describing the parameters used for the test in JSON format 5 | # 3. The test description 6 | # 4. the thread mapping 7 | # 5. the cpu mapping 8 | # 6. the gpu_mapping 9 | # The mapping for each task is separated by a defined character. 
10 | # This separator can be changed in test_utils.c::parse_answer() 11 | 12 | 13 | # 1: 14 | # {"params": {"ntasks": 36, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 15 | Map one task to every core 16 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 17 | "0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35" 18 | ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;" 19 | 20 | 21 | # 2: 22 | # {"params": {"ntasks": 1, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 23 | Map one task greedily 24 | "36" 25 | "0-35" 26 | "" 27 | 28 | 29 | # 3: 30 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 31 | Map two tasks greedily 32 | "18;18" 33 | "0-17;18-35" 34 | ";" 35 | 36 | # 4: 37 | # {"params": {"ntasks": 1, "in_nthreads": 4, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 38 | Map such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 39 | "36" 40 | "0-35" 41 | "" 42 | 43 | 44 | # 5: 45 | # {"params": {"ntasks": 1, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0", "restrict_type": 0}} 46 | Restrict x tasks a single core (x == machine's smt level) 47 | "1" 48 | "0" 49 | "" 50 | 51 | 52 | # 6: 53 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 54 | Map two tasks at SMT 1 55 | "18;18" 56 | "0-17;18-35" 57 | ";" 58 | 59 | 60 | # 7: 61 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 62 | Map two tasks at max smt (across all cores) 63 | "18;18" 64 | "0-17;18-35" 65 | ";" 66 | 67 | 68 | # 8: 69 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 70 | Map two tasks at max smt-1 71 | "18;18" 72 | "0-17;18-35" 73 | ";" 74 | 75 | 76 | # 9: 77 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0", "restrict_type": 1}} 78 | Map two tasks, but restrict them to a single NUMA domain 79 | "9;9" 80 | "0-8;9-17" 81 | ";" 82 | 83 | 84 | # 10: 85 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": null, "restrict_type": 0}} 86 | Map num_numa tasks without GPU optimization 87 | "18;18" 88 | "0-17;18-35" 89 | ";" 90 | 91 | # 11: 92 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 93 | Map num_numa tasks with GPU optimization 94 | "18;18" 95 | "0-17;18-35" 96 | ";" 97 | 98 | # 12: 99 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": "0", "restrict_type": 0}} 100 | Map eight tasks to a single pu 101 | "1;1;1;1;1;1;1;1" 102 | "0;0;0;0;0;0;0;0" 103 | ";;;;;;;" 104 | -------------------------------------------------------------------------------- /test-suite/expected/expected.epyc-corona: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 2 | # Each answer description consist of 6 lines: 3 | # 1. A comment with the test number 4 | # 2. A comment describing the parameters used for the test in JSON format 5 | # 3. The test description 6 | # 4. the thread mapping 7 | # 5. the cpu mapping 8 | # 6. 
the gpu_mapping 9 | # The mapping for each task is separated by a defined character. 10 | # This separator can be changed in test_utils.c::parse_answer() 11 | 12 | # 1: 13 | # {"params": {"ntasks": 48, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 14 | Map one task to every core 15 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 16 | "6;54;7;55;8;56;9;57;10;58;11;59;12;60;13;61;14;62;15;63;16;64;17;65;30;78;31;79;32;80;33;81;34;82;35;83;42;90;43;91;44;92;45;93;46;94;47;95" 17 | "0;0;0;0;0;0;0;0;0;0;0;0;1;1;1;1;1;1;1;1;1;1;1;1;2;2;2;2;2;2;2;2;2;2;2;2;3;3;3;3;3;3;3;3;3;3;3;3" 18 | 19 | # 2: 20 | # {"params": {"ntasks": 1, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 21 | Map one task greedily 22 | "96" 23 | "0-95" 24 | "0,1,2,3" 25 | 26 | 27 | # 3: 28 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 29 | Map two tasks greedily 30 | "48;48" 31 | "0-23,48-71;24-47,72-95" 32 | "0,1;2,3" 33 | 34 | # 4: 35 | # {"params": {"ntasks": 7, "in_nthreads": 16, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 36 | Map such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 37 | "24;12;12;12;12;12;12" 38 | "0-11,48-59;12-17,60-65;18-23,66-71;24-29,72-77;30-35,78-83;36-41,84-89;42-47,90-95" 39 | "0;1;;;2;;3" 40 | 41 | # 5: 42 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0,48", "restrict_type": 0}} 43 | Restrict x tasks a single core (x == machine's smt level) 44 | "1;1" 45 | "0;48" 46 | ";" 47 | 48 | # 6: 49 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 50 | Map two tasks at SMT 1 51 | "48;48" 52 | "0-23,48-71;24-47,72-95" 53 | "0,1;2,3" 54 | 55 | 56 | # 7: 57 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 2, "restr_set": null, "restrict_type": 0}} 58 | Map two tasks at max smt (across all cores) 59 | "48;48" 60 | "0-23,48-71;24-47,72-95" 61 | "0,1;2,3" 62 | 63 | 64 | # 8: 65 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 1, "restr_set": null, "restrict_type": 0}} 66 | Map two tasks at max smt-1 67 | "48;48" 68 | "0-23,48-71;24-47,72-95" 69 | "0,1;2,3" 70 | 71 | 72 | # 9: 73 | # {"params": {"ntasks": 2, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": "0", "restrict_type": 1}} 74 | Map two tasks, but restrict them to a single NUMA domain 75 | "3;3" 76 | "0-2;3-5" 77 | ";" 78 | 79 | 80 | # 10: 81 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": null, "restrict_type": 0}} 82 | Map num_numa tasks without GPU optimization 83 | "6;6;6;6;6;6;6;6" 84 | "0-5;6-11;12-17;18-23;24-29;30-35;36-41;42-47" 85 | ";0;1;;;2;;3" 86 | 87 | 88 | # 11: 89 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 1, "smt": 0, "restr_set": null, "restrict_type": 0}} 90 | Map num_numa tasks with GPU optimization 91 | "3;3;3;3;3;3;3;3" 92 | "6-8;9-11;12-14;15-17;30-32;33-35;42-44;45-47" 93 | "0;0;1;1;2;2;3;3" 94 | 95 | 96 | # 12: 97 | # {"params": {"ntasks": 8, "in_nthreads": 0, "greedy": 1, "gpu_optim": 0, "smt": 0, "restr_set": "0", "restrict_type": 0}} 98 | Map eight tasks to a single pu 99 | "1;1;1;1;1;1;1;1" 100 | "0;0;0;0;0;0;0;0" 101 | ";;;;;;;" 102 | 
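The expected mappings in these files are produced by driving the mpibind C API over the canned topologies in `topo-xml/`. As a rough sketch of that flow — modeled on the calls used in `environment.c` and `error.c` of this test suite, with the relative XML path assumed and error checking omitted — a standalone driver might look like this:

```
#include <stdio.h>
#include <hwloc.h>
#include "mpibind.h"

int main(void) {
  hwloc_topology_t topo;
  mpibind_t *handle;

  /* Load a canned topology instead of the live machine, as the tests do */
  hwloc_topology_init(&topo);
  hwloc_topology_set_xml(topo, "../topo-xml/epyc-corona.xml"); /* assumed path */
  hwloc_topology_set_type_filter(topo, HWLOC_OBJ_OS_DEVICE,
                                 HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
  hwloc_topology_load(topo);

  /* Input parameters of the handle (see mpibind-priv.h) */
  mpibind_init(&handle);
  mpibind_set_topology(handle, topo);
  mpibind_set_ntasks(handle, 2);
  mpibind_set_greedy(handle, 1);
  mpibind_set_gpu_optim(handle, 1);

  /* Compute the mapping and derive the environment variables */
  mpibind(handle);
  mpibind_set_env_vars(handle);

  int i, nvars;
  char **names = mpibind_get_env_var_names(handle, &nvars);
  for (i = 0; i < nvars; i++) {
    /* One value per task; print task 0's value only */
    char **values = mpibind_get_env_var_values(handle, names[i]);
    printf("%s=%s\n", names[i], values[0]);
  }

  mpibind_finalize(handle);
  hwloc_topology_destroy(topo);
  return 0;
}
```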
-------------------------------------------------------------------------------- /test-suite/expected/expected.epyc-corona.v1: -------------------------------------------------------------------------------- 1 | # Line that start with a pound are comments! 2 | # The first non-commented line should be the number of tests. 3 | # After the number of tests, each answer description consist of 4 lines: 4 | # The test description, the thread mapping, the cpu mapping, and the 5 | # gpu_mapping.The mapping for each task is separated by a defined character. 6 | # This separator can be changed in test_utils.c::parse_answer() 7 | 8 | 9 | # 1: Map one task to every core 10 | Map one task to every core 11 | "1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1" 12 | "6,54;6,54;7,55;7,55;8,56;8,56;9,57;9,57;10,58;10,58;11,59;11,59;12,60;12,60;13,61;13,61;14,62;14,62;15,63;15,63;16,64;16,64;17,65;17,65;30,78;30,78;31,79;31,79;32,80;32,80;33,81;33,81;34,82;34,82;35,83;35,83;42,90;42,90;43,91;43,91;44,92;44,92;45,93;45,93;46,94;46,94;47,95;47,95" 13 | "0;0;0;0;0;0;0;0;0;0;0;0;1;1;1;1;1;1;1;1;1;1;1;1;2;2;2;2;2;2;2;2;2;2;2;2;3;3;3;3;3;3;3;3;3;3;3;3" 14 | 15 | # 2: Map 1 task greedily 16 | Map 1 task greedily 17 | "96" 18 | "0-95" 19 | "0-3" 20 | 21 | 22 | # 3: Map two tasks greedily 23 | Map two tasks greedily 24 | "48;48" 25 | "0-23,48-71;24-47,72-95" 26 | "0-1;2-3" 27 | 28 | # 4: 29 | Mapping such that ntasks < #NUMA nodes but nworkers > #NUMA nodes 30 | "24;12;12;12;12;12;12" 31 | "0-11,48-59;12-17,60-65;18-23,66-71;24-29,72-77;30-35,78-83;36-41,84-89;42-47,90-95" 32 | "0;1;;;2;;3" 33 | 34 | # 5: 35 | Restrict x tasks a single core (x == machine's smt level) 36 | "1;1" 37 | "0;48" 38 | ";" 39 | 40 | # 6: 41 | Map two tasks at SMT 1 42 | "48;48" 43 | "0-23,48-71;24-47,72-95" 44 | "0-1;2-3" 45 | 46 | 47 | # 7: 48 | Map 2 tasks at max smt (across all cores) 49 | "48;48" 50 | "0-23,48-71;24-47,72-95" 51 | "0-1;2-3" 52 | 53 | 54 | # 8: 55 | Map tasks at max smt-1 56 | "48;48" 57 | "0-23,48-71;24-47,72-95" 58 | "0-1;2-3" 59 | 60 | 61 | # 9: 62 | Map two tasks, but restrict them to a single NUMA domain 63 | "3;3" 64 | "0-2;3-5" 65 | ";" 66 | 67 | 68 | # 10: 69 | Map num_numa tasks without GPU optimization 70 | "6;6;6;6;6;6;6;6" 71 | "0-5;6-11;12-17;18-23;24-29;30-35;36-41;42-47" 72 | ";0;1;;;2;;3" 73 | 74 | 75 | # 11: 76 | Map num_numa tasks with GPU optimization 77 | "3;3;3;3;3;3;3;3" 78 | "6-8;9-11;12-14;15-17;30-32;33-35;42-44;45-47" 79 | "0;0;1;1;2;2;3;3" 80 | 81 | 82 | # 12: 83 | Map 8 tasks to a single pu 84 | "1;1;1;1;1;1;1;1" 85 | "0;0;0;0;0;0;0;0" 86 | ";;;;;;;" 87 | -------------------------------------------------------------------------------- /test-suite/python/py-coral-ea.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_utils import * 5 | 6 | topology_file = "../topo-xml/coral-ea-hwloc1.xml" 7 | answer_file = "./expected/expected.coral-ea" 8 | 9 | # test class that inherits from unittest 10 | # test cases are added in main body 11 | class TestCoralEA(unittest.TestCase): 12 | pass 13 | 14 | if __name__ == "__main__": 15 | # read expected file 16 | test_info = parse_expected(answer_file) 17 | 18 | # add setup and teardown functions 19 | setattr(TestCoralEA, 'setUp', setup_generator(topology_file)) 20 | setattr(TestCoralEA, 'tearDown', teardown_generator()) 21 | 22 | # build and add test cases from the expected file 23 | for single_test_info in test_info: 24 | 
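        # make_test_name() turns the plain-English description into a test_* method
        # name, and test_generator() returns a unittest method that runs mpibind with
        # the parsed params and compares thread/cpu/gpu mappings to the expected ones.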
setattr(TestCoralEA, make_test_name(single_test_info['description']), 25 | test_generator(single_test_info)) 26 | 27 | #use pycotap to emit TAP from python unit tests 28 | from pycotap import TAPTestRunner 29 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCoralEA) 30 | TAPTestRunner().run(suite) -------------------------------------------------------------------------------- /test-suite/python/py-coral-lassen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_utils import * 5 | 6 | topology_file = "../topo-xml/coral-lassen.xml" 7 | answer_file = "./expected/expected.coral-lassen" 8 | 9 | # test class that inherits from unittest 10 | # test cases are added in main body 11 | class TestCoralLassen(unittest.TestCase): 12 | pass 13 | 14 | if __name__ == "__main__": 15 | # read expected file 16 | test_info = parse_expected(answer_file) 17 | 18 | # add setup and teardown functions 19 | setattr(TestCoralLassen, 'setUp', setup_generator(topology_file)) 20 | setattr(TestCoralLassen, 'tearDown', teardown_generator()) 21 | 22 | # build and add test cases from the expected file 23 | for single_test_info in test_info: 24 | setattr(TestCoralLassen, make_test_name(single_test_info['description']), 25 | test_generator(single_test_info)) 26 | 27 | #use pycotap to emit TAP from python unit tests 28 | from pycotap import TAPTestRunner 29 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCoralLassen) 30 | TAPTestRunner().run(suite) -------------------------------------------------------------------------------- /test-suite/python/py-cts1-quartz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_utils import * 5 | 6 | topology_file = "../topo-xml/cts1-quartz-smt1.xml" 7 | answer_file = "./expected/expected.cts1-quartz" 8 | 9 | # test class that inherits from unittest 10 | # test cases are added in main body 11 | class TestCTS1Quartz(unittest.TestCase): 12 | pass 13 | 14 | if __name__ == "__main__": 15 | # read expected file 16 | test_info = parse_expected(answer_file) 17 | 18 | # add setup and teardown functions 19 | setattr(TestCTS1Quartz, 'setUp', setup_generator(topology_file)) 20 | setattr(TestCTS1Quartz, 'tearDown', teardown_generator()) 21 | 22 | # build and add test cases from the expected file 23 | for single_test_info in test_info: 24 | setattr(TestCTS1Quartz, make_test_name(single_test_info['description']), 25 | test_generator(single_test_info)) 26 | 27 | #use pycotap to emit TAP from python unit tests 28 | from pycotap import TAPTestRunner 29 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCTS1Quartz) 30 | TAPTestRunner().run(suite) -------------------------------------------------------------------------------- /test-suite/python/py-epyc-corona.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python3 3 | 4 | import unittest 5 | from test_utils import * 6 | 7 | topology_file = "../topo-xml/epyc-corona.xml" 8 | answer_file = "./expected/expected.epyc-corona" 9 | 10 | # test class that inherits from unittest 11 | # test cases are added in main body 12 | class TestEpycCorona(unittest.TestCase): 13 | pass 14 | 15 | if __name__ == "__main__": 16 | # read expected file 17 | test_info = parse_expected(answer_file) 18 | 19 | # add setup and teardown functions 20 | setattr(TestEpycCorona, 'setUp', setup_generator(topology_file)) 
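    # tearDown (added next) finalizes the mpibind handle that setUp created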
21 | setattr(TestEpycCorona, 'tearDown', teardown_generator()) 22 | 23 | # build and add test cases from the expected file 24 | for single_test_info in test_info: 25 | setattr(TestEpycCorona, make_test_name(single_test_info['description']), 26 | test_generator(single_test_info)) 27 | 28 | #use pycotap to emit TAP from python unit tests 29 | from pycotap import TAPTestRunner 30 | suite = unittest.TestLoader().loadTestsFromTestCase(TestEpycCorona) 31 | TAPTestRunner().run(suite) -------------------------------------------------------------------------------- /test-suite/python/test_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import mpibind 3 | import unittest 4 | import re 5 | import itertools 6 | 7 | 8 | # Based on https://stackoverflow.com/questions/4628333/\ 9 | # converting-a-list-of-integers-into-range-in-python 10 | # lst: [0, 1, 2, 3, 4, 7, 8, 9, 11] 11 | # key: 0 group: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)] 12 | # key: 2 group: [(5, 7), (6, 8), (7, 9)] 13 | # key: 3 group: [(8, 11)] 14 | def ints2ranges(lst): 15 | '''Convert an integer list into a range generator''' 16 | key_func = lambda pair: pair[1] - pair[0] 17 | for key, grp in itertools.groupby(enumerate(lst), key_func): 18 | grp = list(grp) 19 | beg = grp[0][1] 20 | end = grp[-1][1] 21 | 22 | res = "{}".format(beg) 23 | if beg != end: 24 | res += "-{}".format(end) 25 | 26 | yield res 27 | 28 | 29 | def parse_expected(answer_file): 30 | """ 31 | Parse parameters and answers from the expected file 32 | 33 | :param answer_file: file path of the expected file 34 | :type answer_file: string or path-like 35 | :return: test information for each of the tests in the answer file 36 | :rtype: list of dictionaries 37 | """ 38 | line_types = [ 39 | 'description', 40 | 'thread_mapping', 41 | 'cpu_mapping', 42 | 'gpu_mapping' 43 | ] 44 | test_info = [] 45 | cur_answer = dict() 46 | type_idx = 0 47 | with open(answer_file, 'r') as f: 48 | for line in f.readlines(): 49 | line = line.strip() 50 | if not line: 51 | continue 52 | if line[0] == '#' and 'params' not in line: 53 | continue 54 | 55 | if 'params' in line: 56 | json_string = line.replace('# ', '') 57 | cur_answer['params'] = json.loads(json_string)['params'] 58 | else: 59 | cur_answer[line_types[type_idx]] = line.replace('"', '') 60 | type_idx += 1 61 | 62 | if type_idx == 4: 63 | test_info.append(cur_answer) 64 | type_idx = 0 65 | cur_answer = dict() 66 | 67 | return test_info 68 | 69 | def get_actual(handle, single_test_info): 70 | """ 71 | Use the test info to paramaterize a mpibind handle 72 | and then compute a mapping. 
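    The params keys mirror mpibind's inputs: ntasks, in_nthreads, greedy,
    gpu_optim, smt, restr_set, and restrict_type.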
73 | 74 | :param handle: the mpibind handle 75 | :type handle: MpibindHandle 76 | :param single_test_info: the test information for a single test 77 | :type single_test_info: dictionary 78 | :return: 3-tuple of answers 79 | :rtype: tuple of strings 80 | """ 81 | handle.ntasks = single_test_info['params']['ntasks'] 82 | handle.nthreads = single_test_info['params']['in_nthreads'] 83 | handle.greedy = single_test_info['params']['greedy'] 84 | handle.gpu_optim = single_test_info['params']['gpu_optim'] 85 | handle.smt = single_test_info['params']['smt'] 86 | handle.restrict_ids = single_test_info['params']['restr_set'] 87 | handle.restrict_type = single_test_info['params']['restrict_type'] 88 | handle.mpibind() 89 | 90 | thread_mapping = ';'.join([str(ele) for ele in handle.nthreads]) 91 | gpu_mapping = ';'.join([','.join(handle.get_gpus_ptask(i)) for i in range(handle.ntasks)]) 92 | #cpu_mapping = ';'.join([handle.get_cpus_ptask(i) 93 | # for i in range(handle.ntasks)]) 94 | # Since 'get_cpus_ptask' now returns a list of ints, 95 | # convert the list into ranges as a string 96 | cpu_mapping = [] 97 | for i in range(handle.ntasks): 98 | # ints2ranges is a generator, thus make it a list 99 | # and join the ranges with commas 100 | cpu_lst = list(ints2ranges(handle.get_cpus_ptask(i))) 101 | cpu_mapping.append(','.join(cpu_lst)) 102 | # The mapping of tasks is separated by semicolons 103 | cpu_mapping = ';'.join(cpu_mapping) 104 | 105 | return thread_mapping, cpu_mapping, gpu_mapping 106 | 107 | def make_test_name(description): 108 | """ 109 | generate a test name from a description 110 | 111 | :param description: plain english description of a test 112 | :type description: string 113 | :return: the generated test name 114 | :rtype: string 115 | """ 116 | return 'test_' + re.sub(r'\s+', '_', description.strip().lower()) 117 | 118 | def test_generator(single_test_info): 119 | """ 120 | generate a python unit test from the given test info 121 | 122 | :param single_test_info: the test information for a single test 123 | :type single_test_info: dictionary 124 | :return: the generated test 125 | :rtype: function 126 | """ 127 | def test(self): 128 | thread_mapping, cpu_mapping, gpu_mapping = get_actual(self.handle, single_test_info) 129 | self.assertEqual(thread_mapping, single_test_info['thread_mapping']) 130 | self.assertEqual(cpu_mapping, single_test_info['cpu_mapping']) 131 | self.assertEqual(gpu_mapping, single_test_info['gpu_mapping']) 132 | return test 133 | 134 | def setup_generator(topology_file): 135 | """ 136 | generate the setup functino 137 | 138 | :param topology_file: the path to the topology file to use during testing 139 | :type topology_file: string 140 | :return: the generated setup function 141 | :rtype: function 142 | """ 143 | def setUp(self): 144 | mpibind.topology_set_xml(topology_file) 145 | self.handle = mpibind.MpibindHandle() 146 | return setUp 147 | 148 | def teardown_generator(): 149 | """ 150 | generate the teardown function 151 | 152 | :return: the generated teardown function 153 | :rtype: function 154 | """ 155 | def tearDown(self): 156 | self.handle.finalize() 157 | return tearDown 158 | -------------------------------------------------------------------------------- /test-suite/test_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIBIND_TEST_UTILS_H 2 | #define MPIBIND_TEST_UTILS_H 3 | #include 4 | #include 5 | #include 6 | #include "mpibind.h" 7 | #include "tap.h" 8 | 9 | /** 10 | * The number of tests present in 
the test_suite. 11 | * This will be referenced when parsing answers and 12 | * generating tests. This is also used to ensure the number of 13 | * tests and number of answers are consistent. 14 | * **/ 15 | #define NUM_TESTS 12 16 | 17 | /** 18 | * Representation of a test answer 19 | * **/ 20 | typedef struct { 21 | char* description; 22 | char* thread_mapping; 23 | char* cpu_mapping; 24 | char* gpu_mapping; 25 | } mpibind_test_out_t; 26 | /** 27 | * Input parameters for a test. This mimics 28 | * the structure of mpibind_t, but is defined 29 | * separately to make the tests independent of 30 | * mpibind_t's definition. 31 | * **/ 32 | typedef struct { 33 | /* Input parameters */ 34 | hwloc_topology_t topo; 35 | int ntasks; 36 | int in_nthreads; 37 | int greedy; 38 | int gpu_optim; 39 | int smt; 40 | char* restr_set; 41 | int restr_type; 42 | } mpibind_test_in_t; 43 | /** 44 | * Initialize a test struct to default values. 45 | * This mimics the behavior of mpibind_init 46 | * **/ 47 | int mpibind_test_in_t_init(mpibind_test_in_t* hdl); 48 | /** 49 | * Frees an answer 50 | * **/ 51 | void mpibind_test_out_t_free(mpibind_test_out_t* t); 52 | /** 53 | * Frees a test object 54 | * **/ 55 | void mpibind_test_in_t_free(mpibind_test_in_t* t); 56 | /** 57 | * Prints the current state of an mpibind_test_in_t object 58 | * **/ 59 | void mpibind_test_in_t_print(mpibind_test_in_t* params); 60 | /** Helper function to check the cpu, gpu, and thread mappings**/ 61 | void check_mapping(mpibind_t* handle, mpibind_test_out_t* expected); 62 | /** 63 | * Runs a set of tests and compares them to their answers. 64 | * **/ 65 | void run_test(hwloc_topology_t topo, mpibind_test_in_t *params, mpibind_test_out_t *expected); 66 | /** 67 | * Generate unit test information from a topology. 68 | * This will take high level information and create an array of 69 | * of objects containing parameters for each of the tests. The number 70 | * of tests created is passed back via num_test_ptr 71 | * **/ 72 | mpibind_test_in_t** generate_test_information(hwloc_topology_t); 73 | /** load an xml file into a topology **/ 74 | void load_topology(hwloc_topology_t* topo, char* xml_file); 75 | /** 76 | * Loads a set of test answers from a file. 77 | * num_test_ptr will be used to store the number of answers parsed 78 | * **/ 79 | mpibind_test_out_t** load_answers(char* filename); 80 | /** Performs a unit test using a given topology xml and an answer file. 81 | * Test drivers should call this method 82 | * **/ 83 | void unit_test_topology(char* topology_filename, char* answer_filename); 84 | #endif 85 | -------------------------------------------------------------------------------- /tutorials/common/archs.md: -------------------------------------------------------------------------------- 1 | # Example architectures 2 | 3 | Here is the node architecture of several Livermore Computing supercomputers. A summary of key features for these machines, including differing numbers of processors, NUMA domains, GPUs, and cores, are summarized in the table below, with images depicting the topology of each below that. 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 |
| | Per node | Per processor | Per core |
|:-|:-|:-|:-|
| Corona | 2 AMD Rome processors<br>8 AMD MI50 GPUs | 1 NUMA domain<br>8 L3s, 24 L2s, 24 L1s<br>24 cores | 2 HW threads |
| Lassen | 2 IBM Power9 processors<br>4 NVIDIA Volta GPUs | 1 NUMA domain<br>10 L3s, 10 L2s, 20 L1s<br>20 cores | 4 HW threads |
| Pascal | 2 Intel Broadwell processors<br>2 NVIDIA Pascal GPUs | 1 NUMA domain<br>1 L3, 18 L2s, 18 L1s<br>18 cores | 2 HW threads |
| Poodle | 2 Intel Sapphire Rapids processors | 4 NUMA domains<br>1 L3, 56 L2s, 56 L1s<br>56 cores | 2 HW threads |
| RZAdams | 4 AMD Instinct MI300A APUs:<br>4 processors + 4 GPUs | 1 NUMA domain<br>3 L3s, 24 L2s, 24 L1s<br>24 cores | 2 HW threads |
| Tioga | 1 AMD 3rd Gen EPYC processor<br>8 AMD Instinct MI250X GPUs | 4 NUMA domains<br>8 L3s, 64 L2s, 64 L1s<br>64 cores | 2 HW threads |
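The component counts in the table above can be verified programmatically with hwloc, which the tutorial modules rely on throughout. Below is a minimal sketch, assuming hwloc 2.x; it reads the live node by default, or one of this repository's canned topologies if `HWLOC_XMLFILE` points at a file under `topo-xml/` (the path choice is up to the reader):

```
#include <stdio.h>
#include <hwloc.h>

/* Count the components summarized in the table above.
 * Run on a live node, or set HWLOC_XMLFILE=topo-xml/<machine>.xml
 * to inspect one of the canned topologies offline. */
int main(void) {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);

  printf("Packages: %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE));
  printf("NUMA domains: %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE));
  printf("L3 caches: %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_L3CACHE));
  printf("Cores: %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE));
  printf("HW threads (PUs): %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU));

  hwloc_topology_destroy(topo);
  return 0;
}
```

The same information is available graphically through hwloc's `lstopo`/`hwloc-ls` tools.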
77 | 78 | 79 | 80 | | Tioga | 81 | |:--:| 82 | |![Tioga](../figures/tioga-web.png "Tioga (MI250X)")| 83 | 84 | | RZAdams | 85 | |:--:| 86 | |![Tioga](../figures/rzadams-web.png "RZAdams (MI300A)")| 87 | 88 | | Corona | 89 | |:--:| 90 | |![Corona](../figures/corona-web.png "Corona")| 91 | 92 | | Pascal | 93 | |:--:| 94 | |![Pascal](../figures/pascal-web.png "Pascal")| 95 | 96 | | Poodle | 97 | |:--:| 98 | |![Poodle](../figures/poodle-web.png "Poodle")| 99 | 100 | | Lassen | 101 | |:--:| 102 | |![Lassen](../figures/lassen-web.png "Lassen")| 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 120 | -------------------------------------------------------------------------------- /tutorials/cug23/README.md: -------------------------------------------------------------------------------- 1 | # CUG 2023: Supercomputer Affinity 2 | 3 | *Edgar A. León* and *Jane E. Herriman*
4 | Lawrence Livermore National Laboratory 5 | 6 | ## Schedule 7 | 8 |
9 | 10 | | Begin | End | Topic | 11 | |-:|-:|:-| 12 | | 8:30 | 8:50 | Introduction + Setup | 13 | | 8:50 | 9:30 | Architecture Topology | 14 | | 9:30 | 10:00 | Process Affinity | 15 | | *10:00* | *10:30* | *Coffee* | 16 | | 10:30 | 10:40 | Process Affinity Cont. | 17 | | 10:40 | 11:00 | Hands-on Exercises | 18 | | 11:00 | 11:40 | GPU Affinity | 19 | |11:40 | 12:00 | Hands-on Exercises | 20 | 21 |
22 | 23 | ## AWS Cluster 24 | 25 | Accounts: `user5`, `user6`, ..., `user35` 26 | 27 | Password: 28 | 29 | ``` 30 | ssh user5@ 31 | 32 | source /home/tutorial/scripts/user-env.sh 33 | 34 | srun -N1 -n1 mpi 35 | ``` 36 | 37 | 38 | ## Tutorial Notebook 39 | 40 |
41 |

42 | 43 |

44 | 45 | 46 | 1. Making sense of affinity: [Discovering the node architecture topology](module1.md) 47 | 48 | Learn how to identify the compute and memory components of a 49 | compute node using `hwloc`. A precise understanding of the hardware 50 | resources is needed to map an application to the machine 51 | efficiently. This includes identifying the node's GPUs, cores, 52 | hardware threads, cache hierarchy, NUMA domains, and network 53 | interfaces. Furthermore, attendees will be introduced to locality, 54 | will identify local hardware resources, and will select resources 55 | using affinity masks. 56 | 57 | 2. Exerting resource manager affinity: [Process affinity with Slurm](module2.md) 58 | 59 | Learn how to use Slurm’s affinity to map a parallel program to the 60 | hardware at runtime when submitting a job. Attendees will learn to 61 | construct CPU-based bindings using low-level and high-level 62 | abstractions. High-level bindings are driven by hardware components 63 | such as Cores and Sockets. 64 | 65 | 3. Putting it all together: [Adding in GPUs](module3.md) 66 | 67 | Learn how to assign GPUs to MPI processes to leverage 68 | locality. Learn how to apply combined process and GPU 69 | affinity policies. Attendees will learn to 70 | manage CPU and GPU affinity concurrently to take advantage of local 71 | resources and reduce data movement. 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /tutorials/cug24/README.md: -------------------------------------------------------------------------------- 1 | # Supercomputer Affinity on HPE Systems 2 | ## CUG 2024 3 | 4 | *Edgar A. León* and *Jane E. Herriman*
5 | Lawrence Livermore National Laboratory 6 | 7 | ## Schedule 8 | 9 |
10 | 11 | | Begin | End | Topic | 12 | |-:|-:|:-| 13 | | 8:30 | 8:50 | Introduction + Setup | 14 | | 8:50 | 9:20 | Module 1: Architecture Topology | 15 | | 9:20 | 9:50 | Module 2: Process Affinity | 16 | | 9:50 | 10:00 | Module 2: Hands-on Exercises | 17 | | *10:00* | *10:30* | *Coffee* | 18 | | 10:30 | 10:40 | Module 2: Hands-on Exercises | 19 | | 10:40 | 11:10 | Module 3: GPU Affinity | 20 | | 11:10 | 11:30 | Module 3: Hands-on exercises | 21 | | 11:30 | 12:00 | Module 4: Flux affinity on the AMD MI300A APU | 22 | 23 |
24 | 25 | 40 | 41 | ## Tutorial Notebook 42 | 43 |
44 |

45 | 46 |

47 | 48 | 49 | 1. [Discovering the node architecture topology](module1.md) 50 | 51 | Learn how to identify the compute and memory components of a 52 | compute node using `hwloc`. A precise understanding of the hardware 53 | resources is needed to map an application to the machine 54 | efficiently. This includes identifying the node's GPUs, cores, 55 | hardware threads, cache hierarchy, NUMA domains, and network 56 | interfaces. Furthermore, attendees will be introduced to locality, 57 | will identify local hardware resources, and will select resources 58 | using affinity masks. 59 | 60 | 2. [Process affinity with Slurm](module2.md) 61 | 62 | Learn how to use Slurm’s affinity to map a parallel program to the 63 | hardware at runtime when submitting a job. Attendees will learn to 64 | construct CPU-based bindings using low-level and high-level 65 | abstractions. High-level bindings are driven by hardware components 66 | such as Cores and Sockets. 67 | 68 | 3. [Adding in GPUs](module3.md) 69 | 70 | Learn how to assign GPUs to MPI processes to leverage 71 | locality. Learn how to apply combined process and GPU 72 | affinity policies. Attendees will learn to 73 | manage CPU and GPU affinity concurrently to take advantage of local 74 | resources and reduce data movement. 75 | 76 | 4. [Process and GPU affinity with Flux](module4.md) 77 | 78 | Learn the basics of the Flux resource manager to launch parallel programs on a supercomputer. Attendees will learn how to apply combined process and GPU affinity policies using Flux. 79 | 80 | 81 | -------------------------------------------------------------------------------- /tutorials/cug24/archs.md: -------------------------------------------------------------------------------- 1 | # Example architectures 2 | 3 | Here is the node architecture of a few Livermore Computing supercomputer: `Tioga`, `RZAdams`, `Corona`, and `Pascal`. A summary of key features for these machines, including differing numbers of processors, NUMA domains, GPUs, and cores, are summarized in the table below, with images depicting the topology of each below that. 4 | 5 | In particular, note that `Tioga` includes MI250X GPUs and `RZAdams` is composed of MI300A APUs. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 |
| | Tioga | RZAdams | Corona | Pascal |
|:-|:-|:-|:-|:-|
| Per node | 1 AMD 3rd Gen EPYC processor<br>8 AMD Instinct MI250X GPUs | 4 AMD Instinct MI300A APUs:<br>4 processors + 4 GPUs | 2 AMD Rome processors<br>8 AMD MI50 GPUs | 2 Intel Broadwell processors<br>2 NVIDIA Pascal GPUs |
| Per processor | 4 NUMA domains<br>8 L3s, 64 L2s, 64 L1s<br>64 cores | 1 NUMA domain<br>3 L3s, 24 L2s, 24 L1s<br>24 cores | 1 NUMA domain<br>8 L3s, 24 L2s, 24 L1s<br>24 cores | 1 NUMA domain<br>1 L3, 18 L2s, 18 L1s<br>18 cores |
| Per core | 2 hardware threads | 2 hardware threads | 2 hardware threads | 2 hardware threads |
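Since GPU locality is a central theme for both Tioga (MI250X) and RZAdams (MI300A), it can help to see which GPU devices hwloc actually exposes — these are the OS devices mpibind draws from when assigning GPUs to tasks. A hedged sketch, assuming hwloc 2.x with I/O discovery enabled:

```
#include <stdio.h>
#include <hwloc.h>

/* List the GPU-like OS devices (e.g., rsmi/nvml devices) that hwloc
 * sees on the node. */
int main(void) {
  hwloc_topology_t topo;
  hwloc_obj_t dev = NULL;

  hwloc_topology_init(&topo);
  /* Keep I/O devices in the topology, as the mpibind tests do */
  hwloc_topology_set_type_filter(topo, HWLOC_OBJ_OS_DEVICE,
                                 HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
  hwloc_topology_load(topo);

  while ((dev = hwloc_get_next_osdev(topo, dev)) != NULL)
    if (dev->attr->osdev.type == HWLOC_OBJ_OSDEV_GPU ||
        dev->attr->osdev.type == HWLOC_OBJ_OSDEV_COPROC)
      printf("%s\n", dev->name);

  hwloc_topology_destroy(topo);
  return 0;
}
```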
55 | 56 | | Tioga | 57 | |:--:| 58 | |![Tioga](../figures/tioga-web.png "Tioga (MI250X)")| 59 | 60 | | RZAdams | 61 | |:--:| 62 | |![Tioga](../figures/rzadams-web.png "RZAdams (MI300A)")| 63 | 64 | | Corona | 65 | |:--:| 66 | |![Corona](../figures/corona-web.png "Corona")| 67 | 68 | | Pascal | 69 | |:--:| 70 | |![Pascal](../figures/pascal-web.png "Pascal")| 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 90 | -------------------------------------------------------------------------------- /tutorials/eurosys25/README.md: -------------------------------------------------------------------------------- 1 | # 4th Tutorial on Mapping and Affinity (MAP) 2 | 3 | *Edgar A. León*
4 | Lawrence Livermore National Laboratory 5 | 6 | ## Bridging Applications and Hardware 7 | 8 | When we consider the grand challenges addressed by distributed 9 | systems, we likely imagine large-scale machines running parallel 10 | code. Yet, these two pillars of computing – hardware and software – 11 | are not enough to ensure high efficiency and reproducible 12 | performance. When unaware of the topology of the underlying hardware, 13 | even well-designed applications and research software can fail to 14 | achieve their scientific goals. Affinity – how software maps to and 15 | leverages local hardware resources – forms a third pillar critical to 16 | computing systems. 17 | 18 | Multiple factors motivate an understanding of affinity for parallel- 19 | and distributed-computing users. On the software side, applications 20 | are increasingly memory-bandwidth limited making locality more 21 | important. On the hardware side, today’s computer architectures offer 22 | increasingly complex memory and compute topologies, making proper 23 | affinity policies crucial to effective software-hardware assignments. 24 | 25 | In this half-day tutorial, attendees will learn principles behind 26 | effective affinity policies – like understanding the hardware topology 27 | and the importance of memory and GPU locality. They will learn how to 28 | control and apply these policies to create effective, locality-aware 29 | mappings for MPI processes and GPU kernels and to ensure reproducible 30 | performance. These techniques are relevant to both on-premise users 31 | and those using the cloud such as AWS. 32 | 33 | 34 | ## Requirements and Prerequisites 35 | 36 | * Attendees will need a laptop equipped with Wi-Fi, a shell terminal, 37 | and the ssh program. Users will be provided accounts 38 | to access a supercomputer-like environment required for 39 | demonstrations and hands-on exercises. 40 | 41 | * Attendees should have a working knowledge of Unix-like systems. For 42 | example, they should know how to navigate a filesystem and launch 43 | applications from the command line. 44 | 45 | * Attendees will also need some familiarity with high-level parallel 46 | programming concepts. For example, attendees should be comfortable 47 | with terms like thread, process, and GPU, but do not need experience 48 | writing parallel programs. 49 | 50 | 51 | ## Schedule 52 | 53 |
54 | 55 | | Begin | End | Topic | 56 | |-:|-:|:-| 57 | | 14:00 | 14:20 | Introduction + Setup | 58 | | 14:20 | 15:00 | Module 1: Discovering the node architecture topology| 59 | | 15:00 | 15:20 | Module 1: Hands-on exercises | 60 | | 15:20 | 15:30 | Module 2: Mapping processes to the hardware | 61 | | *15:30* | *16:00* | *Coffee* | 62 | | 16:00 | 16:30 | Module 2: Mapping processes to the hardware (cont.)| 63 | | 16:30 | 16:50 | Module 2: Hands-on exercises | 64 | | 16:50 | 17:30 | Module 3: Adding in GPU kernels: Putting it all together | 65 | | 17:30 | 17:45 | Module 3: Hands-on exercises (optional)| 66 | 67 |
68 | 69 | 84 | 85 | ## Notebook 86 | 87 |
88 |

89 | 90 |

91 | 92 | 93 | 1. [Discovering the node architecture topology](module1.md) 94 | 95 | Learn how to identify the compute and memory components of a 96 | compute node using `hwloc`. A precise understanding of the hardware 97 | resources is needed to map an application to the machine 98 | efficiently. This includes identifying the node's GPUs, cores, 99 | hardware threads, cache hierarchy, NUMA domains, and network 100 | interfaces. Furthermore, attendees will be introduced to locality, 101 | will identify local hardware resources, and will select resources 102 | using affinity masks. 103 | 104 | 2. [Mapping processes to the hardware](module2.md) 105 | 106 | Learn how to use the resource manager to map a parallel 107 | program to the 108 | hardware at runtime when submitting a job. Attendees will learn to 109 | construct CPU-based bindings using low-level and high-level 110 | abstractions. High-level bindings are driven by hardware components 111 | such as Cores and Sockets. Furthermore, attendees will learn how to 112 | report affinity on a given system. 113 | 114 | 3. [Adding in GPU kernels: Putting it all together](module3.md) 115 | 116 | Learn how to assign GPUs to MPI processes to leverage 117 | locality. Learn how to apply combined process and GPU 118 | affinity policies. Attendees will learn to 119 | manage CPU and GPU affinity concurrently to take advantage of local 120 | resources and reduce data movement. 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /tutorials/figures/aws-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/aws-architecture.png -------------------------------------------------------------------------------- /tutorials/figures/aws-g4dn-metal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/aws-g4dn-metal.png -------------------------------------------------------------------------------- /tutorials/figures/cache.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/cache.png -------------------------------------------------------------------------------- /tutorials/figures/computing-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/computing-architecture.png -------------------------------------------------------------------------------- /tutorials/figures/corona-merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona-merge.png -------------------------------------------------------------------------------- /tutorials/figures/corona-no-cache-io-physical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona-no-cache-io-physical.png -------------------------------------------------------------------------------- 
/tutorials/figures/corona-no-cache-io.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona-no-cache-io.png -------------------------------------------------------------------------------- /tutorials/figures/corona-physical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona-physical.png -------------------------------------------------------------------------------- /tutorials/figures/corona-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona-web.png -------------------------------------------------------------------------------- /tutorials/figures/corona.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona.pdf -------------------------------------------------------------------------------- /tutorials/figures/corona.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/corona.png -------------------------------------------------------------------------------- /tutorials/figures/hwloc-objects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/hwloc-objects.png -------------------------------------------------------------------------------- /tutorials/figures/lassen-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/lassen-web.png -------------------------------------------------------------------------------- /tutorials/figures/lassen.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/lassen.pdf -------------------------------------------------------------------------------- /tutorials/figures/lassen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/lassen.png -------------------------------------------------------------------------------- /tutorials/figures/mammoth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/mammoth.pdf -------------------------------------------------------------------------------- /tutorials/figures/mammoth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/mammoth.png -------------------------------------------------------------------------------- /tutorials/figures/module4_sockets.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_sockets.png -------------------------------------------------------------------------------- /tutorials/figures/module4_specifyplacement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_specifyplacement.png -------------------------------------------------------------------------------- /tutorials/figures/module4_spread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_spread.png -------------------------------------------------------------------------------- /tutorials/figures/module4_threadstocores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_threadstocores.png -------------------------------------------------------------------------------- /tutorials/figures/module4_threadstocpus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_threadstocpus.png -------------------------------------------------------------------------------- /tutorials/figures/module4_threadstosockets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module4_threadstosockets.png -------------------------------------------------------------------------------- /tutorials/figures/module5_tioga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/module5_tioga.png -------------------------------------------------------------------------------- /tutorials/figures/numa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/numa.png -------------------------------------------------------------------------------- /tutorials/figures/pascal-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/pascal-web.png -------------------------------------------------------------------------------- /tutorials/figures/pascal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/pascal.pdf -------------------------------------------------------------------------------- /tutorials/figures/pascal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/pascal.png 
-------------------------------------------------------------------------------- /tutorials/figures/poodle-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/poodle-web.png -------------------------------------------------------------------------------- /tutorials/figures/ruby.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/ruby.pdf -------------------------------------------------------------------------------- /tutorials/figures/ruby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/ruby.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams-web.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureA.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureB.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureC.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureD.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureE.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureF.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureG.png 
-------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureH.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureI.png -------------------------------------------------------------------------------- /tutorials/figures/rzadams/FigureJ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/rzadams/FigureJ.png -------------------------------------------------------------------------------- /tutorials/figures/sierra.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/sierra.pdf -------------------------------------------------------------------------------- /tutorials/figures/sierra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/sierra.png -------------------------------------------------------------------------------- /tutorials/figures/tioga-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga-web.png -------------------------------------------------------------------------------- /tutorials/figures/tioga.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga.pdf -------------------------------------------------------------------------------- /tutorials/figures/tioga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod1-Ex5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod1-Ex5.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod1-Ex6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod1-Ex6.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod1-Ex7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod1-Ex7.png 
-------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod1-noprocs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod1-noprocs.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex1.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex10.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex11.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex2.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex4.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex5.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex6.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex7.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex8.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod2-Ex9.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod2-Ex9.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex1a.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex1b.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex2a.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex2b.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex3a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex3a.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex3b.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex5a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex5a.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex5b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex5b.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/Tioga-Mod3-Ex6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/Tioga-Mod3-Ex6.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureA.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureA.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureB.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureC.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureD.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureE.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureF.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureG.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureH.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureI.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureJ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureJ.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureK.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureL.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureL.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureM.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/figureN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/figureN.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/tioga-merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/tioga-merge.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/tioga-no-cache-io-physical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/tioga-no-cache-io-physical.png -------------------------------------------------------------------------------- /tutorials/figures/tioga/tioga-no-cache-io.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/figures/tioga/tioga-no-cache-io.png -------------------------------------------------------------------------------- /tutorials/flux/README.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | # Exercising Affinity in Flux 7 | 8 | *Edgar A. Leon* and *Jane E. Herriman*
9 | Lawrence Livermore National Laboratory 10 | 11 | ## Table of contents 12 | 13 | 1. [Flux basics and affinity](module1.md) 14 | 1. [Affinity with mpibind](module2.md) 15 | 16 | 17 | 18 | 19 | 32 | -------------------------------------------------------------------------------- /tutorials/main/README.md: -------------------------------------------------------------------------------- 1 | # Supercomputer Affinity 2 | 3 | *Edgar A. León* and *Jane E. Herriman*
4 | Lawrence Livermore National Laboratory 5 | 6 | ## Tutorial Notebook 7 | 8 |
9 |

10 | 11 |

12 | 13 | 14 | 1. Making sense of affinity: [Discovering the node architecture topology](module1.md) 15 | 16 | Learn how to identify the compute and memory components of a compute node using `hwloc` before learning how to leverage these resources to improve program performance. This includes identifying the node's GPUs, cores, hardware threads, cache hierarchy, NUMA domains, and network interfaces. Furthermore, attendees will be introduced to locality and will identify local hardware resources. 17 | 18 | 2. Applying automatic affinity: [mpibind](module2.md) 19 | 20 | Learn how to map parallel codes to the hardware automatically using `mpibind`. Attendees will learn to turn mpibind on and off and to identify the resources available to processes and threads in each case. They will explore locality effects and learn to tune mpibind to best leverage locality for hybrid applications that are either CPU or GPU constrained. 21 | 22 | 3. Exerting resource manager affinity: [Process affinity with Slurm](module3.md) 23 | 24 | Learn how to use Slurm’s affinity to map a program to the hardware at runtime when submitting a job. Attendees will learn low-level and policy-based binding, e.g., compute-bound, before covering task distribution enumerations. Finally, they will learn how to create affinity masks to specify sets of CPUs (see the affinity-reporting sketch below). 25 | 26 | 4. Exerting thread affinity: [OpenMP](module4.md) 27 | 28 | Learn how to map OpenMP threads to specific hardware resources. Attendees will learn how to map threads explicitly and implicitly using OpenMP’s predefined policies. 29 | 30 | 5. Putting it all together: [Adding in GPUs](module5.md) 31 | 32 | Learn how to assign GPUs to MPI processes and then to OpenMP threads. Learn how to apply combined process, thread, and GPU affinity policies to hybrid applications. Attendees will learn to avoid conflicting directives from the different types of affinity. Furthermore, they will assess whether automatic affinity policies may be sufficient for their use cases. 33 | 34 | -------------------------------------------------------------------------------- /tutorials/tapia22/Sep-09-1045-Supercomputing-Systems-101.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/mpibind/48411c26c9016560083c66b1b87cd8781fdb7fa3/tutorials/tapia22/Sep-09-1045-Supercomputing-Systems-101.pdf -------------------------------------------------------------------------------- /tutorials/tapia22/tapia-setup-instructions.md: -------------------------------------------------------------------------------- 1 | # Setup instructions 2 | 3 | 1. **ssh to AWS** 4 | 5 | If you'd like to ssh to our AWS environment from a native terminal application, you can run `ssh <username>@18.219.49.4` with one of our provided usernames. 6 | 7 | All usernames are of the form `user<N>`, with corresponding passwords `user<N>PASS`. 8 | 9 | For example, you might connect with 10 | 11 | ``` 12 | ssh user15@18.219.49.4 13 | ``` 14 | 15 | When prompted for a password, `user15` would provide `user15PASS`. 16 | 17 | 2. **Run setup script** 18 | 19 | Once you've ssh'ed to AWS, you'll want to run the script `/home/tutorial/aws/user-env.sh` to set up your environment via 20 | 21 | ``` 22 | source /home/tutorial/aws/user-env.sh 23 | ``` 24 | --------------------------------------------------------------------------------
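The tutorial modules above repeatedly ask attendees to report which CPUs a process is bound to; the repository's own `affinity` example programs do this in full. As a quick, self-contained reference, here is a minimal, Linux-only sketch (not part of the original tutorial materials) that prints the affinity mask of the calling process. The source file name in the build comment is illustrative.

```c
/* where-am-i.c: print the CPUs the calling process is allowed to run on.
 * Minimal Linux-only sketch using sched_getaffinity.
 * One possible build line:  cc where-am-i.c -o where-am-i
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
  cpu_set_t mask;
  CPU_ZERO(&mask);

  /* pid 0 means "the calling process" */
  if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
    perror("sched_getaffinity");
    return 1;
  }

  printf("pid %d can run on %d CPUs:", (int)getpid(), CPU_COUNT(&mask));
  for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
    if (CPU_ISSET(cpu, &mask))
      printf(" %d", cpu);
  printf("\n");

  return 0;
}
```

Launching one copy of such a program per task under the resource manager should show directly how the binding options discussed in the Slurm, OpenMP, and GPU modules change the set of CPUs each process is allowed to use.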