├── .gitignore
├── LICENSE
├── Manifest.toml
├── Project.toml
├── README.md
├── help
│   ├── NERSC Education Resources.pdf
│   ├── README.md
│   ├── cpunode.sh
│   ├── gpunode.sh
│   ├── jupyter-kernel
│   │   ├── install.sh
│   │   └── julia-tutorial
│   │       ├── kernel-helper.sh
│   │       ├── kernel.json
│   │       ├── logo-32x32.png
│   │       └── logo-64x64.png
│   ├── perlmutter_cheatsheet.md
│   └── vscode_cheatsheet.md
├── imgs
│   └── julia_hpc_workshop.png
├── julia_wrapper.sh
├── onboarding
│   ├── README.md
│   ├── intro.pdf
│   ├── julia_vscode_on_perlmutter.pdf
│   └── overview.pdf
├── parts
│   ├── diffusion_2d
│   │   ├── README.md
│   │   ├── diffusion_2d.ipynb
│   │   ├── diffusion_2d.jl
│   │   ├── diffusion_2d_loop.jl
│   │   └── imgs
│   │       ├── initial.png
│   │       └── stagg_2D.png
│   ├── distributed
│   │   └── explanation
│   │       ├── 01_distributed.ipynb
│   │       ├── 01_distributed.slides.html
│   │       ├── 02_dagger.ipynb
│   │       ├── 02_dagger.slides.html
│   │       └── Project.toml
│   ├── gpu
│   │   ├── README.md
│   │   ├── advanced
│   │   │   ├── closest_device.jl
│   │   │   ├── job_gpu_mpi_multinode.sh
│   │   │   └── job_gpu_mpi_singlenode.sh
│   │   ├── diffusion_2d_cuda.jl
│   │   ├── diffusion_2d_cuda_mpi.jl
│   │   ├── get_gpu_compute_node_interactive.sh
│   │   ├── gpu.ipynb
│   │   ├── imgs
│   │   │   ├── cpu_gpu_evo.png
│   │   │   ├── cuda_grid.png
│   │   │   └── frontier.png
│   │   ├── job_bench_gpu.sh
│   │   ├── job_gpu_mpi_multinode.sh
│   │   ├── job_gpu_mpi_singlenode.sh
│   │   ├── multigpu.jl
│   │   ├── slurm
│   │   │   ├── hello.jl
│   │   │   ├── job_hello_multinode.sh
│   │   │   └── job_hello_singlenode.sh
│   │   ├── solution
│   │   │   ├── diffusion_2d_cuda.jl
│   │   │   ├── diffusion_2d_cuda_mpi.jl
│   │   │   ├── job_bench_gpu.sh
│   │   │   ├── job_gpu_mpi_multinode.sh
│   │   │   ├── job_gpu_mpi_singlenode.sh
│   │   │   └── visualize_mpi.jl
│   │   ├── visualize.jl
│   │   └── visualize_mpi.jl
│   ├── mpi
│   │   ├── README.md
│   │   ├── diffusion_2d_mpi.jl
│   │   ├── explanation
│   │   │   ├── 01_mpi+jupyter.ipynb
│   │   │   ├── 01_mpi+jupyter.slides.html
│   │   │   ├── 02_comms.ipynb
│   │   │   ├── 02_comms.slides.html
│   │   │   ├── 03_halo.ipynb
│   │   │   ├── 03_halo.slides.html
│   │   │   ├── Project.toml
│   │   │   ├── advanced
│   │   │   │   └── 00_gpu_select.ipynb
│   │   │   ├── diffusion_2d_halo_exchange.pdf
│   │   │   ├── diffusion_2d_halo_exchange.png
│   │   │   └── l8_1D_global_grid.png
│   │   ├── get_compute_node_interactive.sh
│   │   ├── job_mpi_multinode.sh
│   │   ├── job_mpi_singlenode.sh
│   │   ├── solution
│   │   │   ├── diffusion_2d_mpi.jl
│   │   │   ├── job_mpi_multinode.sh
│   │   │   ├── job_mpi_singlenode.sh
│   │   │   ├── multinode_results.txt
│   │   │   ├── slurm_mpi_singlenode.out
│   │   │   ├── visualization_before.png
│   │   │   └── visualization_desired.png
│   │   ├── visualize_mpi.ipynb
│   │   └── visualize_mpi.jl
│   ├── multithreading
│   │   ├── README.md
│   │   ├── diffusion_2d_threads.jl
│   │   ├── imgs
│   │   │   ├── amd_milan_cpu_die.svg
│   │   │   ├── stack_heap_threads.png
│   │   │   ├── stack_heap_threads.svg
│   │   │   ├── tasks_threads_cores.svg
│   │   │   └── topo.svg
│   │   ├── job_bench_threads.sh
│   │   ├── job_compare_threads_serial.sh
│   │   ├── multithreading.ipynb
│   │   └── solution
│   │       ├── bench_threads.jl
│   │       ├── diffusion_2d_threads.jl
│   │       ├── job_bench_threads.sh
│   │       ├── job_compare_threads_serial.sh
│   │       ├── slurm_bench_threads.out
│   │       └── slurm_compare_threads_serial.out
│   └── shared.jl
└── setup.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # misc
2 | .DS_Store
3 | .vscode
4 | .ipynb_checkpoints
5 |
6 | # julia
7 | # Manifest.toml
8 |
9 | # output
10 | out*
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 JuliaHPC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
1 | [deps]
2 | AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
3 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
4 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
5 | CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
6 | CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
7 | Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
8 | IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
9 | JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
10 | MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
11 | NUMA = "292f1341-b53f-425a-80e5-3597ad0961bf"
12 | ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # JuliaCon24 Workshop: Hands-on with Julia for HPC on GPUs and CPUs
2 |
3 |
4 |
5 |
6 |
7 | **Instructors:** [Carsten Bauer](https://github.com/carstenbauer), [Ludovic Räss](https://github.com/luraess), [Ivan Utkin](https://github.com/utkinis), and [Johannes Blaschke](https://github.com/JBlaschke) (remote).
8 |
9 | **Where:** TU-Eindhoven 0.244
10 |
11 | **When:** July 9th, 1:30 PM (CEST)
12 |
13 | **More:** https://pretalx.com/juliacon2024/talk/NTQZJJ/
14 |
15 | ## Schedule
16 |
17 | * **Onboarding**
18 | * [Introduction](./onboarding/intro.pdf)
19 | * [NERSC overview](./onboarding/overview.pdf)
20 | * [Julia + VS Code on Perlmutter](./onboarding/julia_vscode_on_perlmutter.pdf)
21 |
22 | * **Introducing the example**
23 | * [2D linear diffusion solver](./parts/diffusion_2d)
24 |
25 | * **Parallelization on Perlmutter**
26 | * [Multithreading](./parts/multithreading)
27 |
28 | (short break)
29 | * [MPI parallelization](./parts/mpi)
30 | * [GPU acceleration](./parts/gpu)
31 |
32 | ## Prepare for the workshop
33 |
34 | To begin with, make sure that you have [VS Code](https://code.visualstudio.com/download) installed on your laptop.
35 |
36 | ### VS Code → Perlmutter (via SSH)
37 |
38 | 1) In VS Code, press `F1` and run the `Remote-SSH: Open SSH Host...` command.
39 |     - If the command isn't available, make sure that the [Remote - SSH extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) is installed (it should be available out of the box).
40 | 2) Enter `trainXY@perlmutter.nersc.gov` (with `trainXY` replaced by your training account) and press enter.
41 | 3) In the popup input box, enter your password and press enter.
42 |
43 | After a second or two, you should have VS Code running on a Perlmutter login node! 🎉
44 |
45 |
46 | ### On Perlmutter
47 | 1. Clone the workshop materials into `$SCRATCH/juliacon24-hpcworkshop` by running the following command.
48 |
49 | git clone https://github.com/JuliaHPC/juliacon24-hpcworkshop $SCRATCH/juliacon24-hpcworkshop
50 |
51 | * **You will always work in this folder (`$SCRATCH/juliacon24-hpcworkshop`) during the workshop.**
52 | 2. Run the following commands:
53 |
54 | cd $SCRATCH/juliacon24-hpcworkshop
55 | ./setup.sh
56 |
57 |
58 | What does this do? (click me if you're curious)
59 |
60 | * The setup script
61 | * modifies your `$HOME/.bashrc` to
62 | * permanently put your Julia depot onto the parallel file system (`$SCRATCH/.julia`)
63 | * auto-load the Julia module when you login (such that the `julia` command is available)
64 | * make `mpiexecjl` available (i.e. modify `$PATH`)
65 | * instantiates the Julia environment
66 | * installs MPI.jl's `mpiexecjl` wrapper
67 | * installs a Jupyter kernel (for NERSC's Jupyter hub)
68 |
69 |
70 |
71 | 3. **!! Before you proceed, restart VS Code !!**
72 | * Close it fully, open it again, and connect to Perlmutter again (see above). Otherwise the `.bashrc` changes won't be in effect.
73 |
74 | 4. Let's now turn to the Julia VS Code extension.
75 |
76 | 1) Installing the extension
77 | - Open the extensions view (press `CTRL/CMD + SHIFT + X`).
78 | - Search for `julia`.
79 | - Click on `install`.
80 | 2) Pointing it to `julia_wrapper.sh`
81 | - Open the VS Code Settings (press `CTRL/CMD + ,`).
82 | - Click on the tab `Remote [SSH: perlmutter.nersc.gov]`.
83 | - Search for `Julia executable`.
84 | - Insert `/pscratch/sd/t/trainXY/juliacon24-hpcworkshop/julia_wrapper.sh` - with `trainXY` replaced by your training account name - into the text field under `Julia: Executable Path`.
85 | 3) If `ALT/OPTION + J` followed by `ALT/OPTION + O` (**or** pressing `F1` and executing the `Julia: Start REPL` command) successfully spins up the integrated Julia REPL, you know that the setup is working! 🎉
86 |
87 | 5. Finally, you should open the workshop directory in VS Code.
88 | * In the VS Code terminal, run `cd $SCRATCH/juliacon24-hpcworkshop` followed by `code -r .`
89 | * Manual alternative: Click on the green button "Open Folder" (or press `CTRL/CMD + O`) and enter `/pscratch/sd/t/trainXY/juliacon24-hpcworkshop` - **with `trainXY` replaced by your training account name**.
90 |
91 | ## Help?
92 |
93 | ### Cheatsheets
94 |
95 | * [Perlmutter cheatsheet](./help/perlmutter_cheatsheet.md)
96 | * [VS Code cheatsheet](./help/vscode_cheatsheet.md)
97 |
98 | ### VS Code isn't working for me, what should I do?
99 |
100 | As a fallback, you can also try to use Jupyter under https://jupyter.nersc.gov. Just make sure to use the `JuliaCon24 HPC Workshop 1.10.4` kernel (open a notebook and select the kernel in the top right corner).
101 |
102 | ## Applying for NERSC Training Account
103 |
104 | To get the most out of the workshop, you need to apply for a NERSC training account **before the workshop (as early as possible)**! The reason for this is that everyone who applies for an account has to be checked, which can take some time (between a few minutes and a week) depending on their personal background (e.g. nationality and affiliation).
105 |
106 | **Please only apply for an account if you 1) have a workshop ticket and 2) really plan to participate in the JuliaCon 2024 workshop on Tuesday, July 9 in person!**
107 |
108 | ### Sign up for an account
109 |
110 | To apply for an account:
111 | 1. Go to https://iris.nersc.gov/train
112 | 2. Fill out the application form with your details and **use the training code that you've received by email**.
113 | 3. Iris will display your training account's login credentials **only once**. **Take a screenshot of your login credentials**; you will not be able to change or recover them after you close this tab!
114 | 4. You can already start experimenting once your account has been approved. Your training account will be available until July 14th (end of JuliaCon). Accounts get deleted afterwards, so remember to **back up your data** before July 14th.
115 |
116 | **If your institution is not listed in the drop down menu at https://iris.nersc.gov/train:** Please choose "Training Account Only - Org Not Listed", and put your organization name in the "Department" field next.
117 |
--------------------------------------------------------------------------------
/help/NERSC Education Resources.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/NERSC Education Resources.pdf
--------------------------------------------------------------------------------
/help/README.md:
--------------------------------------------------------------------------------
1 | * The files `cpunode.sh` and `gpunode.sh` can be used to get an interactive shell on a (full) CPU/GPU node on Perlmutter (e.g. `sh cpunode.sh`).
2 | * The `perlmutter_cheatsheet.md` collects a bunch of useful information and commands for Perlmutter, like example job scripts.
3 | * The folder `jupyter-kernel` is only for backup purposes and shouldn't be needed.
4 |
--------------------------------------------------------------------------------
/help/cpunode.sh:
--------------------------------------------------------------------------------
1 | # Request an entire CPU node for interactive usage (you'll end up with a shell on the compute node)
2 | # Run as: sh cpunode.sh
3 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint cpu --account=ntrain1
4 |
--------------------------------------------------------------------------------
/help/gpunode.sh:
--------------------------------------------------------------------------------
1 | # Request an entire GPU node for interactive usage (you'll end up with a shell on the compute node)
2 | # Run as: sh gpunode.sh
3 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account=ntrain1
4 |
--------------------------------------------------------------------------------
/help/jupyter-kernel/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | kernel_dir=${HOME}/.local/share/jupyter/kernels
4 | mkdir -p $kernel_dir
5 | cp -r ${SCRATCH}/juliacon24-hpcworkshop/help/jupyter-kernel/julia-tutorial $kernel_dir
6 |
--------------------------------------------------------------------------------
/help/jupyter-kernel/julia-tutorial/kernel-helper.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | module load PrgEnv-gnu
4 | module load cray-hdf5-parallel
5 | module load python
6 |
7 | module use /global/common/software/nersc/julia_hpc_24/modules/
8 | module use /global/common/software/nersc/n9/julia/modules/
9 | module load adios2 julia
10 |
11 | readarray -t ijulia_boostrap < <(julia /global/cfs/cdirs/nstaff/blaschke/julia/kernels/bootstrap.jl)
12 |
13 | echo "Check-and-install returned following output:"
14 | _IFS=$IFS
15 | IFS=$'\n'
16 | for each in ${ijulia_boostrap[*]}
17 | do
18 | echo $each
19 | done
20 | IFS=$_IFS
21 |
22 | JULIA_EXEC=$(which julia)
23 | KERNEL="${ijulia_boostrap[-1]}"
24 | export JULIA_NUM_THREADS=8
25 |
26 | echo "Connecting using JULIA_EXEC=$JULIA_EXEC and KERNEL=$KERNEL"
27 | exec $JULIA_EXEC -i --startup-file=yes --color=yes $KERNEL "$@"
28 |
--------------------------------------------------------------------------------
/help/jupyter-kernel/julia-tutorial/kernel.json:
--------------------------------------------------------------------------------
1 | {
2 | "display_name": "JuliaCon24 HPC Workshop",
3 | "argv": [
4 | "{resource_dir}/kernel-helper.sh",
5 | "{connection_file}"
6 | ],
7 | "language": "julia",
8 | "env": {},
9 | "interrupt_mode": "signal"
10 | }
11 |
--------------------------------------------------------------------------------
/help/jupyter-kernel/julia-tutorial/logo-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/jupyter-kernel/julia-tutorial/logo-32x32.png
--------------------------------------------------------------------------------
/help/jupyter-kernel/julia-tutorial/logo-64x64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/jupyter-kernel/julia-tutorial/logo-64x64.png
--------------------------------------------------------------------------------
/help/perlmutter_cheatsheet.md:
--------------------------------------------------------------------------------
1 | # Perlmutter Cheatsheet
2 |
3 | ## Managing jobs
4 |
5 | ### Submitting a job
6 | `sbatch job_script.sh`
7 |
8 | ### List your submitted jobs
9 |
10 | `sqs` or maybe even `watch -n 10 'sqs'`
11 |
12 | ### Canceling a job
13 |
14 | `scancel <job id>` where `<job id>` is the ID of the job (can be found with `sqs`, see above).
15 |
16 | ## Interactive sessions on compute nodes
17 |
18 | ### CPU
19 | ```bash
20 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint cpu --account=ntrain1
21 | ```
22 | (see the file `cpunode.sh` which you can simply run with `sh cpunode.sh`)
23 |
24 | ### GPU
25 | ```bash
26 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account=ntrain1
27 | ```
28 | (see the file `gpunode.sh` which you can simply run with `sh gpunode.sh`)
29 |
30 | ## Example job scripts
31 |
32 | ### CPU (full node)
33 | ```bash
34 | #!/bin/bash
35 | #SBATCH --time=00:05:00
36 | #SBATCH --nodes=1
37 | #SBATCH --ntasks-per-node=1
38 | #SBATCH --cpus-per-task=256
39 | #SBATCH --constraint=cpu
40 | #SBATCH --account=ntrain1
41 |
42 | # Load julia
43 | ml use /global/common/software/nersc/n9/julia/modules
44 | ml julia
45 |
46 | julia --project --threads 8 whatever.jl
47 | ```
48 |
49 | ### MPI
50 |
51 | * "tasks" in SLURM correspond to MPI ranks
52 | * **If you want more than 8 nodes, you need to specify `#SBATCH --qos=regular`.**
53 |
54 | ```bash
55 | #!/bin/bash
56 | #SBATCH --time=00:10:00
57 | #SBATCH --nodes=9
58 | #SBATCH --ntasks-per-node=1
59 | #SBATCH --constraint=cpu
60 | #SBATCH --account=ntrain1
61 | #SBATCH --qos=regular
62 |
63 | # Load julia
64 | ml use /global/common/software/nersc/n9/julia/modules
65 | ml julia
66 |
67 | mpiexecjl --project -n 9 julia mpicode.jl
68 | ```
69 | ### MPI GPU
70 |
71 | ```bash
72 | #!/bin/bash
73 | #SBATCH -A ntrain1
74 | #SBATCH -C gpu
75 | #SBATCH -q regular
76 | #SBATCH --output=slurm_gpu_mpi_multinode.out
77 | #SBATCH --time=00:05:00
78 | #SBATCH --nodes=4
79 | #SBATCH --ntasks=16
80 | #SBATCH --gpus-per-node=4
81 | #SBATCH --exclusive
82 | #SBATCH --gpu-bind=none
83 |
84 | # pin to closest NIC to GPU
85 | export MPICH_OFI_NIC_POLICY=GPU
86 |
87 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool
88 | export JULIA_CUDA_MEMORY_POOL=none
89 |
90 | ml use /global/common/software/nersc/n9/julia/modules
91 | ml julia
92 |
93 | mpiexecjl --project=../.. julia gpu_mpicode.jl
94 |
95 | ```
96 |
--------------------------------------------------------------------------------
/help/vscode_cheatsheet.md:
--------------------------------------------------------------------------------
1 | # VS Code Cheatsheet
2 |
3 | ## SSH → Perlmutter
4 |
5 | 1) In VS Code, press `F1` and run the `Remote-SSH: Open SSH Host...` command.
6 |     - If the command isn't available, make sure that the [Remote - SSH extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) is installed (it should be available out of the box).
7 | 2) Enter `trainXY@perlmutter.nersc.gov` (with `trainXY` replaced by your training account) and press enter.
8 | 3) In the popup input box, enter your password and press enter.
9 |
10 | After a second or two, you should have VS Code running on a Perlmutter login node! 🎉
11 |
12 | ## Basics
13 |
14 | * Run a command: Press `F1` or `CTRL/CMD + SHIFT + P`
15 |
16 | * Open a terminal: `` Ctrl + ` ``
17 |
18 | * Open a folder from the terminal: `code -r .`
19 |
20 | * Search for a file: `CTRL/CMD + T`
21 |
22 | * Search for a function in a file: `CTRL/CMD + R`
23 |
24 | ## Julia
25 |
26 | * Open the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + O`
27 |
28 | * Restart the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + R`
29 |
30 | * Kill the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + K`
31 |
32 | * Change the Julia environment: `ALT/OPTION + J` followed by `ALT/OPTION + E`
--------------------------------------------------------------------------------
/imgs/julia_hpc_workshop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/imgs/julia_hpc_workshop.png
--------------------------------------------------------------------------------
/julia_wrapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Making module / ml available
4 | # ------------------------------------------------------------
5 | export MODULEPATH=/global/common/software/nersc/n9/julia/modules:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/comnet/gnu/12.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/compiler/gnu/12.0:/opt/cray/pe/lmod/modulefiles/mpi/gnu/12.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-milan/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/global/common/software/nersc/pe/modulefiles_hotfixes:/opt/nersc/pe/modulefiles:/usr/share/lmod/lmod/modulefiles/Core:/opt/cray/modulefiles
6 | source /usr/share/lmod/lmod/init/profile
7 | export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-milan:craype-network-ofi:perftools-base:xpmem:PrgEnv-gnu:cpe:gpu
8 | module --initial_load restore
9 | # ------------------------------------------------------------
10 |
11 | # Load julia
12 | ml use /global/common/software/nersc/n9/julia/modules
13 | ml julia
14 |
15 | # Pass on all arguments to julia
16 | exec julia "${@}"
--------------------------------------------------------------------------------
/onboarding/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started at NERSC
2 |
3 | This is where we keep the onboarding instructions. Please also refer to the [cheat sheets](../help/).
4 |
5 | Also, if you're interested in applying for a NERSC account, please take a look at [Rebecca's slides](../help/NERSC%20Education%20Resources.pdf).
6 |
7 | ## Important: Before you go your own way
8 |
9 | We've taken some shortcuts in order to help you become productive quickly. If you're using NERSC for more than just training purposes, please consider the following:
10 |
11 | 1. We put your software environment on `$SCRATCH` -- this is a temporary place. For production software please use:
12 | - Containers: https://docs.nersc.gov/development/containers/
13 | - `/global/common/software/$YOUR_PROJECT_ID`
14 | - `$HOME` or `$CFS` for your source code
15 |
16 | 2. The [setup script](../setup.sh) configures your `.bashrc`. Please understand these changes and configure your user environment in a way that works for you. **Make sure that important job scripts and software environments don't rely on shell configuration.**
17 |
18 | 3. We put our shared code into a `shared.jl` and include this in our Julia programs. This is fine for small-scale runs (a couple dozen nodes). Ideally, though, you want to be able to precompile; for this to work, you should package your program up [as Julia packages](https://pkgdocs.julialang.org/v1/creating-packages/).
19 |
--------------------------------------------------------------------------------
/onboarding/intro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/intro.pdf
--------------------------------------------------------------------------------
/onboarding/julia_vscode_on_perlmutter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/julia_vscode_on_perlmutter.pdf
--------------------------------------------------------------------------------
/onboarding/overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/overview.pdf
--------------------------------------------------------------------------------
/parts/diffusion_2d/README.md:
--------------------------------------------------------------------------------
1 | # 2D Linear Diffusion Solver
2 |
3 | In this part, we introduce the Diffusion 2D example we will use throughout the workshop to exemplify various HPC concepts in Julia, namely:
4 | - [Multithreading](./../multithreading/)
5 | - [Distributed computing](./../mpi/)
6 | - [GPU acceleration](./../gpu/)
7 |
8 | The script [`diffusion_2d.jl`](./diffusion_2d.jl) provides the starting point: a vectorised 2D linear diffusion solver computing the diffusive fluxes and their divergence in the `compute_flux!` and `diffusion_step!` functions, respectively.
9 |
10 | The follow-up script, [`diffusion_2d_loop.jl`](./diffusion_2d_loop.jl), implements a serial loop version of the previous script that we will use as the starting point for all our further experiments.
11 |
12 | ## Warm-up Task - running 2D diffusion
13 |
14 | Your very first task is to get familiar with the script structure and the generated output. Run the [`diffusion_2d.jl`](diffusion_2d.jl) script, verify that plotting works, and assess the reported effective memory throughput `T_eff` (in the REPL).
15 |
16 | Repeat the same for the [`diffusion_2d_loop.jl`](diffusion_2d_loop.jl) script.
17 |
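If you work from the Julia REPL (e.g. the one started by the Julia VS Code extension), a minimal sketch of the warm-up looks like this; `do_visualize` and `do_run` are the toggles the scripts check via `@isdefined`:

```julia
# from a Julia REPL started in parts/diffusion_2d with the workshop environment active
do_visualize = true              # enable plotting (checked via @isdefined in the scripts)
include("diffusion_2d.jl")       # vectorized version; reports Time and T_eff in the REPL
include("diffusion_2d_loop.jl")  # serial loop version; compare the reported T_eff
```
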
--------------------------------------------------------------------------------
/parts/diffusion_2d/diffusion_2d.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introducing the example: Diffusion 2D"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Overview"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "* **The brief physics intro**\n",
22 | " * 2D explicit diffusion using the finite-difference method\n",
23 | "\n",
24 | "* **The code structure overview**\n",
25 | " * Compute and \"main\" functions\n",
26 | " * [`shared.jl`](./../shared.jl) (included) script\n",
27 | "\n",
28 | "* **The output**\n",
29 | " * Visualisation\n",
30 | " * `Time` and `T_eff` - Performance reporting in the REPL\n",
31 | "\n",
32 | "* **The serial loop version**\n",
33 | " * Macros, \"race\" conditions\n",
34 | " * Moving to a single compute function"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## The brief physics intro"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "During this workshop, we will use a 2D diffusion solver to investigate how Julia allows us to tackle various HPC concepts in a concise fashion, without trading performance.\n",
49 | "\n",
50 | "We will solve the 2D diffusion equation for a quantity $C$,\n",
51 | "$$\n",
52 | "\\frac{∂C}{∂t} = -∇ ⋅ q~,\n",
53 | "$$\n",
54 | "where $q$ represents the diffusive flux:\n",
55 | "$$\n",
56 | "q = -D \\; ∇C~,\n",
57 | "$$\n",
58 | "and where $D$ stands for the diffusion coefficient.\n",
59 | "\n",
60 | "\n",
61 | "We will solve this partial differential equation (PDE) using the finite-difference method and an explicit forward Euler time integrator on a regular staggered Cartesian grid.\n",
62 | "\n",
63 | "
\n",
64 | "\n",
65 | "The 2D domain is of size $L=10$ and the scalar linear diffusion coefficient $D=1$. We use a constant grid size `ds = L / ns`, where `ns` represent the number of finite-difference cells in both $x$ and $y$ dimension.\n",
66 | "\n",
67 | "As initial condition, we define a Gaussian perturbation centred in the middle of the domain of amplitude and standard deviation equal to 1.\n",
68 | "\n",
69 | "
"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "## The code structure overview"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Let's have a look at the code structure. We'll first look at the [`diffusion_2d.jl`](diffusion_2d.jl) script. It contains:\n",
84 | "- 2 compute functions implementing the spatial and temporal discretisation of the PDE;\n",
85 | "\n",
86 | "- a \"main\" function to run the code;\n",
87 | "\n",
88 | "- an include statement for [`shared.jl`](./../shared.jl), mostly containing parameters and arrays initialisation, and visualisation."
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "## The output"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Graphics\n",
103 | "\n",
104 | "The visualisation renders the evolution of the distribution of the diffusing quantity $C$ throughout the simulation at frequency intervals defined by `nout = nt / 5`."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "### Timing and performance\n",
112 | "\n",
113 | "Besides plotting, the code also reports performance using wall-time and effective memory throughput as metric and prints in the REPL."
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "#### Wall time\n",
121 | "\n",
122 | "The first performance metric is wall time, or total runtime. It is computed using a simple custom tic-toc approach, making sure to skip the first 10 iterations to account for \"warm-up\". For any performance assessment, we should make sure to run the code for at least 1 second in order to ensure reliable results.\n",
123 | "\n",
124 | "#### Effective memory throughput\n",
125 | "\n",
126 | "The second metric is the effective memory throughput $T_\\mathrm{eff}$ (`T_eff` in the REPL). It defines as the **non-redundant** memory access per iteration divided by the time per iteration $t_\\mathrm{it}$ (in sec.):\n",
127 | "$$\n",
128 | "T_\\mathrm{eff} = \\frac{A_\\mathrm{eff}}{t_\\mathrm{it}}~,\n",
129 | "$$\n",
130 | "where $A_\\mathrm{eff} = n_\\mathrm{IO} ~ n_s^2 ~ s_\\mathrm{DAT} ~ 10^{-9}$ is the effective memory access (in GB).\n",
131 | "\n",
132 | "In our example, $n_\\mathrm{IO} = 2$ as we only need to read old values of $C$ and write them back to solve the diffusion PDE. $s_\\mathrm{DAT} = 8$ as we are running double precision floating point arithmetic.\n",
133 | "\n",
134 | "$T_\\mathrm{eff}$ provides an idea on how far from the performance of memory copy only memory-bounded codes are, under various assumptions. Refer to [Räss et al. (2022)](https://doi.org/10.5194/gmd-15-5757-2022) for details.\n",
135 | "\n",
136 | "We will further use this metric in the GPU computing part."
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## The serial loop version\n",
144 | "\n",
145 | "The final step to look at, before we start our deep dive, is the serial loop version of the 2D diffusion code. If we now open the [`diffusion_2d_loop.jl`](diffusion_2d_loop.jl) script aside the vectorized one ([`diffusion_2d.jl`](diffusion_2d.jl)), we can diff them \"by eye\" to see the major change being the change in the `diffusion_step!` function.\n",
146 | "\n",
147 | "In a nutshell:\n",
148 | "- we do no longer explicitly assign flux computation results to temporary variable in global memory (previously `qx` and `qy`);\n",
149 | "\n",
150 | "- we introduce a nested loop of spacial dimensions respecting a **column major order**;\n",
151 | "\n",
152 | "- we introduce a temporary second array `C2` to not read and write from the same array in order to avoid race conditions;\n",
153 | "\n",
154 | "- we use `@inbounds` upon having verified the correctness of the results to skip bound-checking."
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "**→ Hands-on** (see [README.md](README.md))"
162 | ]
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Julia 1.10.4",
168 | "language": "julia",
169 | "name": "julia-1.10"
170 | },
171 | "language_info": {
172 | "file_extension": ".jl",
173 | "mimetype": "application/julia",
174 | "name": "julia",
175 | "version": "1.10.4"
176 | }
177 | },
178 | "nbformat": 4,
179 | "nbformat_minor": 2
180 | }
181 |
--------------------------------------------------------------------------------
/parts/diffusion_2d/diffusion_2d.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - serial, vectorized
2 | using Printf
3 | using CairoMakie
4 | include(joinpath(@__DIR__, "../shared.jl"))
5 |
6 | function compute_flux!(params, qx, qy, C)
7 | (; D, ds) = params
8 | @views qx .= .-D .* diff(C[:, 2:end-1], dims=1) ./ ds
9 | @views qy .= .-D .* diff(C[2:end-1, :], dims=2) ./ ds
10 | return
11 | end
12 |
13 | function diffusion_step!(params, C, qx, qy)
14 | (; ds, dt) = params
15 | @views C[2:end-1, 2:end-1] .-= dt .* (diff(qx, dims=1) ./ ds .+ diff(qy, dims=2) ./ ds)
16 | return
17 | end
18 |
19 | function run_diffusion(; ns=64, nt=100, do_visualize=false)
20 | params = init_params(; ns, nt, do_visualize)
21 | C, qx, qy = init_arrays_with_flux(params)
22 | fig, plt = maybe_init_visualization(params, C)
23 | t_tic = 0.0
24 | # time loop
25 | for it in 1:nt
26 | # time after warmup (ignore first 10 iterations)
27 | (it == 11) && (t_tic = Base.time())
28 | # diffusion
29 | compute_flux!(params, qx, qy, C)
30 | diffusion_step!(params, C, qx, qy)
31 | # visualization
32 | maybe_update_visualization(params, fig, plt, C, it)
33 | end
34 | t_toc = (Base.time() - t_tic)
35 | print_perf(params, t_toc)
36 | return nothing
37 | end
38 |
39 | # Running things...
40 |
41 | # enable visualization by default
42 | (!@isdefined do_visualize) && (do_visualize = true)
43 | # enable execution by default
44 | (!@isdefined do_run) && (do_run = true)
45 |
46 | if do_run
47 | run_diffusion(; ns=256, nt=500, do_visualize)
48 | end
49 |
--------------------------------------------------------------------------------
/parts/diffusion_2d/diffusion_2d_loop.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - serial, loop version
2 | using Printf
3 | using CairoMakie
4 | include(joinpath(@__DIR__, "../shared.jl"))
5 |
6 | # convenience macros simply to avoid writing nested finite-difference expression
7 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end
8 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end
9 |
10 | function diffusion_step!(params, C2, C)
11 | (; ds, dt, D) = params
12 | # respect column major order
13 | for iy in 1:size(C, 2)-2
14 | for ix in 1:size(C, 1)-2
15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds +
16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds)
17 | end
18 | end
19 | return nothing
20 | end
21 |
22 | function run_diffusion(; ns=64, nt=100, do_visualize=false)
23 | params = init_params(; ns, nt, do_visualize)
24 | C, C2 = init_arrays(params)
25 | fig, plt = maybe_init_visualization(params, C)
26 | t_tic = 0.0
27 | # time loop
28 | for it in 1:nt
29 | # time after warmup (ignore first 10 iterations)
30 | (it == 11) && (t_tic = Base.time())
31 | # diffusion
32 | diffusion_step!(params, C2, C)
33 | C, C2 = C2, C # pointer swap
34 | # visualization
35 | maybe_update_visualization(params, fig, plt, C, it)
36 | end
37 | t_toc = (Base.time() - t_tic)
38 | print_perf(params, t_toc)
39 | return nothing
40 | end
41 |
42 | # Running things...
43 |
44 | # enable visualization by default
45 | (!@isdefined do_visualize) && (do_visualize = true)
46 | # enable execution by default
47 | (!@isdefined do_run) && (do_run = true)
48 |
49 | if do_run
50 | run_diffusion(; ns=256, do_visualize)
51 | end
52 |
--------------------------------------------------------------------------------
/parts/diffusion_2d/imgs/initial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/diffusion_2d/imgs/initial.png
--------------------------------------------------------------------------------
/parts/diffusion_2d/imgs/stagg_2D.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/diffusion_2d/imgs/stagg_2D.png
--------------------------------------------------------------------------------
/parts/distributed/explanation/Project.toml:
--------------------------------------------------------------------------------
1 | [deps]
2 | ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"
3 | Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
4 | DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
5 | NetworkInterfaceControllers = "6f74fd91-2978-43ad-8164-3af8c0ec0142"
6 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
7 |
--------------------------------------------------------------------------------
/parts/gpu/README.md:
--------------------------------------------------------------------------------
1 | # Diffusion 2D - GPU acceleration
2 |
3 | In this part, we want to use GPU computing and multi-GPUs (distributed memory parallelization) to accelerate our Diffusion 2D example.
4 |
5 | The starting point is the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). The file [`diffusion_2d_cuda.jl`](./diffusion_2d_cuda.jl) in this folder is a slightly modified copy of this version. Specifically, we included the GPU initialization of the arrays `C` and `C2` in the form of the function `init_arrays_gpu` and left the computational kernel (`diffusion_step_kernel!`) and the wrapper function (`diffusion_step!`) mostly unimplemented.
6 |
7 | In a second step, we will merge the CUDA and MPI codes in order to achieve a multi-GPU diffusion solver. For this task, the starting point is the [`diffusion_2d_mpi.jl`](./../mpi/diffusion_2d_mpi.jl) script. The file [`diffusion_2d_cuda_mpi.jl`](./diffusion_2d_cuda_mpi.jl) in this folder is a slightly modified copy of this version. Specifically, we included the GPU MPI initialization of the arrays `C` and `C2` in the form of the function `init_arrays_gpu_mpi` and left the `update_halo!` and `init_bufs` functions mostly unimplemented. We also did not yet implement the GPU selection from the local MPI rank.
8 |
9 | Note that there are a few code stubs (indicated by `TODO` comments) that you will implement in the tasks below.
10 |
11 | Recall that on the GPU, you need to explicitly specify the data type to be `Float64` as CUDA.jl defaults to `Float32`.
12 |
13 | ## Reminder
14 |
15 | Remember that, on Perlmutter, **you can't run GPU or MPI processes on a login node**. You have two options to work on a compute node:
16 |
17 | 1) **Interactive session**: You can try to get an interactive session on a compute node by running `sh get_gpu_compute_node_interactive.sh` (but unfortunately, we don't have a node for everyone). **If you can get one**, you can:
18 | - single GPU script: launch Julia from the interactive session and run the single GPU script. Alternatively, you can run `sh job_bench_gpu.sh`.
19 | - multi-GPU: run the GPU MPI code by `mpiexecjl --project -n 4 julia diffusion_2d_cuda_mpi.jl`. Alternatively, you can run `sh job_gpu_mpi_singlenode.sh`.
20 |
21 | 2) **Compute job**: You can always submit a job that runs the code: `sbatch job_gpu_mpi_singlenode.sh`. The output will land in `slurm_gpu_mpi_singlenode.out`. Check out the [Perlmutter cheatsheet](../../help/perlmutter_cheatsheet.md) to learn more about jobs.
22 |
23 | ## Task 1 - CUDA `diffusion_step_kernel!`
24 |
25 | ### Part A
26 |
27 | Your first task is to take the diffusion kernel from `diffusion_2d_loop.jl` and replace the nested loop over the spatial dimensions by "vectorized" CUDA indices. See the `TODO` comments inside the `diffusion_step_kernel!` function. Make sure to correctly handle the ranges where the computation should occur, given that we do not want to update the boundary cells of the `C2` array.
28 |
29 | Then you should complete the wrapper function `diffusion_step!` that we use to call the GPU kernel (which allows us to keep the same function call signature in the `run_diffusion` function). Use the appropriate CUDA launch parameters (a sketch of the typical pattern follows the code block below).
30 |
31 | Note that the number of threads and blocks used to execute the kernel is defined in `init_params_gpu` from [`shared.jl`](./../shared.jl) as:
32 | ```julia
33 | nthreads = 32, 8 # number of threads per block
34 | nblocks = cld.(ns, nthreads) # number of blocks
35 | ```
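
For orientation, the same index and launch pattern already appears in the provided [`diffusion_2d_cuda_mpi.jl`](./diffusion_2d_cuda_mpi.jl); here is a minimal sketch, with the exact range guard left for you to work out:

```julia
# inside the kernel: global "vectorized" indices from block/thread coordinates
ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x
iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
# ... guard ix, iy so that only interior cells of C2 are updated ...

# in the wrapper: launch the kernel with the parameters from init_params_gpu
@cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C)
```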
36 |
37 | **Question:**
38 | * How did you implement the appropriate range selection?
39 |
40 | ### Part B
41 |
42 | Let's make a rough performance benchmark. Run your implementation on a single Nvidia A100 GPU and compare timings/`T_eff` ("strong scaling"). Perform this comparison for five values of `ns`, for example 512, 2048, 4096, 8192 and 16384.
43 |
44 | **How to run the code?**
45 |
46 | You can either perform the rough benchmark in an interactive Julia session or use the script `job_bench_gpu.sh`.
47 |
48 | * Interactive:
49 | * Set `do_save=false` (the CUDA script uses `do_save` rather than `do_visualize`); see the sketch after this list.
50 | * Use `include("diffusion_2d_cuda.jl")` to run the code.
51 |
52 | * Script:
53 | * Either just run the script on the current node (`sh job_bench_gpu.sh`) or submit it as a job to SLURM (`sbatch job_bench_gpu.sh`). In the latter case, the output will end up in a file called `slurm_bench_gpu.out`.
54 |
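A minimal sketch of the interactive variant (assuming a Julia session on a GPU compute node with the workshop environment active):

```julia
do_save = false   # skip saving output
do_run  = false   # load the script without triggering its default run
include("diffusion_2d_cuda.jl")
for ns in (512, 2048, 4096, 8192, 16384)
    run_diffusion(; ns, do_save)   # prints wall time and T_eff for each resolution
end
```
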
55 | **Questions:**
56 | * What do you observe?
57 | * What about the performance as function of `ns`?
58 | * How does it compare to peak memory throughput of the Nvidia A100 (memcopy only)?
59 |
60 | ## Task 2 - Multi-GPUs
61 |
62 | In this second task, we will see how to combine GPUs and MPI in order to achieve distributed memory parallelization on multiple GPUs. This step is the gateway to run Julia at scale on latest GPU-accelerated supercomputers such as NERSC's Perlmutter.
63 |
64 | We will first make the required changes to the code (Part A), test our implementation (Part B) and perform a weak scaling test (Part C).
65 |
66 | ### Part A
67 |
68 | Complete the `update_halo!` and `init_bufs` functions, taking inspiration from the CPU MPI script and making sure to use the correct data type for the GPU buffers (see the `TODO`s therein).
69 |
70 | Then, in the `run_diffusion` function, we need to implement a procedure to map the GPUs from each node to MPI processes running on that same node. There are various ways to achieve this. We will here use an MPI shared memory communicator to detect all ranks on the same node:
71 | 1. We can use `MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)` from MPI.jl passing the existing communicator `comm` and the global rank `me` to retrieve the node-local communicator `comm_l`.
72 | 2. We then need to retrieve the rank from `comm_l`, which will give us the node-local rank `me_l`.
73 | 3. We can then use `me_l` to select the GPU device via `gpu_id = CUDA.device!(me_l)` (see the sketch below).
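
Put together, a minimal sketch of this selection procedure, using only the calls named above:

```julia
# node-local communicator: groups all MPI ranks that share the same node
comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)
# node-local rank of this process
me_l   = MPI.Comm_rank(comm_l)
# pick the GPU with that local index (one GPU per rank on a node)
gpu_id = CUDA.device!(me_l)
```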
74 |
75 | ### Part B
76 |
77 | We will now run the GPU MPI code on a single node, using all 4 Nvidia A100 GPUs of that node, check whether the GPU selection works, and assess the correctness of the implementation by doing an "eye test" on the plotted results.
78 |
79 | **How to run the code?**
80 |
81 | You can run the GPU MPI script using the job script `job_gpu_mpi_singlenode.sh`, submitting it as a job to SLURM (`sbatch job_gpu_mpi_singlenode.sh`). The output will end up in a file called `slurm_gpu_mpi_singlenode.out`.
82 |
83 | Then, try running the same job but this time on 4 nodes, using 1 GPU per node. You can achieve this by using the `job_gpu_mpi_multinode.sh` script we prepared for you.
84 |
85 | **How to visualize the results?**
86 |
87 | The code will save one output file per rank, with the rank ID in the filename, e.g. `out_$(me).jld2`.
88 |
89 | You can run the [`visualize_mpi.jl`](./visualize_mpi.jl) script in order to visualise the results. The visualization script defines the `vizme2D_mpi(nprocs)` function, which takes `nprocs` as argument, defaulting to `(2, 2)`, our default MPI topology (see the usage sketch below).
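
Something along these lines should produce the figure from a Julia session in this folder (a sketch only; the script may also trigger the visualization itself when included):

```julia
include("visualize_mpi.jl")   # defines vizme2D_mpi(nprocs)
vizme2D_mpi((2, 2))           # nprocs must match the MPI topology used for the run
```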
90 |
91 | **Questions:**
92 | * Do you observe correct diffusion results, for both the singlenode and multinode configurations?
93 | * Is each MPI rank accessing a different GPU from that node?
94 |
95 | ### Part C
96 |
97 | As a last step, we will perform a weak scaling test to assess the parallel efficiency of our implementation. For this, we should set the spatial resolution `ns` to the value that showed the best performance in the strong scaling experiment from Task 1, possibly adapting `nt` such that the code does not run for much longer than 1 second, and setting `do_save = false`.
98 |
99 | Then one should run the GPU MPI script on one MPI rank (thus one GPU) in order to assess the baseline performance. Once this is done, one should increase the number of MPI ranks while keeping the same local problem size, making the global problem scale linearly with the computing resources. Performance tests could be performed for 1, 4, 9, 16, (64) ranks. Parallel efficiency can be reported by normalising the $T_\mathrm{eff}$ (or the wall-time) obtained for runs with more than 1 rank by the single-rank performance (see below).
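
In other words, with the per-rank problem size held constant, the parallel efficiency on $N$ ranks can be reported as

$$
E(N) = \frac{T_\mathrm{eff}(N)}{T_\mathrm{eff}(1)} = \frac{t_\mathrm{wall}(1)}{t_\mathrm{wall}(N)}~.
$$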
100 |
101 | **Questions:**
102 | * What parallel efficiency do you observe?
103 | * If it drops, what workaround could one implement?
104 |
--------------------------------------------------------------------------------
/parts/gpu/advanced/closest_device.jl:
--------------------------------------------------------------------------------
1 | using CpuId, MPI, CUDA, Hwloc, AbstractTrees
2 |
3 | import AbstractTrees: PreOrderDFS
4 | import Hwloc: hwloc_pci_class_string
5 |
6 | import Base: filter, Fix1
7 | filter(f::Function)::Function = Fix1(filter, f)
8 |
9 | const cpucycle_mask = (
10 | (1 << (64 - leading_zeros(CpuId.cputhreads()))) - 1
11 | ) % UInt32
12 |
13 | cpucycle_coreid() = Int(cpucycle_id()[2] & cpucycle_mask)
14 |
15 | function get_device_attributes()
16 | attr = Dict{Tuple{Int32, Int32}, Int32}()
17 | for i in 0:(ndevices()-1)
18 | d = CuDevice(i)
19 | attr[(
20 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID),
21 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)
22 | )] = d
23 | end
24 | attr
25 | end
26 |
27 | function tag_subtree!(tree_node, val)
28 | for n in collect(AbstractTrees.PreOrderDFS(tree_node))
29 | n.tag = val
30 | end
31 | end
32 |
33 | function distance_to_core!(node, target_index)
34 | # shield re-entrance when iterating
35 | node.tag = 1
36 |
37 | if node.type == :PU
38 | # println("Checking: $(nodevalue(node).os_index)")
39 | if nodevalue(node).os_index == target_index
40 | return true, 0
41 | end
42 | end
43 |
44 | for child in node.children
45 | if child.tag == 1
46 | continue
47 | end
48 |
49 | found, dist = distance_to_core!(child, target_index)
50 | if found
51 | return true, dist + 1
52 | end
53 | end
54 |
55 | if node.parent != nothing
56 | found, dist = distance_to_core!(node.parent, target_index)
57 | if found
58 | return true, dist + 1
59 | end
60 | end
61 |
62 | return false, typemax(Int)
63 | end
64 |
65 | function distance_to_core(root, node, target_index)
66 | tag_subtree!(root, 0)
67 | found, dist = distance_to_core!(node, target_index)
68 | tag_subtree!(root, 0)
69 | return found, dist
70 | end
71 |
72 | sys_devs = children(gettopology())
73 | pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device)
74 | gpu_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == "3D")
75 |
76 | function get_device_distances(core)
77 | attr = get_device_attributes()
78 | dist = Dict{Int32, Int32}()
79 | dev = Dict{Int32, Int32}()
80 | for d in gpu_devs
81 | idx = attr[(nodevalue(d).attr.bus, nodevalue(d).attr.dev)]
82 | found, dev_d = distance_to_core(sys_devs, d, core)
83 | if found
84 | dist[idx] = dev_d
85 | dev[dev_d] = idx
86 | end
87 | end
88 | dist, dev
89 | end
90 |
91 | dist, dev = get_device_distances(cpucycle_coreid())
92 | closest_dev = dev[dev |> keys |> minimum]
93 | println(closest_dev)
94 |
--------------------------------------------------------------------------------
/parts/gpu/advanced/job_gpu_mpi_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=4
4 | #SBATCH --ntasks-per-node=4
5 | #SBATCH --constraint=gpu
6 | #SBATCH --account=ntrain1
7 | #SBATCH --output=slurm_gpu_mpi_multinode.out
8 | #SBATCH --qos=regular
9 |
10 | # pin to closest NIC to GPU
11 | export MPICH_OFI_NIC_POLICY=GPU
12 |
13 | # Load julia
14 | ml use /global/common/software/nersc/n9/julia/modules
15 | ml julia
16 |
17 | mpiexecjl -G 16 -c 32 --project julia closest_device.jl
18 |
--------------------------------------------------------------------------------
/parts/gpu/advanced/job_gpu_mpi_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=4
5 | #SBATCH --constraint=gpu
6 | #SBATCH --account=ntrain1
7 | #SBATCH --output=slurm_gpu_mpi_singlenode.out
8 | #SBATCH --qos=regular
9 |
10 | # pin to closest NIC to GPU
11 | export MPICH_OFI_NIC_POLICY=GPU
12 |
13 | # Load julia
14 | ml use /global/common/software/nersc/n9/julia/modules
15 | ml julia
16 |
17 | mpiexecjl -G 4 -c 32 --project julia closest_device.jl
18 |
--------------------------------------------------------------------------------
/parts/gpu/diffusion_2d_cuda.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - GPU cuda version
2 | using Printf
3 | using JLD2
4 | using CUDA
5 | include(joinpath(@__DIR__, "../shared.jl"))
6 |
7 | # convenience macros simply to avoid writing nested finite-difference expression
8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(ds))) end
9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(ds))) end
10 |
11 | function diffusion_step_kernel!(params, C2, C)
12 | (; ds, dt, D) = params
13 | #
14 | # !! TODO !!
15 | #
16 | # We want to replace the nested loop over spatial dimensions by "vectorized" CUDA indices.
17 | # Based on the serial kernel (see README.md or diffusion_2d_loop.jl), implement
18 | # the CUDA variant using CUDA.jl, taking care to handle the range in an appropriate
19 | # manner (see "TODO..." below).
20 | #
21 | ix = # TODO # CUDA vectorised unique index
22 | iy = # TODO # CUDA vectorised unique index
23 | if # TODO select correct range
24 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(ds) +
25 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(ds))
26 | end
27 | return nothing
28 | end
29 |
30 | function diffusion_step!(params, C2, C)
31 | (; nthreads, nblocks) = params
32 | #
33 | # !! TODO !!
34 | #
35 | # Complete the CPU wrapper function calling the `diffusion_step_kernel!`
36 | # using the `@cuda` macro and appropriate launch parameters (see "TODO..." below).
37 | #
38 | @cuda # TODO
39 | return nothing
40 | end
41 |
42 | function run_diffusion(; ns=64, nt=100, do_save=false)
43 | params = init_params_gpu(; ns, nt, do_save)
44 | C, C2 = init_arrays_gpu(params)
45 | t_tic = 0.0
46 | # Time loop
47 | for it in 1:nt
48 | # time after warmup (ignore first 10 iterations)
49 | (it == 11) && (t_tic = Base.time())
50 | # diffusion
51 | diffusion_step!(params, C2, C)
52 | C, C2 = C2, C # pointer swap
53 | end
54 | # synchronize the gpu before querying the final time
55 | # TODO # Add synchronization
56 | t_toc = (Base.time() - t_tic)
57 | print_perf(params, t_toc)
58 | do_save && jldsave(joinpath(@__DIR__, "out_gpu.jld2"); C = Array(C), l = params.L)
59 | return nothing
60 | end
61 |
62 | # Running things...
63 |
64 | # enable saving to disk by default
65 | (!@isdefined do_save) && (do_save = true)
66 | # enable execution by default
67 | (!@isdefined do_run) && (do_run = true)
68 |
69 | if do_run
70 | if !isempty(ARGS)
71 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save)
72 | else
73 | run_diffusion(; ns=256, do_save)
74 | end
75 | end
76 |
--------------------------------------------------------------------------------
/parts/gpu/diffusion_2d_cuda_mpi.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - GPU MPI
2 | using Printf
3 | using JLD2
4 | using CUDA
5 | using MPI
6 | include(joinpath(@__DIR__, "../shared.jl"))
7 |
8 | # convenience macros simply to avoid writing nested finite-difference expression
9 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(dx))) end
10 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(dy))) end
11 |
12 | function diffusion_step_kernel!(params, C2, C)
13 | (; dx, dy, dt, D) = params
14 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index
15 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index
16 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2
17 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(dx) +
18 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(dy))
19 | end
20 | return nothing
21 | end
22 |
23 | function diffusion_step!(params, C2, C)
24 | (; nthreads, nblocks) = params
25 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C)
26 | return nothing
27 | end
28 |
29 | # MPI functions
30 | @views function update_halo!(A, bufs, neighbors, comm)
31 | #
32 | # !! TODO !!
33 | #
34 | # We want to take the `update_halo!` function defined in the CPU MPI script
35 | # and use it here. Since we are using GPU-aware MPI, we can directly re-use the
36 | # function since MPI communication will take care of exchanging halo values living
37 | # in GPU memory.
38 | #
39 | return
40 | end
41 |
42 | function init_bufs(A)
43 | #
44 | # !! TODO !!
45 | #
46 | # We are using GPU-aware MPI, which greatly simplifies the implementation and ensures
47 | # good performance. GPU-aware MPI exchanges GPU pointers and thus we should initialize
48 | # the send and receive buffers on the GPU memory. Complete the missing `return` statement
49 | # by replicating what we did for CPU MPI but making sure to initialise buffers on the GPU
50 | # using the correct data type (Float64).
51 | #
52 | return (; # TODO )
53 | end
54 |
55 | function run_diffusion(; ns=64, nt=100, do_save=false)
56 | MPI.Init()
57 | comm = MPI.COMM_WORLD
58 | nprocs = MPI.Comm_size(comm)
59 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple
60 | comm_cart = MPI.Cart_create(comm, dims)
61 | me = MPI.Comm_rank(comm_cart)
62 | coords = MPI.Cart_coords(comm_cart) |> Tuple
63 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1))
64 | # select GPU on multi-GPU system based on shared memory topology
65 | #
66 | # !! TODO !!
67 | #
68 | # We need to define a local MPI communicator based on `MPI.COMM_TYPE_SHARED` in order to
69 | # retrieve the node-local rank of the MPI processes, given we want to map each GPU of a
70 | # node to an MPI rank. Then we want to get the rank from the new communicator and use
71 | # it to set the GPU device.
72 | #
73 | println("$(gpu_id), out of: $(ndevices())")
74 | (me == 0) && println("nprocs = $(nprocs), dims = $dims")
75 |
76 | params = init_params_gpu_mpi(; dims, coords, ns, nt, do_save)
77 | C, C2 = init_arrays_gpu_mpi(params)
78 | bufs = init_bufs(C)
79 | t_tic = 0.0
80 | # Time loop
81 | for it in 1:nt
82 | # time after warmup (ignore first 10 iterations)
83 | (it == 11) && (t_tic = Base.time())
84 | # diffusion
85 | diffusion_step!(params, C2, C)
86 | update_halo!(C2, bufs, neighbors, comm_cart)
87 | C, C2 = C2, C # pointer swap
88 | end
89 | # synchronize the gpu before querying the final time
90 | CUDA.synchronize()
91 | t_toc = (Base.time() - t_tic)
92 | # "master" prints performance
93 | (me == 0) && print_perf(params, t_toc)
94 | # save to (maybe) visualize later
95 | if do_save
96 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L))
97 | end
98 | MPI.Finalize()
99 | return
100 | end
101 |
102 | # Running things...
103 |
104 | # enable save to disk by default
105 | (!@isdefined do_save) && (do_save = true)
106 | # enable execution by default
107 | (!@isdefined do_run) && (do_run = true)
108 |
109 | if do_run
110 | run_diffusion(; ns=256, do_save)
111 | end
112 |
--------------------------------------------------------------------------------
/parts/gpu/get_gpu_compute_node_interactive.sh:
--------------------------------------------------------------------------------
1 | salloc --nodes 1 --qos interactive --time 00:45:00 --constraint gpu --account=ntrain1
2 |
--------------------------------------------------------------------------------
/parts/gpu/gpu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# GPU acceleration"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Overview"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "* **Why to bother with GPU computing in 2024**\n",
22 | " * HPC and Supercomputing is GPU-accelerated\n",
23 | " * When Julia overcomes the two-language barrier\n",
24 | "\n",
25 | "* **GPU computing Fast-Forward**\n",
26 | " * Array vs Kernel programming\n",
27 | " * Performance considerations\n",
28 | "\n",
29 | "* **Going multi-GPUs**\n",
30 | " * MPI + GPUs"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "### Why to still bother with GPU computing in 2024\n",
38 | "- It's around for more than a decade\n",
39 | "- It shows massive performance gain compared to serial CPU computing\n",
40 | "- First exascale supercomputer, Frontier, is full of GPUs\n",
41 | "\n",
42 | "
\n",
43 | "\n",
44 | "### Performance that matters\n",
45 | "\n",
46 | "
\n",
47 | "\n",
48 | "Taking a look at a recent GPU and CPU:\n",
49 | "- Nvidia Tesla A100 GPU\n",
50 | "- AMD EPYC \"Rome\" 7282 (16 cores) CPU\n",
51 | "\n",
52 | "| Device | TFLOP/s (FP64) | Memory BW TB/s | Imbalance (FP64) |\n",
53 | "| :------------: | :------------: | :------------: | :------------------: |\n",
54 | "| Tesla A100 | 9.7 | 1.55 | 9.7 / 1.55 × 8 = 50 |\n",
55 | "| AMD EPYC 7282 | 0.7 | 0.085 | 0.7 / 0.085 × 8 = 66 |\n",
56 | "\n",
57 | "**Meaning:** we can do about 50 floating point operations per number accessed from main memory.\n",
58 | "Floating point operations are \"for free\" when we work in memory-bounded regimes.\n",
59 | "\n",
60 | "👉 Requires re-thinking the numerical implementation and solution strategies\n",
61 | "\n",
62 | "Unfortunately, the cost of evaluating a first derivative $∂A / ∂x$ in, e.g., diffusive flux calculations using finite-differences:\n",
63 | "\n",
64 | "`q[ix] = -D * (A[ix+1] - A[ix]) / dx`\n",
65 | "\n",
66 | "consists of:\n",
67 | "- 1 read (`A`) + 1 write (`q`) => $2 × 8$ = **16 Bytes transferred**\n",
68 | "- 1 addition + 1 multiplication + 1 division => **3 floating point operations**\n",
69 | "\n",
70 | "👉 assuming `D`, `dx` are scalars, `q` and `A` are arrays of `Float64` (read from main memory)\n",
71 | "\n",
72 | "### Performance that matters - an example\n",
73 | "Not yet convinced? Let's have a look at an example.\n",
74 | "\n",
75 | "Let's assess how close from memory copy (1400 GB/s) we can get solving a 2D diffusion problem on an Nvidia Tesla A100 GPU.\n",
76 | "\n",
77 | "$$ \\frac{\\partial C}{\\partial t} = \\frac{\\partial^2 C}{\\partial x^2} + \\frac{\\partial^2 C}{\\partial y^2} $$\n",
78 | "\n",
79 | "👉 Let's test the performance using a simple script."
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "### Measuring GPU performance\n",
87 | "\n",
88 | "Load modules:"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "using CUDA\n",
98 | "using BenchmarkTools\n",
99 | "using Printf"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "Memory copy function to measure the \"peak\" memory throughput:"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 4,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "data": {
116 | "text/plain": [
117 | "mycopy! (generic function with 1 method)"
118 | ]
119 | },
120 | "metadata": {},
121 | "output_type": "display_data"
122 | }
123 | ],
124 | "source": [
125 | "function mycopy!(A, B)\n",
126 | " ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x\n",
127 | " iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y\n",
128 | " if ix <= size(A, 1) && iy <= size(A, 2)\n",
129 | " @inbounds A[ix, iy] = B[ix, iy] + 1\n",
130 | " end\n",
131 | " return\n",
132 | "end"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "Laplacian kernel using the finite difference method (FDM):"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 5,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "laplacian! (generic function with 1 method)"
151 | ]
152 | },
153 | "metadata": {},
154 | "output_type": "display_data"
155 | }
156 | ],
157 | "source": [
158 | "function laplacian!(A, B, dt, _dx2, _dy2)\n",
159 | " ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x\n",
160 | " iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y\n",
161 | " if ix <= size(A, 1) - 2 && iy <= size(A, 2) - 2\n",
162 | " @inbounds A[ix+1, iy+1] = B[ix+1, iy+1] + dt *\n",
163 | " ((B[ix+2, iy+1] - 2 * B[ix+1, iy+1] + B[ix, iy+1]) * _dx2 +\n",
164 | " (B[ix+1, iy+2] - 2 * B[ix+1, iy+1] + B[ix+1, iy]) * _dy2)\n",
165 | " end\n",
166 | " return\n",
167 | "end"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "Let's test the performance!"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 8,
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "Effective memory throughput (copy) : 1335.85 GB/s\n",
187 | "Effective memory throughput (laplacian) : 1303.32 GB/s\n",
188 | "Theoretical peak memory throughput : 1555.20 GB/s\n",
189 | "\n",
190 | "Wow 🚀! Laplacian runs at:\n",
191 | " 97.56% of copy speed\n",
192 | " 83.80% of peak memory bandwidth\n",
193 | "on a NVIDIA A100-SXM4-40GB device\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "# if the array size is too small, the GPU will not be fully utilized\n",
199 | "nx = ny = 512 * 32\n",
200 | "A = CUDA.rand(Float64, nx, ny)\n",
201 | "B = CUDA.rand(Float64, nx, ny)\n",
202 | "\n",
203 | "_dx2 = _dy2 = dt = rand()\n",
204 | "\n",
205 | "# launch configuration\n",
206 | "nthreads = (16, 16)\n",
207 | "nblocks = cld.((nx, ny), nthreads)\n",
208 | "\n",
209 | "# measure the execution times\n",
210 | "time_copy = @belapsed CUDA.@sync @cuda threads=nthreads blocks=nblocks mycopy!(A, B)\n",
211 | "time_lapl = @belapsed CUDA.@sync @cuda threads=nthreads blocks=nblocks laplacian!(A, B, dt, _dx2, _dy2)\n",
212 | "\n",
213 | "# effective memory throughput (1 read + 1 write per element)\n",
214 | "Teff_copy = 2 * nx * ny * sizeof(Float64) / time_copy / 1e9\n",
215 | "Teff_lapl = 2 * nx * ny * sizeof(Float64) / time_lapl / 1e9\n",
216 | "\n",
217 | "# compute theoretical peak memory bandwidth\n",
218 | "dev = CUDA.device()\n",
219 | "\n",
220 | "bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) |> Float64 # in bits\n",
221 | "clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) |> Float64 # in kHz\n",
222 | "rate_multiplier = 2 # 2 for HBM2/DDR, 4 for HBM3/GDDR5, 8 for GDDR6\n",
223 | "\n",
224 | "Teff_peak = bus_width * clock_rate * rate_multiplier / 1e6 / 8\n",
225 | "\n",
226 | "# report results\n",
227 | "@printf(\"Effective memory throughput (copy) : %.2f GB/s\\n\", Teff_copy)\n",
228 | "@printf(\"Effective memory throughput (laplacian) : %.2f GB/s\\n\", Teff_lapl)\n",
229 | "@printf(\"Theoretical peak memory throughput : %.2f GB/s\\n\", Teff_peak)\n",
230 | "\n",
231 | "@printf(\"\\nWow 🚀! Laplacian runs at:\\n\")\n",
232 | "@printf(\" %.2f%% of copy speed\\n\" , 100 * Teff_lapl / Teff_copy)\n",
233 | "@printf(\" %.2f%% of peak memory bandwidth\\n\", 100 * Teff_lapl / Teff_peak)\n",
234 | "@printf(\"on a %s device\\n\", CUDA.name(dev))"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "### GPU kernel programming\n",
242 | "\n",
243 | "We'll get started with a brief overview of the Nvidia GPU architecture and how to program it.\n",
244 | "\n",
245 | "The Nvidia general purpose GPUs can be programmed using the CUDA language extension. CUDA is accessible in Julia via [CUDA.jl](https://cuda.juliagpu.org/stable/), which exposes most of the native CUDA features to the Julia ecosystem.\n",
246 | "\n",
247 | "In the CUDA programming model, `blocks` of `threads` compose the `grid`. In our implementation, we want to map one thread to each finite-difference cell of the 2D Cartesian domain.\n",
248 | "\n",
249 | "The figure hereafter depicts the relation between the CUDA domain and the finite-difference domain:\n",
250 | "\n",
251 | "
\n",
252 | "\n",
253 | "**Playing with GPUs: the rules**\n",
254 | "\n",
255 | "- Current GPUs allow typically a maximum of 1024 threads per block.\n",
256 | "\n",
257 | "- The maximum number of blocks allowed is huge; computing the largest possible array on the GPU will make you run out of device memory (currently 16-80 GB) before hitting the maximal number of blocks when selecting sensible kernel launch parameters (usually threads per block >= 128).\n",
258 | "\n",
259 | "- Threads, blocks and grid have 3D \"Cartesian\" topology, which is very useful for 1D, 2D and 3D Cartesian finite-difference domains."
260 | ]
261 | },
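{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to check these limits on the device you actually got (a minimal sketch; the attribute names are the CUDA driver enums exposed by CUDA.jl, used in the same way as `CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH` above):\n",
"\n",
"```julia\n",
"using CUDA\n",
"dev = CUDA.device()\n",
"max_threads = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)\n",
"max_grid_x  = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)\n",
"mem_gib     = CUDA.totalmem(dev) / 2^30\n",
"println(\"threads/block <= $max_threads, grid dim x <= $max_grid_x, memory = $(round(mem_gib; digits=1)) GiB\")\n",
"```"
]
},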
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "### Multi-GPU\n",
267 | "\n",
268 | "#### GPU - MPI ranks mapping\n",
269 | "The challenging part is to run on multiple GPUs using MPI. To achieve this, we need to map node-local MPI ranks to GPU IDs.\n",
270 | "\n",
271 | "This can be achieved in Julia using MPI.jl and CUDA.jl by\n",
272 | "```julia\n",
273 | "comm = MPI.COMM_WORLD\n",
274 | "rank = MPI.Comm_rank(comm)\n",
275 | "comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, rank)\n",
276 | "rank_l = MPI.Comm_rank(comm_l)\n",
277 | "gpu_id = CUDA.device!(rank_l)\n",
278 | "```\n",
279 | "\n",
280 | "#### GPU-aware MPI\n",
281 | "\n",
282 | "On modern supercomputers, one has access to GPU-aware MPI. GPU aware-MPI allows to directly exchange GPU memory by-passing an explicit host copy.\n",
283 | "\n",
284 | "The file [`multigpu.jl`](./multigpu.jl) implements this and would check that GPU-aware MPI works:"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 2,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "name": "stderr",
294 | "output_type": "stream",
295 | "text": [
296 | "srun: Job 27855189 step creation temporarily disabled, retrying (Requested nodes are busy)\n",
297 | "srun: Step created for StepId=27855189.1\n"
298 | ]
299 | },
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "rank=3 rank_loc=3 (gpu_id=CuDevice(3)), size=4, dst=0, src=2\n",
305 | "rank=0 rank_loc=0 (gpu_id=CuDevice(0)), size=4, dst=1, src=3\n",
306 | "rank=1 rank_loc=1 (gpu_id=CuDevice(1)), size=4, dst=2, src=0\n",
307 | "rank=2 rank_loc=2 (gpu_id=CuDevice(2)), size=4, dst=3, src=1\n",
308 | "start sending...\n",
309 | "recv_mesg on proc 3: [2.0, 2.0, 2.0, 2.0]\n",
310 | "recv_mesg on proc 0: [3.0, 3.0, 3.0, 3.0]\n",
311 | "done.\n",
312 | "recv_mesg on proc 2: [1.0, 1.0, 1.0, 1.0]\n",
313 | "recv_mesg on proc 1: [0.0, 0.0, 0.0, 0.0]\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "run_cmd = `mpiexecjl -n 4 -G 4 --nodes 1 --qos regular --constraint gpu --gpus 4 --account=ntrain1 --project julia multigpu.jl`\n",
319 | "run(run_cmd);"
320 | ]
321 | }
322 | ],
323 | "metadata": {
324 | "kernelspec": {
325 | "display_name": "Julia 1.10.4",
326 | "language": "julia",
327 | "name": "julia-1.10"
328 | },
329 | "language_info": {
330 | "file_extension": ".jl",
331 | "mimetype": "application/julia",
332 | "name": "julia",
333 | "version": "1.10.4"
334 | }
335 | },
336 | "nbformat": 4,
337 | "nbformat_minor": 2
338 | }
339 |
--------------------------------------------------------------------------------
/parts/gpu/imgs/cpu_gpu_evo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/cpu_gpu_evo.png
--------------------------------------------------------------------------------
/parts/gpu/imgs/cuda_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/cuda_grid.png
--------------------------------------------------------------------------------
/parts/gpu/imgs/frontier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/frontier.png
--------------------------------------------------------------------------------
/parts/gpu/job_bench_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=128
6 | #SBATCH --gpus-per-task=1
7 | #SBATCH --constraint=gpu
8 | #SBATCH --account=ntrain1
9 | #SBATCH --output=slurm_bench_gpu.out
10 | #SBATCH --qos=regular
11 |
12 | # Load julia
13 | ml use /global/common/software/nersc/n9/julia/modules
14 | ml julia
15 |
16 | for i in 512 2048 4096 8192 16384
17 | do
18 | echo -e "\n\n#### GPU run $i"
19 |
20 | julia --project -e 'do_save=false; include("diffusion_2d_cuda.jl")' $i
21 | done
22 |
--------------------------------------------------------------------------------
/parts/gpu/job_gpu_mpi_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C gpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_gpu_mpi_multinode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=4
9 | #SBATCH --ntasks=16
10 | #SBATCH --gpus-per-node=4
11 | #SBATCH --exclusive
12 | #SBATCH --gpu-bind=none
13 |
14 | # pin to closest NIC to GPU
15 | export MPICH_OFI_NIC_POLICY=GPU
16 |
17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool
18 | export JULIA_CUDA_MEMORY_POOL=none
19 |
20 | ml use /global/common/software/nersc/n9/julia/modules
21 | ml julia
22 |
23 | mpiexecjl --project=../.. julia diffusion_2d_cuda_mpi.jl
24 |
--------------------------------------------------------------------------------
/parts/gpu/job_gpu_mpi_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C gpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_gpu_mpi_singlenode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=1
9 | #SBATCH --ntasks=4
10 | #SBATCH --gpus-per-node=4
11 | #SBATCH --exclusive
12 | #SBATCH --gpu-bind=none
13 |
14 | # pin to closest NIC to GPU
15 | export MPICH_OFI_NIC_POLICY=GPU
16 |
17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool
18 | export JULIA_CUDA_MEMORY_POOL=none
19 |
20 | ml use /global/common/software/nersc/n9/julia/modules
21 | ml julia
22 |
23 | mpiexecjl --project=../.. julia diffusion_2d_cuda_mpi.jl
24 |
--------------------------------------------------------------------------------
/parts/gpu/multigpu.jl:
--------------------------------------------------------------------------------
1 | using MPI
2 | using CUDA
3 | MPI.Init()
4 | comm = MPI.COMM_WORLD
5 | rank = MPI.Comm_rank(comm)
6 | # select device
7 | comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, rank)
8 | rank_l = MPI.Comm_rank(comm_l)
9 | gpu_id = CUDA.device!(rank_l)
10 | # ring neighbors: destination and source ranks for the message exchange
11 | size = MPI.Comm_size(comm)
12 | dst = mod(rank+1, size)
13 | src = mod(rank-1, size)
14 | println("rank=$rank rank_loc=$rank_l (gpu_id=$gpu_id), size=$size, dst=$dst, src=$src")
15 | N = 4
16 | send_mesg = CuArray{Float64}(undef, N)
17 | recv_mesg = CuArray{Float64}(undef, N)
18 | fill!(send_mesg, Float64(rank))
19 | CUDA.synchronize()
20 | rank==0 && println("start sending...")
21 | MPI.Sendrecv!(send_mesg, dst, 0, recv_mesg, src, 0, comm)
22 | println("recv_mesg on proc $rank_l: $recv_mesg")
23 | rank==0 && println("done.")
24 |
--------------------------------------------------------------------------------
/parts/gpu/slurm/hello.jl:
--------------------------------------------------------------------------------
1 | using MPI, CUDA, Libdl
2 |
3 | #_______________________________________________________________________________
4 | # Get MPI version string from libmpi.so
5 | #
6 |
7 | function get_mpi_version_string()
8 | buf_size = 8192 # HACK: this should be enough space
9 | buf = Array{UInt8}(undef, buf_size)
10 | buflen = Ref{Cint}()
11 |
12 | hndl = Libdl.dlopen(MPI.libmpi, Libdl.RTLD_LAZY | Libdl.RTLD_GLOBAL)
13 |
14 | try
15 | ptr = Libdl.dlsym(hndl, :MPI_Get_library_version)
16 | ccall(ptr, Cint, (Ptr{UInt8}, Ref{Cint}), buf, buflen)
17 | finally
18 | Libdl.dlclose(hndl)
19 | end
20 |
21 | @assert buflen[] < buf_size
22 | resize!(buf, buflen[])
23 | return String(buf)
24 | end
25 |
26 | #-------------------------------------------------------------------------------
27 |
28 |
29 | #_______________________________________________________________________________
30 | # Get information on which Device and Bus a GPU is connected to:
31 | #
32 |
33 | function get_device_attributes()
34 | attr = Dict{Tuple{Int32, Int32}, Int32}()
35 | for i in 0:(ndevices()-1)
36 | d = CuDevice(i)
37 | attr[(
38 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID),
39 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)
40 | )] = d
41 | end
42 | attr
43 | end
44 |
45 | #-------------------------------------------------------------------------------
46 |
47 |
48 | MPI.Init()
49 | MPI.ThreadLevel(2)
50 |
51 | comm = MPI.COMM_WORLD
52 | rank = MPI.Comm_rank(comm)
53 | size = MPI.Comm_size(comm)
54 | name = gethostname()
55 |
56 | devices = get_device_attributes()
57 |
58 | # get the MPI version string and print it. This will be the same for every
59 | # rank, so do this only on Rank 0
60 | if rank == 0
61 | version_string = get_mpi_version_string()
62 | println("MPI Version: $(version_string)")
63 | end
64 |
65 | println(
66 | "Hello world, I am rank $(rank) of $(size) on $(name). " *
67 | "I have $(ndevices()) GPUs with properties: $(devices)"
68 | )
69 |
70 | MPI.Barrier(comm)
71 |
--------------------------------------------------------------------------------
/parts/gpu/slurm/job_hello_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=4
4 | #SBATCH --ntasks-per-node=4
5 | #SBATCH --gpus-per-task=1
6 | #SBATCH --constraint=gpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_hello_multinode.out
9 | #SBATCH --qos=regular
10 |
11 | # pin to closest NIC to GPU
12 | export MPICH_OFI_NIC_POLICY=GPU
13 |
14 | # Load julia
15 | ml use /global/common/software/nersc/n9/julia/modules
16 | ml julia
17 |
18 | mpiexecjl --project julia hello.jl
19 |
--------------------------------------------------------------------------------
/parts/gpu/slurm/job_hello_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=4
5 | #SBATCH --gpus-per-task=1
6 | #SBATCH --constraint=gpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_hello_singlenode.out
9 | #SBATCH --qos=regular
10 |
11 | # pin to closest NIC to GPU
12 | export MPICH_OFI_NIC_POLICY=GPU
13 |
14 | # Load julia
15 | ml use /global/common/software/nersc/n9/julia/modules
16 | ml julia
17 |
18 | mpiexecjl --project julia hello.jl
19 |
--------------------------------------------------------------------------------
/parts/gpu/solution/diffusion_2d_cuda.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - GPU cuda version
2 | using Printf
3 | using JLD2
4 | using CUDA
5 | include(joinpath(@__DIR__, "../../shared.jl"))
6 |
7 | # convenience macros simply to avoid writing nested finite-difference expression
8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(ds))) end
9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(ds))) end
10 |
11 | function diffusion_step_kernel!(params, C2, C)
12 | (; ds, dt, D) = params
13 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index
14 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index
15 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2
16 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(ds) +
17 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(ds))
18 | end
19 | return nothing
20 | end
21 |
22 | function diffusion_step!(params, C2, C)
23 | (; nthreads, nblocks) = params
24 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C)
25 | return nothing
26 | end
27 |
28 | function run_diffusion(; ns=64, nt=100, do_save=false)
29 | params = init_params_gpu(; ns, nt, do_save)
30 | C, C2 = init_arrays_gpu(params)
31 | t_tic = 0.0
32 | # Time loop
33 | for it in 1:nt
34 | # time after warmup (ignore first 10 iterations)
35 | (it == 11) && (t_tic = Base.time())
36 | # diffusion
37 | diffusion_step!(params, C2, C)
38 | C, C2 = C2, C # pointer swap
39 | end
40 | # synchronize the gpu before querying the final time
41 | CUDA.synchronize()
42 | t_toc = (Base.time() - t_tic)
43 | print_perf(params, t_toc)
44 | do_save && jldsave(joinpath(@__DIR__, "out_gpu.jld2"); C = Array(C), l = params.L)
45 | return nothing
46 | end
47 |
48 | # Running things...
49 |
50 | # enable saving by default
51 | (!@isdefined do_save) && (do_save = true)
52 | # enable execution by default
53 | (!@isdefined do_run) && (do_run = true)
54 |
55 | if do_run
56 | if !isempty(ARGS)
57 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save)
58 | else
59 | run_diffusion(; ns=256, do_save)
60 | end
61 | end
62 |
--------------------------------------------------------------------------------
/parts/gpu/solution/diffusion_2d_cuda_mpi.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - GPU MPI
2 | using Printf
3 | using JLD2
4 | using CUDA
5 | using MPI
6 | include(joinpath(@__DIR__, "../../shared.jl"))
7 |
8 | # convenience macros simply to avoid writing nested finite-difference expression
9 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(dx))) end
10 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(dy))) end
11 |
12 | function diffusion_step_kernel!(params, C2, C)
13 | (; dx, dy, dt, D) = params
14 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index
15 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index
16 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2
17 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(dx) +
18 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(dy))
19 | end
20 | return nothing
21 | end
22 |
23 | function diffusion_step!(params, C2, C)
24 | (; nthreads, nblocks) = params
25 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C)
26 | return nothing
27 | end
28 |
29 | # MPI functions
30 | @views function update_halo!(A, bufs, neighbors, comm)
31 | # dim-1 (x)
32 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :])
33 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :])
34 |
35 | reqs = MPI.MultiRequest(4)
36 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_1, comm, reqs[1]; source=neighbors.x[1])
37 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_2, comm, reqs[2]; source=neighbors.x[2])
38 |
39 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_1, comm, reqs[3]; dest=neighbors.x[1])
40 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_2, comm, reqs[4]; dest=neighbors.x[2])
41 | MPI.Waitall(reqs) # blocking
42 |
43 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1)
44 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2)
45 |
46 | # dim-2 (y)
47 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ])
48 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1])
49 |
50 | reqs = MPI.MultiRequest(4)
51 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_1, comm, reqs[1]; source=neighbors.y[1])
52 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_2, comm, reqs[2]; source=neighbors.y[2])
53 |
54 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_1, comm, reqs[3]; dest=neighbors.y[1])
55 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_2, comm, reqs[4]; dest=neighbors.y[2])
56 | MPI.Waitall(reqs) # blocking
57 |
58 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1)
59 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2)
60 | return
61 | end
62 |
63 | function init_bufs(A)
64 | return (; send_1_1=CUDA.zeros(Float64, size(A, 2)), send_1_2=CUDA.zeros(Float64, size(A, 2)),
65 | send_2_1=CUDA.zeros(Float64, size(A, 1)), send_2_2=CUDA.zeros(Float64, size(A, 1)),
66 | recv_1_1=CUDA.zeros(Float64, size(A, 2)), recv_1_2=CUDA.zeros(Float64, size(A, 2)),
67 | recv_2_1=CUDA.zeros(Float64, size(A, 1)), recv_2_2=CUDA.zeros(Float64, size(A, 1)))
68 | end
69 |
70 | function run_diffusion(; ns=64, nt=100, do_save=false)
71 | MPI.Init()
72 | comm = MPI.COMM_WORLD
73 | nprocs = MPI.Comm_size(comm)
74 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple
75 | comm_cart = MPI.Cart_create(comm, dims)
76 | me = MPI.Comm_rank(comm_cart)
77 | coords = MPI.Cart_coords(comm_cart) |> Tuple
78 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1))
79 | # select GPU on multi-GPU system based on shared memory topology
80 | comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)
81 | me_l = MPI.Comm_rank(comm_l)
82 | # map the node-local rank to a GPU (wraps around if there are more ranks than devices)
83 | gpu_id = CUDA.device!(me_l % ndevices())
84 | println("$(gpu_id), out of: $(ndevices())")
85 | (me == 0) && println("nprocs = $(nprocs), dims = $dims")
86 |
87 | params = init_params_gpu_mpi(; dims, coords, ns, nt, do_save)
88 | C, C2 = init_arrays_gpu_mpi(params)
89 | bufs = init_bufs(C)
90 | t_tic = 0.0
91 | # Time loop
92 | for it in 1:nt
93 | # time after warmup (ignore first 10 iterations)
94 | (it == 11) && (t_tic = Base.time())
95 | # diffusion
96 | diffusion_step!(params, C2, C)
97 | update_halo!(C2, bufs, neighbors, comm_cart)
98 | C, C2 = C2, C # pointer swap
99 | end
100 | # synchronize the gpu before querying the final time
101 | CUDA.synchronize()
102 | t_toc = (Base.time() - t_tic)
103 | # "master" prints performance
104 | (me == 0) && print_perf(params, t_toc)
105 | # save to (maybe) visualize later
106 | if do_save
107 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L))
108 | end
109 | MPI.Finalize()
110 | return
111 | end
112 |
113 | # Running things...
114 |
115 | # enable save to disk by default
116 | (!@isdefined do_save) && (do_save = true)
117 | # enable execution by default
118 | (!@isdefined do_run) && (do_run = true)
119 |
120 | if do_run
121 | # run_diffusion(; ns=256, do_save)
122 | run_diffusion(; ns=16384, do_save=false)
123 | end
124 |
--------------------------------------------------------------------------------
/parts/gpu/solution/job_bench_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=128
6 | #SBATCH --gpus-per-task=1
7 | #SBATCH --constraint=gpu
8 | #SBATCH --account=ntrain1
9 | #SBATCH --output=slurm_bench_gpu.out
10 | #SBATCH --qos=regular
11 |
12 | # Load julia
13 | ml use /global/common/software/nersc/n9/julia/modules
14 | ml julia
15 |
16 | for i in 512 2048 4096 8192 16384
17 | do
18 | echo -e "\n\n#### GPU run $i"
19 |
20 | julia --project -e 'do_save=false; include("diffusion_2d_cuda.jl")' $i
21 | done
22 |
--------------------------------------------------------------------------------
/parts/gpu/solution/job_gpu_mpi_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C gpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_gpu_mpi_multinode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=4
9 | #SBATCH --ntasks=16
10 | #SBATCH --gpus-per-node=4
11 | #SBATCH --exclusive
12 | #SBATCH --gpu-bind=none
13 |
14 | # pin to closest NIC to GPU
15 | export MPICH_OFI_NIC_POLICY=GPU
16 |
17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool
18 | export JULIA_CUDA_MEMORY_POOL=none
19 |
20 | ml use /global/common/software/nersc/n9/julia/modules
21 | ml julia
22 |
23 | mpiexecjl --project=../../.. julia diffusion_2d_cuda_mpi.jl
24 |
--------------------------------------------------------------------------------
/parts/gpu/solution/job_gpu_mpi_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C gpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_gpu_mpi_singlenode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=1
9 | #SBATCH --ntasks=4
10 | #SBATCH --gpus-per-node=4
11 | #SBATCH --exclusive
12 | #SBATCH --gpu-bind=none
13 |
14 | # pin to closest NIC to GPU
15 | export MPICH_OFI_NIC_POLICY=GPU
16 |
17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool
18 | export JULIA_CUDA_MEMORY_POOL=none
19 |
20 | ml use /global/common/software/nersc/n9/julia/modules
21 | ml julia
22 |
23 | mpiexecjl --project=../../.. julia diffusion_2d_cuda_mpi.jl
24 |
--------------------------------------------------------------------------------
/parts/gpu/solution/visualize_mpi.jl:
--------------------------------------------------------------------------------
1 | # Visualisation script for the 2D MPI solver
2 | using CairoMakie
3 | using JLD2
4 |
5 | function vizme2D_mpi(nprocs)
6 | C = []
7 | lx = ly = 0.0
8 | ip = 1
9 | for ipx in 1:nprocs[1]
10 | for ipy in 1:nprocs[2]
11 | C_loc, lxy = load(joinpath(@__DIR__, "out_$(ip-1).jld2"), "C", "lxy")
12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2)
13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i
14 | if ip == 1
15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i)
16 | lx, ly = lxy
17 | end
18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc
19 | ip += 1
20 | end
21 | end
22 | xc, yc = LinRange.(0, (lx, ly), size(C))
23 | fig = Figure(; size=(500, 400), fontsize=14)
24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C")
25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1))
26 | cb = Colorbar(fig[1, 1][1, 2], hm)
27 | display(fig)
28 | return
29 | end
30 |
31 | nprocs = (2, 2) # nprocs (x, y) dim
32 | vizme2D_mpi(nprocs)
33 |
--------------------------------------------------------------------------------
/parts/gpu/visualize.jl:
--------------------------------------------------------------------------------
1 | # Visualisation script for the 2D GPU (single-device) solver
2 | using CairoMakie
3 | using JLD2
4 |
5 | function vizme2D()
6 | C, l = load(joinpath(@__DIR__, "out_gpu.jld2"), "C", "l")
7 | xc, yc = LinRange.(0, (l, l), size(C))
8 | fig = Figure(; size=(500, 400), fontsize=14)
9 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C")
10 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1))
11 | cb = Colorbar(fig[1, 1][1, 2], hm)
12 | display(fig)
13 | return
14 | end
15 |
16 | vizme2D()
17 |
--------------------------------------------------------------------------------
/parts/gpu/visualize_mpi.jl:
--------------------------------------------------------------------------------
1 | # Visualisation script for the 2D MPI solver
2 | using CairoMakie
3 | using JLD2
4 |
5 | function vizme2D_mpi(nprocs)
6 | C = []
7 | lx = ly = 0.0
8 | ip = 1
9 | for ipx in 1:nprocs[1]
10 | for ipy in 1:nprocs[2]
11 | C_loc, lxy = load(joinpath(@__DIR__, "out_$(ip-1).jld2"), "C", "lxy")
12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2)
13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i
14 | if ip == 1
15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i)
16 | lx, ly = lxy
17 | end
18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc
19 | ip += 1
20 | end
21 | end
22 | xc, yc = LinRange.(0, (lx, ly), size(C))
23 | fig = Figure(; size=(500, 400), fontsize=14)
24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C")
25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1))
26 | cb = Colorbar(fig[1, 1][1, 2], hm)
27 | display(fig)
28 | return
29 | end
30 |
31 | nprocs = (2, 2) # nprocs (x, y) dim
32 | vizme2D_mpi(nprocs)
33 |
--------------------------------------------------------------------------------
/parts/mpi/README.md:
--------------------------------------------------------------------------------
1 | # Diffusion 2D - MPI
2 |
3 | In this part, we want to use MPI (distributed parallelism) to parallelize our Diffusion 2D example.
4 |
5 | The starting point is (once again) the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). The file [`diffusion_2d_mpi.jl`](./diffusion_2d_mpi.jl) in this folder is a modified copy of this variant. While the computational kernel `diffusion_step!` is essentially untouched, we included MPI bits at the beginning of the `run_diffusion` function and introduced the key function `update_halo!`, which is supposed to take care of data exchange between MPI ranks. However, as of now, the function isn't communicating anything and it will be (one of) your tasks to fix that 😉.
6 |
7 |
8 | ## Task 1 - Running the MPI code
9 |
10 | Although incomplete from a semantic point of view, the code in `diffusion_2d_mpi.jl` is perfectly runnable as is. It won't compute the right thing, but it runs 😉. So **let's run it**. But how?
11 |
12 | First thing to realize is that, on Perlmutter, **you can't run MPI on a login node**. You have two options to work on a compute node:
13 |
14 | 1) **Interactive session**: You can try to get an interactive session on a compute node by running `sh get_compute_node_interactive.sh`. But unfortunately, we don't have a node for everyone, so you might not get one (Sorry!). **If you can get one**, you can use `mpiexecjl --project -n 4 julia diffusion_2d_mpi.jl` to run the code. Alternatively, you can run `sh job_mpi_singlenode.sh`.
15 |
16 | 2) **Compute job**: You can always submit a job that runs the code: `sbatch job_mpi_singlenode.sh`. The output will land in `slurm_mpi_singlenode.out`. Check out the [Perlmutter cheatsheet](../../help/perlmutter_cheatsheet.md) to learn more about jobs.
17 |
18 | Irrespective of which option you choose, **go ahead and run the code** (with 4 MPI ranks).
19 |
20 | To see that the code is currently not working properly (in the sense of computing the right thing), run `julia --project visualize_mpi.jl` to combine the results of different MPI ranks (`*.jld2` files) into a visualization (`visualization.png`). Inspect the visualization and notice the undesired dark lines.
21 |
22 | ## Task 2 - Halo exchange
23 |
24 | Take a look at the general MPI setup (the beginning of `run_diffusion`) and the `update_halo!` function (the bits that are already there) and try to understand it.
25 |
26 | Afterwards, implement the necessary MPI communication. To that end, find the "TODO" block in `update_halo!` and follow the instructions. Note that we want to use **non-blocking** communication, i.e. you should use the functions `MPI.Irecv` and `MPI.Isend`.
27 |
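If you are unsure about the MPI.jl API, here is a minimal, self-contained sketch of the non-blocking pattern (generic buffers and a hard-coded partner rank, *not* the exercise solution; you could save it to a scratch file and run it with `mpiexecjl --project -n 2 julia <file>`):

```julia
using MPI
MPI.Init()
comm    = MPI.COMM_WORLD
rank    = MPI.Comm_rank(comm)
partner = 1 - rank                                   # assumes exactly 2 ranks
send_buf = fill(Float64(rank), 4)
recv_buf = zeros(4)
reqs = MPI.MultiRequest(2)
MPI.Irecv!(recv_buf, comm, reqs[1]; source=partner)  # post the receive first
MPI.Isend(send_buf, comm, reqs[2]; dest=partner)     # then the non-blocking send
MPI.Waitall(reqs)                                    # wait for both to complete
println("rank $rank received $recv_buf")
MPI.Finalize()
```

In `update_halo!`, the same `Irecv!`/`Isend`/`Waitall` pattern is applied once per neighbor, using the buffers created in `init_bufs`.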
28 | Check that your code is working by comparing the `visualization.png` that you get to this (basic "eye test"):
29 |
30 | <img src="./solution/visualization_desired.png">
31 |
32 | ## Task 3 - Benchmark
33 |
34 | ### Part A
35 |
36 | Our goal is to perform a rough and basic scaling analysis with 4, 8, and 16 MPI ranks distributed across multiple nodes. Specifically, we want to run 4 MPI ranks on a node and increase the number of nodes to get up to 16 ranks in total.
37 |
38 | The file `job_mpi_multinode.sh` is a job script that currently requests a single node (see the line `#SBATCH --nodes=1`) that runs 4 MPI ranks (see the line `#SBATCH --ntasks-per-node=4`), and then runs our Julia MPI code with `do_save=false` for simplicity and `ns=6144`.
39 |
40 | Submit this file to SLURM via `sbatch job_mpi_multinode.sh`. Once the job has run, the output will land in `slurm_mpi_multinode.out`. Write the output down somewhere (copy & paste), change the number of nodes to 2 (= 8 MPI ranks in total) and rerun the experiment. Repeat the same thing, this time requesting 3 nodes (= 12 MPI ranks in total) and then requesting 4 nodes (= 16 MPI ranks in total).
41 |
42 | ### Part B
43 |
44 | Inspect the results that you've obtained and compare them.
45 |
46 | **Questions**
47 | * What do you observe?
48 | * Is this what you expected?
49 |
50 | Note that in setting up our MPI ranks, we split the global grid into local grids. In the process, the meaning of the input parameter `ns` changed compared to the previous codes (serial & multithreading): it now determines the resolution of the **local grid** held by each MPI rank, rather than the resolution of the global grid. Since we keep `ns` fixed (6144 in `job_mpi_multinode.sh`), we thus increase the problem size (the total grid resolution) when we increase the number of MPI ranks, as illustrated below. This is known as a "weak scaling" analysis.
51 |
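To make this concrete, here is a small illustration of how the global grid grows in this setup (ignoring halo cells; the `dims` tuples shown are what `MPI.Dims_create` typically returns for these rank counts):

```julia
ns = 6144  # local grid resolution per MPI rank, as in job_mpi_multinode.sh
for (nranks, dims) in ((4, (2, 2)), (8, (4, 2)), (12, (4, 3)), (16, (4, 4)))
    println("$nranks ranks, dims = $dims -> global grid ", ns .* dims)
end
```
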
52 | **Question**
53 |
54 | * Given the comment above, what does "ideal parallel scaling" mean in the context of a "weak scaling" analysis?
55 | * What do the observed results tell you?
56 |
--------------------------------------------------------------------------------
/parts/mpi/diffusion_2d_mpi.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - MPI
2 | using Printf
3 | using JLD2
4 | using MPI
5 | include(joinpath(@__DIR__, "../shared.jl"))
6 |
7 | # convenience macros simply to avoid writing nested finite-difference expression
8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / dx)) end
9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / dy)) end
10 |
11 | function diffusion_step!(params, C2, C)
12 | (; dx, dy, dt, D) = params
13 | for iy in 1:size(C, 2)-2
14 | for ix in 1:size(C, 1)-2
15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / dx +
16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / dy)
17 | end
18 | end
19 | return nothing
20 | end
21 |
22 | # MPI functions
23 | @views function update_halo!(A, bufs, neighbors, comm)
24 | #
25 | # !!! TODO
26 | #
27 | # Complete the halo exchange implementation. Specifically, use non-blocking
28 | # MPI communication (Irecv and Isend) at the positions marked by "TODO..." below.
29 | #
30 | # Help:
31 | # left neighbor: neighbors.x[1]
32 | # right neighbor: neighbors.x[2]
33 | # up neighbor: neighbors.y[1]
34 | # down neighbor: neighbors.y[2]
35 | #
36 |
37 | # dim-1 (x)
38 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :])
39 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :])
40 |
41 | reqs = MPI.MultiRequest(4)
42 | (neighbors.x[1] != MPI.PROC_NULL) && # TODO... receive from left neighbor into bufs.recv_1_1
43 | (neighbors.x[2] != MPI.PROC_NULL) && # TODO... receive from right neighbor into bufs.recv_1_2
44 |
45 | (neighbors.x[1] != MPI.PROC_NULL) && # TODO... send bufs.send_1_1 to left neighbor
46 | (neighbors.x[2] != MPI.PROC_NULL) && # TODO... send bufs.send_1_2 to right neighbor
47 | MPI.Waitall(reqs) # blocking
48 |
49 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1)
50 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2)
51 |
52 | # dim-2 (y)
53 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ])
54 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1])
55 |
56 | reqs = MPI.MultiRequest(4)
57 | (neighbors.y[1] != MPI.PROC_NULL) && # TODO... receive from up neighbor into bufs.recv_2_1
58 | (neighbors.y[2] != MPI.PROC_NULL) && # TODO... receive from down neighbor into bufs.recv_2_2
59 |
60 | (neighbors.y[1] != MPI.PROC_NULL) && # TODO... send bufs.send_2_1 to up neighbor
61 | (neighbors.y[2] != MPI.PROC_NULL) && # TODO... send bufs.send_2_2 to down neighbor
62 | MPI.Waitall(reqs) # blocking
63 |
64 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1)
65 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2)
66 | return nothing
67 | end
68 |
69 | function init_bufs(A)
70 | return (; send_1_1=zeros(size(A, 2)), send_1_2=zeros(size(A, 2)),
71 | send_2_1=zeros(size(A, 1)), send_2_2=zeros(size(A, 1)),
72 | recv_1_1=zeros(size(A, 2)), recv_1_2=zeros(size(A, 2)),
73 | recv_2_1=zeros(size(A, 1)), recv_2_2=zeros(size(A, 1)))
74 | end
75 |
76 | function run_diffusion(; ns=64, nt=100, do_save=false)
77 | MPI.Init()
78 | comm = MPI.COMM_WORLD
79 | nprocs = MPI.Comm_size(comm)
80 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple
81 | comm_cart = MPI.Cart_create(comm, dims)
82 | me = MPI.Comm_rank(comm_cart)
83 | coords = MPI.Cart_coords(comm_cart) |> Tuple
84 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1))
85 | (me == 0) && println("nprocs = $(nprocs), dims = $dims")
86 |
87 | params = init_params_mpi(; dims, coords, ns, nt, do_save)
88 | C, C2 = init_arrays_mpi(params)
89 | bufs = init_bufs(C)
90 | t_tic = 0.0
91 | # time loop
92 | for it in 1:nt
93 | # time after warmup (ignore first 10 iterations)
94 | (it == 11) && (t_tic = Base.time())
95 | # diffusion
96 | diffusion_step!(params, C2, C)
97 | update_halo!(C2, bufs, neighbors, comm_cart)
98 | C, C2 = C2, C # pointer swap
99 | end
100 | t_toc = (Base.time() - t_tic)
101 | # "master" prints performance
102 | (me == 0) && print_perf(params, t_toc)
103 | # save to (maybe) visualize later
104 | if do_save
105 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L))
106 | end
107 | MPI.Finalize()
108 | return nothing
109 | end
110 |
111 | # Running things...
112 |
113 | # enable save to disk by default
114 | (!@isdefined do_save) && (do_save = true)
115 | # enable execution by default
116 | (!@isdefined do_run) && (do_run = true)
117 |
118 | if do_run
119 | if !isempty(ARGS)
120 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save)
121 | else
122 | run_diffusion(; ns=256, do_save)
123 | end
124 | end
125 |
--------------------------------------------------------------------------------
/parts/mpi/explanation/01_mpi+jupyter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "650f758f-84da-4dd3-9479-8dbc49ebc3d4",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "skip"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# Setup\n",
14 | "\n",
15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "id": "89ab4e89-10ca-4ba8-a7bc-d33fcf3f2e60",
22 | "metadata": {
23 | "slideshow": {
24 | "slide_type": "skip"
25 | },
26 | "tags": []
27 | },
28 | "outputs": [
29 | {
30 | "name": "stderr",
31 | "output_type": "stream",
32 | "text": [
33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n"
34 | ]
35 | },
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n",
41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n",
42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n",
43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n",
44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n",
45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n",
46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n",
47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "import Pkg;\n",
53 | "Pkg.activate(@__DIR__)\n",
54 | "Pkg.status()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "id": "53799c57-9c82-4cb2-9a73-f858a8725071",
60 | "metadata": {
61 | "slideshow": {
62 | "slide_type": "slide"
63 | },
64 | "tags": []
65 | },
66 | "source": [
67 | "# Julia + Jupyter + MPI\n",
68 | "\n",
69 | "`MPI.jl` provides wrappers for the system MPI libraries. And the `MPIClusterManagers.jl` package lets you control MPI workflows within Julia"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "89cfa159-4234-4961-b18e-6f7a4472bb04",
75 | "metadata": {
76 | "slideshow": {
77 | "slide_type": "subslide"
78 | },
79 | "tags": []
80 | },
81 | "source": [
82 | "## MPI.jl"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 2,
88 | "id": "6bcb1ba8-c4da-4311-a873-3354126c952d",
89 | "metadata": {
90 | "slideshow": {
91 | "slide_type": "fragment"
92 | },
93 | "tags": []
94 | },
95 | "outputs": [],
96 | "source": [
97 | "using MPI"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "1f4228e3-d910-451b-8523-7b60f342788d",
103 | "metadata": {
104 | "slideshow": {
105 | "slide_type": "fragment"
106 | },
107 | "tags": []
108 | },
109 | "source": [
110 | "`MPI.versioninfo()` tells you which MPI backend is being used by `MPI.jl`. On HPC systems, which rely on vendor-provided MPI implementations (e.g. on HPE Cray systems like Perlmutter), make sure that `MPI.jl` loads the \"right\" `libmpi.so`:"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 3,
116 | "id": "eb4f99e3-63a2-43af-903d-36cfbe011415",
117 | "metadata": {
118 | "slideshow": {
119 | "slide_type": "subslide"
120 | },
121 | "tags": []
122 | },
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "MPIPreferences:\n",
129 | " binary: system\n",
130 | " abi: MPICH\n",
131 | " libmpi: libmpi_gnu_123.so\n",
132 | " mpiexec: srun\n",
133 | "\n",
134 | "Package versions\n",
135 | " MPI.jl: 0.20.20\n",
136 | " MPIPreferences.jl: 0.1.11\n",
137 | "\n",
138 | "Library information:\n",
139 | " libmpi: libmpi_gnu_123.so\n",
140 | " libmpi dlpath: /opt/cray/pe/lib64/libmpi_gnu_123.so\n",
141 | " MPI version: 3.1.0\n",
142 | " Library version: \n",
143 | " MPI VERSION : CRAY MPICH version 8.1.28.29 (ANL base 3.4a2)\n",
144 | " MPI BUILD INFO : Wed Nov 15 20:57 2023 (git hash 1cde46f)\n",
145 | " \n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "MPI.versioninfo()"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "0ebcbfaa-839b-4d40-a9ef-fc99cee61b04",
156 | "metadata": {
157 | "slideshow": {
158 | "slide_type": "subslide"
159 | },
160 | "tags": []
161 | },
162 | "source": [
163 | "## MPIClusterManagers.jl"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "id": "338abb9b-48de-4c85-9e82-bc08927ad43a",
169 | "metadata": {
170 | "slideshow": {
171 | "slide_type": "fragment"
172 | },
173 | "tags": []
174 | },
175 | "source": [
176 | "`MPIClusterManagers.jl` provide a way for Jupyter to connect to MPI processes."
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "id": "8725708a-b5b5-4cac-8983-c95a0c4b7ab9",
182 | "metadata": {
183 | "slideshow": {
184 | "slide_type": "fragment"
185 | },
186 | "tags": []
187 | },
188 | "source": [
189 | "On Perlmutter, we have a choice among network interfaces:"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 3,
195 | "id": "d2e41152-6380-4b21-8bbe-71257eb8aba7",
196 | "metadata": {
197 | "slideshow": {
198 | "slide_type": "fragment"
199 | },
200 | "tags": []
201 | },
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "6-element Vector{NetworkInterfaceControllers.Interface}:\n",
207 | " NetworkInterfaceControllers.Interface(\"nmn0\", :v4, ip\"10.100.108.57\")\n",
208 | " NetworkInterfaceControllers.Interface(\"hsn0\", :v4, ip\"10.249.42.35\")\n",
209 | " NetworkInterfaceControllers.Interface(\"hsn0:chn\", :v4, ip\"128.55.84.171\")\n",
210 | " NetworkInterfaceControllers.Interface(\"hsn1\", :v4, ip\"10.249.42.19\")\n",
211 | " NetworkInterfaceControllers.Interface(\"hsn2\", :v4, ip\"10.249.42.20\")\n",
212 | " NetworkInterfaceControllers.Interface(\"hsn3\", :v4, ip\"10.249.42.36\")"
213 | ]
214 | },
215 | "execution_count": 3,
216 | "metadata": {},
217 | "output_type": "execute_result"
218 | }
219 | ],
220 | "source": [
221 | "using NetworkInterfaceControllers, Sockets\n",
222 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "id": "78c91aa1-41ce-450a-b646-d8574e8740f4",
228 | "metadata": {
229 | "slideshow": {
230 | "slide_type": "subslide"
231 | },
232 | "tags": []
233 | },
234 | "source": [
235 | "Buf we have to be careful about which network we connect to:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 4,
241 | "id": "a31df2d1-6a35-4420-9385-b60af0831074",
242 | "metadata": {
243 | "slideshow": {
244 | "slide_type": "skip"
245 | },
246 | "tags": []
247 | },
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/plain": [
252 | "filter (generic function with 11 methods)"
253 | ]
254 | },
255 | "execution_count": 4,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "import Base: filter, Fix1\n",
262 | "filter(f::Function)::Function = Fix1(filter, f)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 5,
268 | "id": "26e0a840-7b61-4202-974c-1cda95820690",
269 | "metadata": {
270 | "slideshow": {
271 | "slide_type": "skip"
272 | },
273 | "tags": []
274 | },
275 | "outputs": [],
276 | "source": [
277 | "using Hwloc, AbstractTrees\n",
278 | "\n",
279 | "import AbstractTrees: PreOrderDFS\n",
280 | "import Hwloc: hwloc_pci_class_string\n",
281 | "\n",
282 | "sys_devs = children(gettopology())\n",
283 | "pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device)\n",
284 | "net_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == \"Ethernet\")\n",
285 | "\n",
286 | ";"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 6,
292 | "id": "848daddc-d8cb-4ad0-9a33-eed34197e3cb",
293 | "metadata": {
294 | "slideshow": {
295 | "slide_type": "fragment"
296 | },
297 | "tags": []
298 | },
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "Device hsn0 is a Slingshot device\n",
305 | "Device nmn0 is a Unknown device\n",
306 | "Device hsn1 is a Slingshot device\n",
307 | "Device hsn2 is a Slingshot device\n",
308 | "Device hsn3 is a Slingshot device\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "# net_devs are populated using Hwloc, please take a look at the source notebook\n",
314 | "# for further information\n",
315 | "\n",
316 | "for dev in net_devs\n",
317 | " io = dev.io_children |> only\n",
318 | " name = io.object.name\n",
319 | " kind = io.object.subtype\n",
320 | " kind = kind == \"\" ? \"Unknown\" : kind\n",
321 | " println(\"Device $(name) is a $(kind) device\")\n",
322 | "end"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "id": "36cb812b-3779-48ae-a982-d3aa8599b39f",
328 | "metadata": {
329 | "slideshow": {
330 | "slide_type": "fragment"
331 | },
332 | "tags": []
333 | },
334 | "source": [
335 | "Therefore only the `hsn*` defivices are Slingshot devices."
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "id": "f6d965b3-1002-41ec-a964-6e4f71faf95e",
341 | "metadata": {
342 | "slideshow": {
343 | "slide_type": "subslide"
344 | },
345 | "tags": []
346 | },
347 | "source": [
348 | "Let's now use this information to find a HSN device with which we manage our MPI cluster. Note: we'll take the one with `:chn` in the name (as it's the only one with a public IP):"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 7,
354 | "id": "af6bdb63-1f0e-4bf6-ad6a-144d365a7e97",
355 | "metadata": {
356 | "slideshow": {
357 | "slide_type": "fragment"
358 | },
359 | "tags": []
360 | },
361 | "outputs": [
362 | {
363 | "data": {
364 | "text/plain": [
365 | "NetworkInterfaceControllers.Interface(\"hsn0:chn\", :v4, ip\"128.55.84.171\")"
366 | ]
367 | },
368 | "execution_count": 7,
369 | "metadata": {},
370 | "output_type": "execute_result"
371 | }
372 | ],
373 | "source": [
374 | "hsn0_public = filter(\n",
375 | " x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces\n",
376 | ") |> only "
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 8,
382 | "id": "1a502b97-b4e1-44f9-a5e9-3bc09c0e8491",
383 | "metadata": {
384 | "slideshow": {
385 | "slide_type": "fragment"
386 | },
387 | "tags": []
388 | },
389 | "outputs": [
390 | {
391 | "data": {
392 | "text/plain": [
393 | "\"nid200344-hsn0\""
394 | ]
395 | },
396 | "execution_count": 8,
397 | "metadata": {},
398 | "output_type": "execute_result"
399 | }
400 | ],
401 | "source": [
402 | "public_slingshot_name = getnameinfo(hsn0_public.ip)"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "id": "70db6ae1-a001-4606-9933-55f2ac158be2",
408 | "metadata": {
409 | "slideshow": {
410 | "slide_type": "slide"
411 | },
412 | "tags": []
413 | },
414 | "source": [
415 | "## MPI Worker Cluster\n",
416 | "\n",
417 | "We use `MPIClusterManagers.jl` to start a cluster of workers. Each worker uses MPI to communicate (`MPIWorkerManager` stars an `srun` session), and is controlled via the device at `public_slingshot_name` (previous section):"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 9,
423 | "id": "1c81c337-5e88-4688-bcf2-f48b6eeb98e8",
424 | "metadata": {
425 | "slideshow": {
426 | "slide_type": "fragment"
427 | },
428 | "tags": []
429 | },
430 | "outputs": [
431 | {
432 | "data": {
433 | "text/plain": [
434 | "4-element Vector{Int64}:\n",
435 | " 2\n",
436 | " 3\n",
437 | " 4\n",
438 | " 5"
439 | ]
440 | },
441 | "execution_count": 9,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "# to import MPIManager\n",
448 | "using MPIClusterManagers\n",
449 | "\n",
450 | "# need to also import Distributed to use addprocs()\n",
451 | "using Distributed\n",
452 | "\n",
453 | "# specify, number of mpi workers, launch cmd, etc.\n",
454 | "manager=MPIWorkerManager(4)\n",
455 | "\n",
456 | "# start mpi workers and add them as julia workers too.\n",
457 | "addprocs(\n",
458 | " manager,\n",
459 | " exeflags=`--project=$(Base.active_project())`,\n",
460 | " master_tcp_interface=public_slingshot_name\n",
461 | ")"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "id": "343ca90a-f66e-43d6-a887-2b6956fae59e",
467 | "metadata": {
468 | "slideshow": {
469 | "slide_type": "subslide"
470 | },
471 | "tags": []
472 | },
473 | "source": [
474 | "Now we can use `@mpi_do` to issue instructions to all of our MPI workers:"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": 10,
480 | "id": "0f6bc5b9-2973-4dc5-8fdd-bfd483f01460",
481 | "metadata": {
482 | "slideshow": {
483 | "slide_type": "fragment"
484 | },
485 | "tags": []
486 | },
487 | "outputs": [
488 | {
489 | "name": "stdout",
490 | "output_type": "stream",
491 | "text": [
492 | " From worker 5:\tHello world, I am 3 of 4 on nid200349\n",
493 | " From worker 4:\tHello world, I am 2 of 4 on nid200348\n",
494 | " From worker 2:\tHello world, I am 0 of 4 on nid200344\n",
495 | " From worker 3:\tHello world, I am 1 of 4 on nid200345\n"
496 | ]
497 | }
498 | ],
499 | "source": [
500 | "@mpi_do manager begin\n",
501 | " using MPI: MPI, Comm, Win, free\n",
502 | " comm = MPI.COMM_WORLD\n",
503 | " rank = MPI.Comm_rank(comm)\n",
504 | " size = MPI.Comm_size(comm)\n",
505 | " name = gethostname()\n",
506 | " println(\"Hello world, I am $(rank) of $(size) on $(name)\")\n",
507 | "end"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "id": "98174d30-5828-43f9-b63d-11d85a46185c",
513 | "metadata": {
514 | "slideshow": {
515 | "slide_type": "fragment"
516 | },
517 | "tags": []
518 | },
519 | "source": [
520 | "We started this in a 4-node job. Therefore each worker is on a different node."
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "id": "88e46e3b-f8d4-48d5-b8fc-2ae660f5a4a8",
527 | "metadata": {
528 | "slideshow": {
529 | "slide_type": "skip"
530 | },
531 | "tags": []
532 | },
533 | "outputs": [],
534 | "source": []
535 | }
536 | ],
537 | "metadata": {
538 | "kernelspec": {
539 | "display_name": "Julia 1.9.4",
540 | "language": "julia",
541 | "name": "julia-1.9.4"
542 | },
543 | "language_info": {
544 | "file_extension": ".jl",
545 | "mimetype": "application/julia",
546 | "name": "julia",
547 | "version": "1.9.4"
548 | }
549 | },
550 | "nbformat": 4,
551 | "nbformat_minor": 5
552 | }
553 |
--------------------------------------------------------------------------------
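For reference, the worker-cluster bring-up demonstrated in the notebook above condenses to the following sketch. It assumes a Perlmutter-style compute node exposing a public `hsn0:chn` Slingshot interface and an active project that provides MPI.jl, MPIClusterManagers.jl and NetworkInterfaceControllers.jl; the calls mirror the notebook cells.

```julia
# Condensed cluster bring-up (a sketch; assumes a public "hsn0:chn" interface).
using MPI
using NetworkInterfaceControllers, Sockets
using MPIClusterManagers, Distributed

# Resolve the hostname of the public high-speed-network interface.
interfaces  = NetworkInterfaceControllers.get_interface_data(IPv4)
hsn0_public = only(filter(x -> x.name == "hsn0:chn" && x.version == :v4, interfaces))
public_slingshot_name = getnameinfo(hsn0_public.ip)

# Launch 4 MPI ranks (via srun) and register them as Julia workers.
manager = MPIWorkerManager(4)
addprocs(
    manager,
    exeflags = `--project=$(Base.active_project())`,
    master_tcp_interface = public_slingshot_name,
)

# Broadcast work to every MPI rank.
@mpi_do manager begin
    using MPI
    comm = MPI.COMM_WORLD
    println("Hello, I am $(MPI.Comm_rank(comm)) of $(MPI.Comm_size(comm)) on $(gethostname())")
end
```

Workers launched this way behave like ordinary Distributed.jl workers, while `@mpi_do manager` lets MPI calls run across all of them at once.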
/parts/mpi/explanation/02_comms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4d3cf46f-8189-4609-b217-29948b377255",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "skip"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# Setup\n",
14 | "\n",
15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "id": "89ab4e89-10ca-4ba8-a7bc-d33fcf3f2e60",
22 | "metadata": {
23 | "slideshow": {
24 | "slide_type": "skip"
25 | },
26 | "tags": []
27 | },
28 | "outputs": [
29 | {
30 | "name": "stderr",
31 | "output_type": "stream",
32 | "text": [
33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n"
34 | ]
35 | },
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n",
41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n",
42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n",
43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n",
44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n",
45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n",
46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n",
47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "import Pkg;\n",
53 | "Pkg.activate(@__DIR__)\n",
54 | "Pkg.status()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "id": "1c81c337-5e88-4688-bcf2-f48b6eeb98e8",
61 | "metadata": {
62 | "slideshow": {
63 | "slide_type": "skip"
64 | },
65 | "tags": []
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/plain": [
71 | "4-element Vector{Int64}:\n",
72 | " 2\n",
73 | " 3\n",
74 | " 4\n",
75 | " 5"
76 | ]
77 | },
78 | "execution_count": 2,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "using MPI\n",
85 | "\n",
86 | "using NetworkInterfaceControllers, Sockets\n",
87 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)\n",
88 | "\n",
89 | "hsn0_public = filter(x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces) |> only \n",
90 | "public_slingshot_name = getnameinfo(hsn0_public.ip)\n",
91 | "\n",
92 | "# to import MPIManager\n",
93 | "using MPIClusterManagers\n",
94 | "\n",
95 | "# need to also import Distributed to use addprocs()\n",
96 | "using Distributed\n",
97 | "\n",
98 | "# specify, number of mpi workers, launch cmd, etc.\n",
99 | "manager=MPIWorkerManager(4)\n",
100 | "\n",
101 | "# start mpi workers and add them as julia workers too.\n",
102 | "addprocs(\n",
103 | " manager,\n",
104 | " exeflags=`--project=$(Base.active_project())`,\n",
105 | " master_tcp_interface=public_slingshot_name\n",
106 | ")"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "id": "53799c57-9c82-4cb2-9a73-f858a8725071",
112 | "metadata": {
113 | "slideshow": {
114 | "slide_type": "slide"
115 | },
116 | "tags": []
117 | },
118 | "source": [
119 | "# Communication with MPI.jl"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "id": "332001ad-3b08-4ceb-b4e4-54e619451191",
125 | "metadata": {
126 | "slideshow": {
127 | "slide_type": "fragment"
128 | },
129 | "tags": []
130 | },
131 | "source": [
132 | "Picking up from the previous demo, we have a job with 4 ranks: "
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 3,
138 | "id": "0f6bc5b9-2973-4dc5-8fdd-bfd483f01460",
139 | "metadata": {
140 | "slideshow": {
141 | "slide_type": "fragment"
142 | },
143 | "tags": []
144 | },
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | " From worker 5:\tHello world, I am 3 of 4 on nid200349\n",
151 | " From worker 2:\tHello world, I am 0 of 4 on nid200344\n",
152 | " From worker 4:\tHello world, I am 2 of 4 on nid200348\n",
153 | " From worker 3:\tHello world, I am 1 of 4 on nid200345\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "@mpi_do manager begin\n",
159 | " using MPI: MPI, Comm, Win, free\n",
160 | " comm = MPI.COMM_WORLD\n",
161 | " rank = MPI.Comm_rank(comm)\n",
162 | " size = MPI.Comm_size(comm)\n",
163 | " name = gethostname()\n",
164 | " println(\"Hello world, I am $(rank) of $(size) on $(name)\")\n",
165 | "end"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "id": "7982d349-c25e-4bc9-9624-bbf6f2b6c8cc",
171 | "metadata": {
172 | "slideshow": {
173 | "slide_type": "slide"
174 | },
175 | "tags": []
176 | },
177 | "source": [
178 | "## Domain Decomposition"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "id": "63c5872e-ab53-4871-8bf0-be59956fd42e",
184 | "metadata": {
185 | "slideshow": {
186 | "slide_type": "fragment"
187 | },
188 | "tags": []
189 | },
190 | "source": [
191 | "PDE solvers often break up work over a \"grid\" of ranks (domain decomposition). This will find the dimension of this grid:"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 4,
197 | "id": "1122c61b-aa2b-47e5-871f-ea7f2f1d501b",
198 | "metadata": {
199 | "slideshow": {
200 | "slide_type": "fragment"
201 | },
202 | "tags": []
203 | },
204 | "outputs": [],
205 | "source": [
206 | "@mpi_do manager begin\n",
207 | " dims = [0]\n",
208 | " MPI.Dims_create!(size, dims)\n",
209 | "end"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 5,
215 | "id": "612ccdb8-8e29-41cc-8c1f-af533e355715",
216 | "metadata": {
217 | "slideshow": {
218 | "slide_type": "fragment"
219 | },
220 | "tags": []
221 | },
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | " From worker 3:\t[4]\n",
228 | " From worker 2:\t[4]\n",
229 | " From worker 4:\t[4]\n",
230 | " From worker 5:\t[4]\n"
231 | ]
232 | }
233 | ],
234 | "source": [
235 | "@mpi_do manager begin\n",
236 | " println(dims)\n",
237 | "end"
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "id": "4ec74bff-4668-4c33-b93a-ee19f67551ac",
243 | "metadata": {
244 | "slideshow": {
245 | "slide_type": "fragment"
246 | },
247 | "tags": []
248 | },
249 | "source": [
250 | "Each rank has the same value for `dims`. In $N$-dimensions, `length(dims) == N`."
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "id": "3b3679ec-dfac-46d7-97ef-e0ad10ffe295",
256 | "metadata": {
257 | "slideshow": {
258 | "slide_type": "slide"
259 | },
260 | "tags": []
261 | },
262 | "source": [
263 | "## Cartesian Grids"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "id": "871f8fd5-7504-4b03-9a62-e63d3278d098",
269 | "metadata": {
270 | "slideshow": {
271 | "slide_type": "fragment"
272 | },
273 | "tags": []
274 | },
275 | "source": [
276 | "We will now lay out each rank in a \"grid\" (in this example, $N=1$ so it's actually a line). In the excercise, $N=2$, so this will be an actual \"grid\". The steps here are pretty much the same though."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 6,
282 | "id": "c33bfb02-e341-40e4-8315-83734796a18b",
283 | "metadata": {
284 | "slideshow": {
285 | "slide_type": "fragment"
286 | },
287 | "tags": []
288 | },
289 | "outputs": [],
290 | "source": [
291 | "@mpi_do manager begin\n",
292 | " comm_cart = MPI.Cart_create(\n",
293 | " comm, # MPI Communicator\n",
294 | " dims, # Dimensions of grid\n",
295 | " [0], # 0 == not periodic, 1 == periodic\n",
296 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n",
297 | " )\n",
298 | " me = MPI.Comm_rank(comm_cart)\n",
299 | " coords = MPI.Cart_coords(comm_cart)\n",
300 | " neighbors = MPI.Cart_shift(\n",
301 | " comm_cart,\n",
302 | " 0, # Which dimension to shift (zero-indexed)\n",
303 | " 1, # Shift magnitude\n",
304 | " )\n",
305 | "end"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 7,
311 | "id": "e8cf1293-b416-415f-a14e-d529a9e3e7bc",
312 | "metadata": {
313 | "slideshow": {
314 | "slide_type": "subslide"
315 | },
316 | "tags": []
317 | },
318 | "outputs": [],
319 | "source": [
320 | "@mpi_do manager begin\n",
321 | " comm_cart = MPI.Cart_create(\n",
322 | " comm, # MPI Communicator\n",
323 | " dims, # Dimensions of grid\n",
324 | " [0], # 0 == not periodic, 1 == periodic\n",
325 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n",
326 | " )\n",
327 | " me = MPI.Comm_rank(comm_cart)\n",
328 | " coords = MPI.Cart_coords(comm_cart)\n",
329 | " neighbors = MPI.Cart_shift(\n",
330 | " comm_cart,\n",
331 | " 0, # Which dimension to shift (zero-indexed)\n",
332 | " 1, # Shift magnitude\n",
333 | " )\n",
334 | "end"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 8,
340 | "id": "d3ab1a58-0aea-4ec5-a79b-48bcd810c631",
341 | "metadata": {
342 | "slideshow": {
343 | "slide_type": "fragment"
344 | },
345 | "tags": []
346 | },
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | " From worker 2:\trank=0; coord=[0], neighbors=(-1, 1)\n",
353 | " From worker 3:\trank=1; coord=[1], neighbors=(0, 2)\n",
354 | " From worker 5:\trank=3; coord=[3], neighbors=(2, -1)\n",
355 | " From worker 4:\trank=2; coord=[2], neighbors=(1, 3)\n"
356 | ]
357 | }
358 | ],
359 | "source": [
360 | "@mpi_do manager begin\n",
361 | " println(\"rank=$(me); coord=$(coords), neighbors=$(neighbors)\")\n",
362 | "end"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 9,
368 | "id": "63bda425-3a47-4a1c-ba8b-ae3c891d3021",
369 | "metadata": {
370 | "slideshow": {
371 | "slide_type": "subslide"
372 | },
373 | "tags": []
374 | },
375 | "outputs": [
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | " From worker 5:\trank=3; coord=[3], neighbors=(2, -1)\n",
381 | " From worker 2:\trank=0; coord=[0], neighbors=(-1, 1)\n",
382 | " From worker 4:\trank=2; coord=[2], neighbors=(1, 3)\n",
383 | " From worker 3:\trank=1; coord=[1], neighbors=(0, 2)\n"
384 | ]
385 | }
386 | ],
387 | "source": [
388 | "@mpi_do manager begin\n",
389 | " println(\"rank=$(me); coord=$(coords), neighbors=$(neighbors)\")\n",
390 | "end"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "id": "b80b410a-c68c-4e38-ab1c-e355c4d20d8c",
396 | "metadata": {
397 | "slideshow": {
398 | "slide_type": "fragment"
399 | },
400 | "tags": []
401 | },
402 | "source": [
403 | "MPI contains several constants, for example what `-1` means in the context above. This means that there is \"no neighbor\" there:"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 10,
409 | "id": "94bc63d1-24cc-47f6-a6ab-4624d95523fd",
410 | "metadata": {
411 | "slideshow": {
412 | "slide_type": "fragment"
413 | },
414 | "tags": []
415 | },
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "-1"
421 | ]
422 | },
423 | "execution_count": 10,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "MPI.PROC_NULL"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "id": "b165e80a-91ce-4233-a8e4-4bd3f09786c1",
435 | "metadata": {
436 | "slideshow": {
437 | "slide_type": "slide"
438 | },
439 | "tags": []
440 | },
441 | "source": [
442 | "## Point-to-point Communication"
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "id": "07f6f278-6dc3-4042-aa06-4abc9a7fa7f4",
448 | "metadata": {
449 | "slideshow": {
450 | "slide_type": "fragment"
451 | },
452 | "tags": []
453 | },
454 | "source": [
455 | "Let's do something harder:\n",
456 | "1. Each rank draws a random number between 1 and 100\n",
457 | "2. Each rank's random number is shared with its neighbors"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "id": "b286f218-4851-4f11-b3e2-550635a2c688",
463 | "metadata": {
464 | "slideshow": {
465 | "slide_type": "fragment"
466 | },
467 | "tags": []
468 | },
469 | "source": [
470 | "This is an example of point-to-point communication on a grid. We'll be using the same communication pattern in the excercise."
471 | ]
472 | },
473 | {
474 | "cell_type": "markdown",
475 | "id": "45478166-3101-4380-9149-e9ee101b3b06",
476 | "metadata": {
477 | "slideshow": {
478 | "slide_type": "subslide"
479 | },
480 | "tags": []
481 | },
482 | "source": [
483 | "First we generate a andom number on each rank"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 11,
489 | "id": "e5187bd3-8699-4a3b-a43c-28d4a647cdc0",
490 | "metadata": {
491 | "slideshow": {
492 | "slide_type": "fragment"
493 | },
494 | "tags": []
495 | },
496 | "outputs": [],
497 | "source": [
498 | "@mpi_do manager begin\n",
499 | " using Random\n",
500 | " my_int = rand(1:100)\n",
501 | "end"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 12,
507 | "id": "a926edfd-9b22-4e33-851d-6d9e26429065",
508 | "metadata": {
509 | "slideshow": {
510 | "slide_type": "fragment"
511 | },
512 | "tags": []
513 | },
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | " From worker 2:\trank=0; my_int=38\n",
520 | " From worker 4:\trank=2; my_int=29\n",
521 | " From worker 5:\trank=3; my_int=70\n",
522 | " From worker 3:\trank=1; my_int=71\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "@mpi_do manager begin\n",
528 | " println(\"rank=$(me); my_int=$(my_int)\")\n",
529 | "end"
530 | ]
531 | },
532 | {
533 | "cell_type": "markdown",
534 | "id": "064d74de-4b8a-4962-a521-f620f8164cae",
535 | "metadata": {
536 | "slideshow": {
537 | "slide_type": "subslide"
538 | },
539 | "tags": []
540 | },
541 | "source": [
542 | "MPI uses zero-copy memory access => we need to set up buffers (arrays) to send and receive data."
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": 13,
548 | "id": "343bd286-e07b-49b6-8342-ebd85b1a2af7",
549 | "metadata": {
550 | "slideshow": {
551 | "slide_type": "fragment"
552 | },
553 | "tags": []
554 | },
555 | "outputs": [],
556 | "source": [
557 | "@mpi_do manager begin\n",
558 | " send_1 = zeros(Int64, 1)\n",
559 | " send_2 = zeros(Int64, 1)\n",
560 | " recv_1 = zeros(Int64, 1)\n",
561 | " recv_2 = zeros(Int64, 1)\n",
562 | "end"
563 | ]
564 | },
565 | {
566 | "cell_type": "markdown",
567 | "id": "5669bf32-cc11-42b3-b353-31b3231999b4",
568 | "metadata": {
569 | "slideshow": {
570 | "slide_type": "fragment"
571 | },
572 | "tags": []
573 | },
574 | "source": [
575 | "Now we fill the buffers by copying out data into it -- wherever a buffer is needed."
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": 14,
581 | "id": "48a0fa62-2cd2-4071-9046-958e0b335916",
582 | "metadata": {
583 | "slideshow": {
584 | "slide_type": "fragment"
585 | },
586 | "tags": []
587 | },
588 | "outputs": [],
589 | "source": [
590 | "@mpi_do manager begin\n",
591 | " if neighbors[1] != MPI.PROC_NULL\n",
592 | " copyto!(send_1, my_int)\n",
593 | " end\n",
594 | " if neighbors[2] != MPI.PROC_NULL\n",
595 | " copyto!(send_2, my_int)\n",
596 | " end \n",
597 | "end"
598 | ]
599 | },
600 | {
601 | "cell_type": "markdown",
602 | "id": "b79dfa66-e9c8-455f-b658-004e49ea4df2",
603 | "metadata": {
604 | "slideshow": {
605 | "slide_type": "subslide"
606 | },
607 | "tags": []
608 | },
609 | "source": [
610 | "Now we're ready to perform a data transfer with MPI. MPI is (largely) transaction based. There is a receiving end, and a sending end. In order for a send to be successful, the receiver must be ready to receive."
611 | ]
612 | },
613 | {
614 | "cell_type": "markdown",
615 | "id": "2d89f9e2-2527-4700-9eeb-600c1844eb06",
616 | "metadata": {
617 | "slideshow": {
618 | "slide_type": "fragment"
619 | },
620 | "tags": []
621 | },
622 | "source": [
623 | "To help coordinate all of this, we set up a request store:"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 15,
629 | "id": "c05abe1a-8d67-4aff-9191-d135272ca4be",
630 | "metadata": {
631 | "slideshow": {
632 | "slide_type": "fragment"
633 | },
634 | "tags": []
635 | },
636 | "outputs": [],
637 | "source": [
638 | "@mpi_do manager begin\n",
639 | " reqs = MPI.MultiRequest(4)\n",
640 | "end"
641 | ]
642 | },
643 | {
644 | "cell_type": "markdown",
645 | "id": "2256d83d-f6fe-4bed-88d0-e405f53dd664",
646 | "metadata": {
647 | "slideshow": {
648 | "slide_type": "subslide"
649 | },
650 | "tags": []
651 | },
652 | "source": [
653 | "And we transfer the data using non-blocking MPI communivation (`Isend` and `Irecv`). Pro tip: initiate receive before send"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 16,
659 | "id": "d847d757-71b4-4d62-8faa-228962bb4794",
660 | "metadata": {
661 | "slideshow": {
662 | "slide_type": "fragment"
663 | },
664 | "tags": []
665 | },
666 | "outputs": [],
667 | "source": [
668 | "@mpi_do manager begin\n",
669 | " # Initiate data reciever\n",
670 | " if neighbors[1] != MPI.PROC_NULL\n",
671 | " MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])\n",
672 | " end\n",
673 | " if neighbors[2] != MPI.PROC_NULL\n",
674 | " MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])\n",
675 | " end\n",
676 | " # Send data\n",
677 | " if neighbors[1] != MPI.PROC_NULL\n",
678 | " MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])\n",
679 | " end\n",
680 | " if neighbors[2] != MPI.PROC_NULL\n",
681 | " MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])\n",
682 | " end\n",
683 | "end"
684 | ]
685 | },
686 | {
687 | "cell_type": "markdown",
688 | "id": "4b2ef6ff-dead-4aff-981c-21407f01c9ef",
689 | "metadata": {
690 | "slideshow": {
691 | "slide_type": "fragment"
692 | },
693 | "tags": []
694 | },
695 | "source": [
696 | "Notice how we tagged data with `source` and `dest`. This makes sure that data is received in the correct order (the middle ranks receive data from _both_ sides), and -- in the case of `Isend` -- that the data is sent to the correct rank."
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "id": "54ed66db-1274-4efe-a2a0-a8b2b7986527",
702 | "metadata": {
703 | "slideshow": {
704 | "slide_type": "subslide"
705 | },
706 | "tags": []
707 | },
708 | "source": [
709 | "When using non-blocking communication, it's good to wait for all transactions to be completed before using the buffers:"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 17,
715 | "id": "113d8a31-1834-4d6d-931f-3991592e7ab5",
716 | "metadata": {
717 | "slideshow": {
718 | "slide_type": "fragment"
719 | },
720 | "tags": []
721 | },
722 | "outputs": [],
723 | "source": [
724 | "@mpi_do manager begin\n",
725 | " # Wait for all requests to finish\n",
726 | " MPI.Waitall(reqs)\n",
727 | "end"
728 | ]
729 | },
730 | {
731 | "cell_type": "markdown",
732 | "id": "26e5ed91-9afd-4764-b875-ebbf924dc077",
733 | "metadata": {
734 | "slideshow": {
735 | "slide_type": "subslide"
736 | },
737 | "tags": []
738 | },
739 | "source": [
740 | "Let's take a look at what we've transferred:"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 19,
746 | "id": "c7f159d3-e651-4795-b63b-2b49a03af961",
747 | "metadata": {
748 | "slideshow": {
749 | "slide_type": "fragment"
750 | },
751 | "tags": []
752 | },
753 | "outputs": [
754 | {
755 | "name": "stdout",
756 | "output_type": "stream",
757 | "text": [
758 | " From worker 4:\trank=2; my_int=29; prev=[71]; next=[70]\n",
759 | " From worker 2:\trank=0; my_int=38; prev=[0]; next=[71]\n",
760 | " From worker 5:\trank=3; my_int=70; prev=[29]; next=[0]\n",
761 | " From worker 3:\trank=1; my_int=71; prev=[38]; next=[29]\n"
762 | ]
763 | }
815 | ],
816 | "source": [
817 | "@mpi_do manager begin\n",
818 | " println(\n",
819 | " \"rank=$(me); \" *\n",
820 | " \"my_int=$(my_int); prev=$(recv_1); next=$(recv_2)\"\n",
821 | " )\n",
822 | "end"
823 | ]
824 | },
825 | {
826 | "cell_type": "code",
827 | "execution_count": null,
828 | "id": "88e46e3b-f8d4-48d5-b8fc-2ae660f5a4a8",
829 | "metadata": {
830 | "slideshow": {
831 | "slide_type": "skip"
832 | },
833 | "tags": []
834 | },
835 | "outputs": [],
836 | "source": []
837 | }
838 | ],
839 | "metadata": {
840 | "kernelspec": {
841 | "display_name": "Julia 1.9.4",
842 | "language": "julia",
843 | "name": "julia-1.9.4"
844 | },
845 | "language_info": {
846 | "file_extension": ".jl",
847 | "mimetype": "application/julia",
848 | "name": "julia",
849 | "version": "1.9.4"
850 | }
851 | },
852 | "nbformat": 4,
853 | "nbformat_minor": 5
854 | }
855 |
--------------------------------------------------------------------------------
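The same point-to-point exchange can also be written as a plain MPI program and launched with `mpiexecjl`, without a Jupyter-side manager. A minimal sketch follows; the file name `neighbor_exchange.jl` is illustrative, and the calls mirror the ones used in the notebook above.

```julia
# neighbor_exchange.jl -- standalone sketch of the neighbor exchange
# (run with e.g. `mpiexecjl -n 4 julia --project neighbor_exchange.jl`).
using MPI
MPI.Init()

comm = MPI.COMM_WORLD
dims = [0]
MPI.Dims_create!(MPI.Comm_size(comm), dims)

# 1D Cartesian topology: each rank has (at most) a "previous" and a "next" neighbor.
comm_cart = MPI.Cart_create(comm, dims, [0], 1)
me        = MPI.Comm_rank(comm_cart)
neighbors = MPI.Cart_shift(comm_cart, 0, 1)

# Each rank draws a random number and shares it with its neighbors.
my_int = rand(1:100)
send_1, send_2 = [my_int], [my_int]
recv_1, recv_2 = zeros(Int, 1), zeros(Int, 1)

reqs = MPI.MultiRequest(4)
# Post the receives before the sends.
(neighbors[1] != MPI.PROC_NULL) && MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])
(neighbors[2] != MPI.PROC_NULL) && MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])
(neighbors[1] != MPI.PROC_NULL) && MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])
(neighbors[2] != MPI.PROC_NULL) && MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])
MPI.Waitall(reqs)

println("rank=$(me); my_int=$(my_int); prev=$(recv_1); next=$(recv_2)")
MPI.Finalize()
```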
/parts/mpi/explanation/03_halo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "7d81f9b4-89d8-4597-a458-4bfff3c27b81",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "skip"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# Setup\n",
14 | "\n",
15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "id": "df64b70e-4682-4885-b055-056bc4e88a59",
22 | "metadata": {
23 | "slideshow": {
24 | "slide_type": "skip"
25 | },
26 | "tags": []
27 | },
28 | "outputs": [
29 | {
30 | "name": "stderr",
31 | "output_type": "stream",
32 | "text": [
33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n"
34 | ]
35 | },
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n",
41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n",
42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n",
43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n",
44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n",
45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n",
46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n",
47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "import Pkg;\n",
53 | "Pkg.activate(@__DIR__)\n",
54 | "Pkg.status()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "id": "cd1e1253-87a0-47b9-a225-33dffac6d33f",
61 | "metadata": {
62 | "slideshow": {
63 | "slide_type": "skip"
64 | },
65 | "tags": []
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/plain": [
71 | "\"nid200360-hsn0\""
72 | ]
73 | },
74 | "execution_count": 2,
75 | "metadata": {},
76 | "output_type": "execute_result"
77 | }
78 | ],
79 | "source": [
80 | "using MPI\n",
81 | "\n",
82 | "using NetworkInterfaceControllers, Sockets\n",
83 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)\n",
84 | "\n",
85 | "hsn0_public = filter(x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces) |> only \n",
86 | "public_slingshot_name = getnameinfo(hsn0_public.ip)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 3,
92 | "id": "68377016-c3df-4a1c-9c42-150d6af80de8",
93 | "metadata": {
94 | "slideshow": {
95 | "slide_type": "skip"
96 | },
97 | "tags": []
98 | },
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "4-element Vector{Int64}:\n",
104 | " 2\n",
105 | " 3\n",
106 | " 4\n",
107 | " 5"
108 | ]
109 | },
110 | "execution_count": 3,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "# to import MPIManager\n",
117 | "using MPIClusterManagers\n",
118 | "\n",
119 | "# need to also import Distributed to use addprocs()\n",
120 | "using Distributed\n",
121 | "\n",
122 | "# specify, number of mpi workers, launch cmd, etc.\n",
123 | "manager=MPIWorkerManager(4)\n",
124 | "\n",
125 | "# start mpi workers and add them as julia workers too.\n",
126 | "addprocs(\n",
127 | " manager,\n",
128 | " exeflags=`--project=$(Base.active_project())`,\n",
129 | " master_tcp_interface=public_slingshot_name\n",
130 | ")"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "id": "b243745d-ad52-4d52-873a-b9bc6575054a",
137 | "metadata": {
138 | "slideshow": {
139 | "slide_type": "skip"
140 | },
141 | "tags": []
142 | },
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | " From worker 5:\tHello world, I am 3 of 4 on nid200365\n",
149 | " From worker 2:\tHello world, I am 0 of 4 on nid200360\n",
150 | " From worker 4:\tHello world, I am 2 of 4 on nid200364\n",
151 | " From worker 3:\tHello world, I am 1 of 4 on nid200361\n"
152 | ]
153 | }
188 | ],
189 | "source": [
190 | "@mpi_do manager begin\n",
191 | " using MPI: MPI, Comm, Win, free\n",
192 | " comm = MPI.COMM_WORLD\n",
193 | " rank = MPI.Comm_rank(comm)\n",
194 | " mpi_size = MPI.Comm_size(comm) # don't use \"size\" as this overwrites the `size` function\n",
195 | " name = gethostname()\n",
196 | " println(\"Hello world, I am $(rank) of $(mpi_size) on $(name)\")\n",
197 | "end"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 6,
203 | "id": "f4197ff5-6ba6-4964-aca4-178147857b74",
204 | "metadata": {
205 | "slideshow": {
206 | "slide_type": "skip"
207 | },
208 | "tags": []
209 | },
210 | "outputs": [],
211 | "source": [
212 | "@mpi_do manager begin\n",
213 | " dims = [0]\n",
214 | " MPI.Dims_create!(mpi_size, dims)\n",
215 | "end"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 7,
221 | "id": "e56347d7-018b-4daa-8b0f-7934a3097718",
222 | "metadata": {
223 | "slideshow": {
224 | "slide_type": "skip"
225 | },
226 | "tags": []
227 | },
228 | "outputs": [],
229 | "source": [
230 | "@mpi_do manager begin\n",
231 | " comm_cart = MPI.Cart_create(\n",
232 | " comm, # MPI Communicator\n",
233 | " dims, # Dimensions of grid\n",
234 | " [0], # 0 == not periodic, 1 == periodic\n",
235 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n",
236 | " )\n",
237 | " me = MPI.Comm_rank(comm_cart)\n",
238 | " coords = MPI.Cart_coords(comm_cart)\n",
239 | " neighbors = MPI.Cart_shift(\n",
240 | " comm_cart,\n",
241 | " 0, # Which dimension to shift (zero-indexed)\n",
242 | " 1, # Shift magnitude\n",
243 | " )\n",
244 | "end"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "id": "e591dff1-e930-405a-aced-7ba54ef75164",
250 | "metadata": {
251 | "slideshow": {
252 | "slide_type": "slide"
253 | },
254 | "tags": []
255 | },
256 | "source": [
257 | "# Halo Exchange"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "5eac60ff-cd2d-4561-bf87-a732e93cdbc5",
263 | "metadata": {
264 | "slideshow": {
265 | "slide_type": "fragment"
266 | },
267 | "tags": []
268 | },
269 | "source": [
270 | "When cast into the discrete form:\n",
271 | "\n",
272 | "$$\n",
273 | "\\partial_t x = -D \\mathrm{div}(\\mathrm{grad}(x)) \\\\\n",
274 | "\\Delta_t x = -D \\frac{q_i - q_{i-1}}{\\Delta s} = \\frac{(x_{i+1} - x_i) - (x_{i} - x_{i-1})}{(\\Delta s)^2} = \\frac{x_{i+1} + 2 x_i - x_{i-1}}{(\\Delta s)^2}\n",
275 | "$$\n",
276 | "\n",
277 | "The diffusion equation has a stencil width of 2, but the necessary halo only needs 1 cell to be transferred:"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "id": "c67e1b1e-7bec-4b02-bcd8-4fecefd8170b",
283 | "metadata": {
284 | "slideshow": {
285 | "slide_type": "subslide"
286 | },
287 | "tags": []
288 | },
289 | "source": [
290 | ""
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "id": "7ebe2ba1-2ea3-498d-be6e-bd34d6a50ad9",
296 | "metadata": {
297 | "slideshow": {
298 | "slide_type": "subslide"
299 | },
300 | "tags": []
301 | },
302 | "source": [
303 | "In 2D this will look as follows:\n",
304 | "\n",
305 | ""
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "id": "d22a1ac9-cc48-4bed-87fc-a2113ebb8067",
311 | "metadata": {
312 | "slideshow": {
313 | "slide_type": "slide"
314 | },
315 | "tags": []
316 | },
317 | "source": [
318 | "## 1D Solver Example"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "id": "bc43046b-6490-429c-bd4e-a442e0c2cafd",
324 | "metadata": {
325 | "slideshow": {
326 | "slide_type": "fragment"
327 | },
328 | "tags": []
329 | },
330 | "source": [
331 | "Let's set up a basic example: 1D diffusion! First we need some parameters:"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 245,
337 | "id": "49fc3f16-27d8-4589-8f89-76d868c4f3c1",
338 | "metadata": {
339 | "slideshow": {
340 | "slide_type": "fragment"
341 | },
342 | "tags": []
343 | },
344 | "outputs": [],
345 | "source": [
346 | "@mpi_do manager begin\n",
347 | " D = 1e-4\n",
348 | " ds = 1e-4\n",
349 | " dt = ds^2 / D / 8.2 \n",
350 | " qx(ix, D, C, ds) = -D * (C[ix+1, 1] - C[ix, 1]) / ds\n",
351 | "end"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "id": "2062fc6f-d631-46a8-8908-08db59eb3c43",
357 | "metadata": {
358 | "slideshow": {
359 | "slide_type": "subslide"
360 | },
361 | "tags": []
362 | },
363 | "source": [
364 | "We can now iterate over the local array (which has a halo of 2 cells):"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 248,
370 | "id": "b1f885ad-b823-45da-a33c-8ab615425362",
371 | "metadata": {
372 | "slideshow": {
373 | "slide_type": "fragment"
374 | },
375 | "tags": []
376 | },
377 | "outputs": [],
378 | "source": [
379 | "@mpi_do manager begin\n",
380 | " function step_diffusion!(C2, C)\n",
381 | " for i in 1:size(C, 1) - 2\n",
382 | " C2[i+1] = C[i+1] - dt * (qx(i+1, D, C, ds) - qx(i, D, C, ds)) / ds\n",
383 | " end\n",
384 | " end\n",
385 | "end"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "id": "3f237d99-4457-4e0f-abbf-2dbb676ef837",
391 | "metadata": {
392 | "slideshow": {
393 | "slide_type": "subslide"
394 | },
395 | "tags": []
396 | },
397 | "source": [
398 | "We set up an initial condition where a single cell at the edge of domain 2 (rank 1) is non-zero. Recall that the halo is 2-cells wide => `C[8]` is at the very end of domain 2."
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 246,
404 | "id": "e0085955-4b07-4942-b471-7f9a130ab908",
405 | "metadata": {
406 | "slideshow": {
407 | "slide_type": "fragment"
408 | },
409 | "tags": []
410 | },
411 | "outputs": [],
412 | "source": [
413 | "@mpi_do manager begin\n",
414 | " C = zeros(10, 1)\n",
415 | " if rank == 1\n",
416 | " C[8] = 1/ds\n",
417 | " end\n",
418 | "end"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 247,
424 | "id": "9c67b1a0-3dfa-44a6-b0a7-32e2cf550db8",
425 | "metadata": {
426 | "slideshow": {
427 | "slide_type": "fragment"
428 | },
429 | "tags": []
430 | },
431 | "outputs": [
432 | {
433 | "name": "stdout",
434 | "output_type": "stream",
435 | "text": [
436 | " From worker 2:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
437 | " From worker 4:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
438 | " From worker 5:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
439 | " From worker 3:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 10000.0; 0.0; 0.0;;]\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "@mpi_do manager begin\n",
445 | " println(C)\n",
446 | "end"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 249,
452 | "id": "b09f3b69-a733-467a-84ed-a91b577c89ba",
453 | "metadata": {
454 | "slideshow": {
455 | "slide_type": "fragment"
456 | },
457 | "tags": []
458 | },
459 | "outputs": [],
460 | "source": [
461 | "@mpi_do manager begin\n",
462 | " C2 = similar(C)\n",
463 | " fill!(C2, 0.)\n",
464 | "end"
465 | ]
466 | },
467 | {
468 | "cell_type": "markdown",
469 | "id": "f8307d70-8e35-4285-bca5-63ed716a417a",
470 | "metadata": {
471 | "slideshow": {
472 | "slide_type": "slide"
473 | },
474 | "tags": []
475 | },
476 | "source": [
477 | "## Halo Exchanges in 1D"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "id": "1e075e1e-0575-4a32-b5ad-da0fa724279b",
483 | "metadata": {
484 | "slideshow": {
485 | "slide_type": "subslide"
486 | },
487 | "tags": []
488 | },
489 | "source": [
490 | "In the previous example we exchanged `Int64`, now we're going to tranfer `Float64`"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 8,
496 | "id": "d36f644b-65fb-44d0-998c-09c74e805235",
497 | "metadata": {
498 | "slideshow": {
499 | "slide_type": "fragment"
500 | },
501 | "tags": []
502 | },
503 | "outputs": [],
504 | "source": [
505 | "@mpi_do manager begin\n",
506 | " send_1 = zeros(Float64, 1)\n",
507 | " send_2 = zeros(Float64, 1)\n",
508 | " recv_1 = zeros(Float64, 1)\n",
509 | " recv_2 = zeros(Float64, 1)\n",
510 | "end"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "id": "8f3b2de0-563f-4bb0-acb7-99d7be2a2c66",
516 | "metadata": {
517 | "slideshow": {
518 | "slide_type": "subslide"
519 | },
520 | "tags": []
521 | },
522 | "source": [
523 | "We set up a halo-exchange function using the previous section's point-to-point communication pattern"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 257,
529 | "id": "a6891fa9-6d92-4c65-b791-2aa4246e1e2e",
530 | "metadata": {
531 | "slideshow": {
532 | "slide_type": "fragment"
533 | },
534 | "tags": []
535 | },
536 | "outputs": [],
537 | "source": [
538 | "@mpi_do manager begin\n",
539 | " function halo_exchange!(A)\n",
540 | " # Copy to buffers\n",
541 | " (neighbors[1] != MPI.PROC_NULL) && copyto!(send_1, A[2:2, 1])\n",
542 | " (neighbors[2] != MPI.PROC_NULL) && copyto!(send_2, A[(end-1):(end-1), 1]) \n",
543 | " # Request handler\n",
544 | " reqs = MPI.MultiRequest(4)\n",
545 | " # Initiate data reciever\n",
546 | " (neighbors[1] != MPI.PROC_NULL) && MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])\n",
547 | " (neighbors[2] != MPI.PROC_NULL) && MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])\n",
548 | " # Send data\n",
549 | " (neighbors[1] != MPI.PROC_NULL) && MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])\n",
550 | " (neighbors[2] != MPI.PROC_NULL) && MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])\n",
551 | " # Block until all transactions are done before touching buffers\n",
552 | " MPI.Waitall(reqs) \n",
553 | " # Copy from buffers (copyto! needs a pointer to the cell)\n",
554 | " r1 = @view A[1:1, 1] \n",
555 | " r2 = @view A[end:end, 1]\n",
556 | " (neighbors[1] != MPI.PROC_NULL) && copyto!(r1, recv_1)\n",
557 | " (neighbors[2] != MPI.PROC_NULL) && copyto!(r2, recv_2)\n",
558 | " end\n",
559 | "end"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "id": "1944ce97-2586-42b5-b954-b5b4a587766c",
565 | "metadata": {
566 | "slideshow": {
567 | "slide_type": "subslide"
568 | },
569 | "tags": []
570 | },
571 | "source": [
572 | "Let's run 1 step of the diffusion algorithm to see how the halo exchane works:"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 250,
578 | "id": "871692b2-3589-4e13-9889-bad943325e23",
579 | "metadata": {
580 | "slideshow": {
581 | "slide_type": "fragment"
582 | },
583 | "tags": []
584 | },
585 | "outputs": [],
586 | "source": [
587 | "@mpi_do manager begin\n",
588 | " step_diffusion!(C2, C)\n",
589 | " halo_exchange!(C2)\n",
590 | " C, C2 = C2, C\n",
591 | "end"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 252,
597 | "id": "7ccd49d0-3eee-46ea-a888-8094047e3bd8",
598 | "metadata": {
599 | "slideshow": {
600 | "slide_type": "fragment"
601 | },
602 | "tags": []
603 | },
604 | "outputs": [
605 | {
606 | "name": "stdout",
607 | "output_type": "stream",
608 | "text": [
609 | " From worker 5:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
610 | " From worker 4:\t[1219.5121951219512; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
611 | " From worker 2:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n",
612 | " From worker 3:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 1219.5121951219512; 7560.975609756098; 1219.5121951219512; 0.0;;]\n"
613 | ]
614 | }
615 | ],
616 | "source": [
617 | "@mpi_do manager begin\n",
618 | " println(C)\n",
619 | "end"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "id": "2dc8e0fd-66e7-43aa-9035-95e84915971b",
626 | "metadata": {
627 | "slideshow": {
628 | "slide_type": "skip"
629 | },
630 | "tags": []
631 | },
632 | "outputs": [],
633 | "source": []
634 | }
635 | ],
636 | "metadata": {
637 | "kernelspec": {
638 | "display_name": "Julia 1.9.4",
639 | "language": "julia",
640 | "name": "julia-1.9.4"
641 | },
642 | "language_info": {
643 | "file_extension": ".jl",
644 | "mimetype": "application/julia",
645 | "name": "julia",
646 | "version": "1.9.4"
647 | }
648 | },
649 | "nbformat": 4,
650 | "nbformat_minor": 5
651 | }
652 |
--------------------------------------------------------------------------------
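Because the MPI-specific part of the notebook above is only the halo exchange, the underlying numerics are easy to check serially. Below is a minimal single-process sketch of the same 1D diffusion step; the array size and the position of the non-zero cell mirror the notebook, but are otherwise arbitrary.

```julia
# Serial sanity check of the 1D diffusion step (no MPI, no halo exchange needed).
D  = 1e-4
ds = 1e-4
dt = ds^2 / D / 8.2

qx(ix, D, C, ds) = -D * (C[ix+1] - C[ix]) / ds   # diffusive flux between cells ix and ix+1

function step_diffusion!(C2, C)
    # update interior cells only; the first and last cell act as boundaries
    for i in 1:length(C)-2
        C2[i+1] = C[i+1] - dt * (qx(i+1, D, C, ds) - qx(i, D, C, ds)) / ds
    end
    return C2
end

C  = zeros(10); C[8] = 1/ds   # single non-zero cell, as in the notebook
C2 = similar(C); fill!(C2, 0.0)

step_diffusion!(C2, C)
C, C2 = C2, C                 # swap buffers
println(C)
```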
/parts/mpi/explanation/Project.toml:
--------------------------------------------------------------------------------
1 | [deps]
2 | AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
3 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4 | CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
5 | Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
6 | MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
7 | MPIClusterManagers = "e7922434-ae4b-11e9-05c5-9780451d2c66"
8 | NetworkInterfaceControllers = "6f74fd91-2978-43ad-8164-3af8c0ec0142"
9 |
--------------------------------------------------------------------------------
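The notebooks in this directory all activate this environment. A sketch of the one-time setup, run from the `parts/mpi/explanation` directory so that `@__DIR__` resolves to the folder containing this `Project.toml`:

```julia
import Pkg
Pkg.activate(@__DIR__)
Pkg.instantiate()   # resolve and install the dependencies listed above (only needed once)
Pkg.status()
```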
/parts/mpi/explanation/advanced/00_gpu_select.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "7fc2f000-ba64-483f-99d7-37b7f24969d1",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [
11 | {
12 | "name": "stderr",
13 | "output_type": "stream",
14 | "text": [
15 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n"
16 | ]
17 | },
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n",
23 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n",
24 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n",
25 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n",
26 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n",
27 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n",
28 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n",
29 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "import Pkg;\n",
35 | "Pkg.activate(@__DIR__)\n",
36 | "Pkg.status()"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "2fffd2fb-ff8c-45a3-963a-06e40f4511f7",
43 | "metadata": {
44 | "tags": []
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "cpucycle_coreid (generic function with 1 method)"
51 | ]
52 | },
53 | "execution_count": 2,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "using CpuId\n",
60 | "\n",
61 | "const cpucycle_mask = (\n",
62 | " (1 << (64 - leading_zeros(CpuId.cputhreads()))) - 1\n",
63 | ") % UInt32\n",
64 | "\n",
65 | "cpucycle_coreid() = Int(cpucycle_id()[2] & cpucycle_mask)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "id": "a38f335a-0c2c-43c3-bd9a-45656331d464",
72 | "metadata": {
73 | "tags": []
74 | },
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "13"
80 | ]
81 | },
82 | "execution_count": 3,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "cpucycle_coreid()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "id": "0ab0999d-bcac-4a10-885a-c689eda97924",
95 | "metadata": {
96 | "tags": []
97 | },
98 | "outputs": [],
99 | "source": [
100 | "using MPI, CUDA"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 5,
106 | "id": "7bc9f6ae-0ba6-4206-84d6-ce4dc6576f24",
107 | "metadata": {
108 | "tags": []
109 | },
110 | "outputs": [
111 | {
112 | "name": "stdout",
113 | "output_type": "stream",
114 | "text": [
115 | "MPIPreferences:\n",
116 | " binary: system\n",
117 | " abi: MPICH\n",
118 | " libmpi: libmpi_gnu_123.so\n",
119 | " mpiexec: srun\n",
120 | "\n",
121 | "Package versions\n",
122 | " MPI.jl: 0.20.20\n",
123 | " MPIPreferences.jl: 0.1.11\n",
124 | "\n",
125 | "Library information:\n",
126 | " libmpi: libmpi_gnu_123.so\n",
127 | " libmpi dlpath: /opt/cray/pe/lib64/libmpi_gnu_123.so\n",
128 | " MPI version: 3.1.0\n",
129 | " Library version: \n",
130 | " MPI VERSION : CRAY MPICH version 8.1.28.29 (ANL base 3.4a2)\n",
131 | " MPI BUILD INFO : Wed Nov 15 20:57 2023 (git hash 1cde46f)\n",
132 | " \n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "MPI.versioninfo()"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 6,
143 | "id": "27fb385f-7c83-421f-b77d-a59289004f8e",
144 | "metadata": {
145 | "tags": []
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "CUDA runtime 12.2, local installation\n",
153 | "CUDA driver 12.2\n",
154 | "NVIDIA driver 525.105.17\n",
155 | "\n",
156 | "CUDA libraries: \n",
157 | "- CUBLAS: 12.2.1\n",
158 | "- CURAND: 10.3.3\n",
159 | "- CUFFT: 11.0.8\n",
160 | "- CUSOLVER: 11.5.0\n",
161 | "- CUSPARSE: 12.1.1\n",
162 | "- CUPTI: 20.0.0\n",
163 | "- NVML: 12.0.0+525.105.17\n",
164 | "\n",
165 | "Julia packages: \n",
166 | "- CUDA: 5.4.2\n",
167 | "- CUDA_Driver_jll: 0.9.1+1\n",
168 | "- CUDA_Runtime_jll: 0.14.1+0\n",
169 | "- CUDA_Runtime_Discovery: 0.3.4\n",
170 | "\n",
171 | "Toolchain:\n",
172 | "- Julia: 1.9.4\n",
173 | "- LLVM: 14.0.6\n",
174 | "\n",
175 | "Preferences:\n",
176 | "- CUDA_Runtime_jll.version: 12.2\n",
177 | "- CUDA_Runtime_jll.local: true\n",
178 | "\n",
179 | "4 devices:\n",
180 | " 0: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n",
181 | " 1: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n",
182 | " 2: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n",
183 | " 3: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "CUDA.versioninfo()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 7,
194 | "id": "57b8e1ad-c17a-4af5-abda-a6bddb59c15f",
195 | "metadata": {
196 | "slideshow": {
197 | "slide_type": "skip"
198 | },
199 | "tags": []
200 | },
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/plain": [
205 | "filter (generic function with 26 methods)"
206 | ]
207 | },
208 | "execution_count": 7,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "import Base: filter, Fix1\n",
215 | "filter(f::Function)::Function = Fix1(filter, f)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 8,
221 | "id": "f51cb426-9357-4ed3-9ca9-319e81bc4f69",
222 | "metadata": {
223 | "tags": []
224 | },
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "get_device_attributes (generic function with 1 method)"
230 | ]
231 | },
232 | "execution_count": 8,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "function get_device_attributes()\n",
239 | " attr = Dict{Tuple{Int32, Int32}, Int32}()\n",
240 | " for i in 0:(ndevices()-1)\n",
241 | " d = CuDevice(i)\n",
242 | " attr[(\n",
243 | " attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID),\n",
244 | " attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)\n",
245 | " )] = d\n",
246 | " end\n",
247 | " attr\n",
248 | "end"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 9,
254 | "id": "bc7e78ce-3ed7-4663-981e-99da96e9f5c7",
255 | "metadata": {
256 | "slideshow": {
257 | "slide_type": "skip"
258 | },
259 | "tags": []
260 | },
261 | "outputs": [],
262 | "source": [
263 | "using Hwloc, AbstractTrees\n",
264 | "\n",
265 | "\n",
266 | "import AbstractTrees: PreOrderDFS\n",
267 | "import Hwloc: hwloc_pci_class_string"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 34,
273 | "id": "cc2c286d-9822-4ad6-8d55-efd4dcc442b0",
274 | "metadata": {
275 | "tags": []
276 | },
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "distance_to_core (generic function with 1 method)"
282 | ]
283 | },
284 | "execution_count": 34,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "function tag_subtree!(tree_node, val)\n",
291 | " for n in collect(AbstractTrees.PreOrderDFS(tree_node))\n",
292 | " n.tag = val\n",
293 | " end\n",
294 | "end\n",
295 | "\n",
296 | "function distance_to_core!(node, target_index)\n",
297 | " # shield re-entrance when iterating\n",
298 | " node.tag = 1\n",
299 | "\n",
300 | " if node.type == :PU\n",
301 | " # println(\"Checking: $(nodevalue(node).os_index)\")\n",
302 | " if nodevalue(node).os_index == target_index\n",
303 | " return true, 0\n",
304 | " end\n",
305 | " end\n",
306 | "\n",
307 | " for child in node.children\n",
308 | " if child.tag == 1\n",
309 | " continue\n",
310 | " end\n",
311 | "\n",
312 | " found, dist = distance_to_core!(child, target_index)\n",
313 | " if found\n",
314 | " return true, dist + 1\n",
315 | " end\n",
316 | " end\n",
317 | "\n",
318 | " if node.parent != nothing\n",
319 | " found, dist = distance_to_core!(node.parent, target_index)\n",
320 | " if found\n",
321 | " return true, dist + 1\n",
322 | " end\n",
323 | " end\n",
324 | "\n",
325 | " return false, typemax(Int)\n",
326 | "end\n",
327 | "\n",
328 | "function distance_to_core(root, node, target_index)\n",
329 | " tag_subtree!(root, 0) \n",
330 | " found, dist = distance_to_core!(node, target_index)\n",
331 | " tag_subtree!(root, 0) \n",
332 | " return found, dist\n",
333 | "end"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 35,
339 | "id": "ddbadb39-1998-4472-b940-09648284ad8c",
340 | "metadata": {
341 | "tags": []
342 | },
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "Dict{Tuple{Int32, Int32}, Int32} with 4 entries:\n",
348 | " (65, 0) => 1\n",
349 | " (193, 0) => 3\n",
350 | " (130, 0) => 2\n",
351 | " (3, 0) => 0"
352 | ]
353 | },
354 | "execution_count": 35,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "get_device_attributes()"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 36,
366 | "id": "00f54295-a079-474c-8028-57fcea1fa288",
367 | "metadata": {
368 | "slideshow": {
369 | "slide_type": "skip"
370 | },
371 | "tags": []
372 | },
373 | "outputs": [
374 | {
375 | "data": {
376 | "text/plain": [
377 | "get_device_distances (generic function with 1 method)"
378 | ]
379 | },
380 | "execution_count": 36,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "sys_devs = children(gettopology())\n",
387 | "pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device)\n",
388 | "gpu_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == \"3D\")\n",
389 | "\n",
390 | "function get_device_distances(core)\n",
391 | " attr = get_device_attributes()\n",
392 | " dist = Dict{Int32, Int32}()\n",
393 | " dev = Dict{Int32, Int32}()\n",
394 | " for d in gpu_devs\n",
395 | " idx = attr[(nodevalue(d).attr.bus, nodevalue(d).attr.dev)]\n",
396 | " found, dev_d = distance_to_core(sys_devs, d, core)\n",
397 | " if found\n",
398 | " dist[idx] = dev_d\n",
399 | " dev[dev_d] = idx\n",
400 | " end\n",
401 | " end\n",
402 | " dist, dev\n",
403 | "end"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 37,
409 | "id": "d35ac271-b857-46f3-9744-a2512642c009",
410 | "metadata": {
411 | "tags": []
412 | },
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "49"
418 | ]
419 | },
420 | "execution_count": 37,
421 | "metadata": {},
422 | "output_type": "execute_result"
423 | }
424 | ],
425 | "source": [
426 | "cpucycle_coreid()"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 48,
432 | "id": "c867b0ed-85df-47c1-b85a-c99312c66430",
433 | "metadata": {
434 | "tags": []
435 | },
436 | "outputs": [
437 | {
438 | "data": {
439 | "text/plain": [
440 | "0"
441 | ]
442 | },
443 | "execution_count": 48,
444 | "metadata": {},
445 | "output_type": "execute_result"
446 | }
447 | ],
448 | "source": [
449 | "dist, dev = get_device_distances(cpucycle_coreid())\n",
450 | "closest_dev = dev[dev |> keys |> minimum]"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": 45,
456 | "id": "e22f3df1-afef-45da-baef-8554c5f69189",
457 | "metadata": {
458 | "tags": []
459 | },
460 | "outputs": [
461 | {
462 | "data": {
463 | "text/plain": [
464 | "Dict{Int32, Int32} with 4 entries:\n",
465 | " 0 => 18\n",
466 | " 2 => 516\n",
467 | " 3 => 516\n",
468 | " 1 => 516"
469 | ]
470 | },
471 | "execution_count": 45,
472 | "metadata": {},
473 | "output_type": "execute_result"
474 | }
475 | ],
476 | "source": [
477 | "dist"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 46,
483 | "id": "24afd17a-2308-4089-933b-5d138f555482",
484 | "metadata": {
485 | "tags": []
486 | },
487 | "outputs": [
488 | {
489 | "data": {
490 | "text/plain": [
491 | "Dict{Int32, Int32} with 2 entries:\n",
492 | " 18 => 0\n",
493 | " 516 => 1"
494 | ]
495 | },
496 | "execution_count": 46,
497 | "metadata": {},
498 | "output_type": "execute_result"
499 | }
500 | ],
501 | "source": [
502 | "dev"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 47,
508 | "id": "8c844ec9-bb7b-4142-899c-acac3035d841",
509 | "metadata": {
510 | "tags": []
511 | },
512 | "outputs": [
513 | {
514 | "data": {
515 | "text/plain": [
516 | "0"
517 | ]
518 | },
519 | "execution_count": 47,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": []
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": null,
529 | "id": "a17f0a7a-5ced-4fa7-8c65-ee4da39d54af",
530 | "metadata": {},
531 | "outputs": [],
532 | "source": []
533 | }
534 | ],
535 | "metadata": {
536 | "kernelspec": {
537 | "display_name": "Julia 1.9.4",
538 | "language": "julia",
539 | "name": "julia-1.9.4"
540 | },
541 | "language_info": {
542 | "file_extension": ".jl",
543 | "mimetype": "application/julia",
544 | "name": "julia",
545 | "version": "1.9.4"
546 | }
547 | },
548 | "nbformat": 4,
549 | "nbformat_minor": 5
550 | }
551 |
--------------------------------------------------------------------------------
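A sketch of how the result above would typically be used: bind the current process (for example, one MPI rank per GPU) to the device closest to the core it is running on. `cpucycle_coreid` and `get_device_distances` are the helpers defined in the notebook above; `CUDA.device!` is CUDA.jl's standard way to select a device.

```julia
using CUDA

dist, dev   = get_device_distances(cpucycle_coreid())
closest_dev = dev[minimum(keys(dev))]   # smallest topological distance wins

CUDA.device!(closest_dev)   # subsequent allocations/kernels on this process use that GPU
@show CUDA.device()
```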
/parts/mpi/explanation/diffusion_2d_halo_exchange.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/diffusion_2d_halo_exchange.pdf
--------------------------------------------------------------------------------
/parts/mpi/explanation/diffusion_2d_halo_exchange.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/diffusion_2d_halo_exchange.png
--------------------------------------------------------------------------------
/parts/mpi/explanation/l8_1D_global_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/l8_1D_global_grid.png
--------------------------------------------------------------------------------
/parts/mpi/get_compute_node_interactive.sh:
--------------------------------------------------------------------------------
1 | salloc --nodes 1 --cpus-per-task=1 --qos interactive --time 00:45:00 --constraint cpu --ntasks-per-node=4 --account=ntrain1
2 |
--------------------------------------------------------------------------------
/parts/mpi/job_mpi_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C cpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_mpi_multinode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=4
9 | #SBATCH --ntasks=16
10 | #SBATCH --exclusive
11 |
12 | ml use /global/common/software/nersc/n9/julia/modules
13 | ml julia
14 |
15 | mpiexecjl --project=../.. julia -e 'do_save=false; include("diffusion_2d_mpi.jl");'
16 |
--------------------------------------------------------------------------------
/parts/mpi/job_mpi_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C cpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_mpi_singlenode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=1
9 | #SBATCH --ntasks=4
10 | #SBATCH --exclusive
11 |
12 | ml use /global/common/software/nersc/n9/julia/modules
13 | ml julia
14 |
15 | mpiexecjl --project=../.. julia diffusion_2d_mpi.jl
16 |
--------------------------------------------------------------------------------
/parts/mpi/solution/diffusion_2d_mpi.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - MPI
2 | using Printf
3 | using JLD2
4 | using MPI
5 | include(joinpath(@__DIR__, "../../shared.jl"))
6 |
7 | # convenience macros to avoid writing out the nested finite-difference expressions
8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / dx)) end
9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / dy)) end
10 |
11 | function diffusion_step!(params, C2, C)
12 | (; dx, dy, dt, D) = params
13 | for iy in 1:size(C, 2)-2
14 | for ix in 1:size(C, 1)-2
15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / dx +
16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / dy)
17 | end
18 | end
19 | return nothing
20 | end
21 |
22 | # MPI functions
23 | @views function update_halo!(A, bufs, neighbors, comm)
24 | # dim-1 (x)
25 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :])
26 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :])
27 |
28 | reqs = MPI.MultiRequest(4)
29 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_1, comm, reqs[1]; source=neighbors.x[1])
30 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_2, comm, reqs[2]; source=neighbors.x[2])
31 |
32 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_1, comm, reqs[3]; dest=neighbors.x[1])
33 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_2, comm, reqs[4]; dest=neighbors.x[2])
34 | MPI.Waitall(reqs) # blocking
35 |
36 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1)
37 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2)
38 |
39 | # dim-2 (y)
40 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ])
41 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1])
42 |
43 | reqs = MPI.MultiRequest(4)
44 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_1, comm, reqs[1]; source=neighbors.y[1])
45 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_2, comm, reqs[2]; source=neighbors.y[2])
46 |
47 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_1, comm, reqs[3]; dest=neighbors.y[1])
48 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_2, comm, reqs[4]; dest=neighbors.y[2])
49 | MPI.Waitall(reqs) # blocking
50 |
51 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1)
52 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2)
53 | return nothing
54 | end
55 |
56 | function init_bufs(A)
57 | return (; send_1_1=zeros(size(A, 2)), send_1_2=zeros(size(A, 2)),
58 | send_2_1=zeros(size(A, 1)), send_2_2=zeros(size(A, 1)),
59 | recv_1_1=zeros(size(A, 2)), recv_1_2=zeros(size(A, 2)),
60 | recv_2_1=zeros(size(A, 1)), recv_2_2=zeros(size(A, 1)))
61 | end
62 |
63 | function run_diffusion(; ns=64, nt=100, do_save=false)
64 | MPI.Init()
65 | comm = MPI.COMM_WORLD
66 | nprocs = MPI.Comm_size(comm)
67 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple
68 | comm_cart = MPI.Cart_create(comm, dims)
69 | me = MPI.Comm_rank(comm_cart)
70 | coords = MPI.Cart_coords(comm_cart) |> Tuple
71 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1))
72 | (me == 0) && println("nprocs = $(nprocs), dims = $dims")
73 |
74 | params = init_params_mpi(; dims, coords, ns, nt, do_save)
75 | C, C2 = init_arrays_mpi(params)
76 | bufs = init_bufs(C)
77 | t_tic = 0.0
78 | # time loop
79 | for it in 1:nt
80 | # time after warmup (ignore first 10 iterations)
81 | (it == 11) && (t_tic = Base.time())
82 | # diffusion
83 | diffusion_step!(params, C2, C)
84 | update_halo!(C2, bufs, neighbors, comm_cart)
85 | C, C2 = C2, C # pointer swap
86 | end
87 | t_toc = (Base.time() - t_tic)
88 | # "master" prints performance
89 | (me == 0) && print_perf(params, t_toc)
90 | # save to (maybe) visualize later
91 | if do_save
92 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L))
93 | end
94 | MPI.Finalize()
95 | return nothing
96 | end
97 |
98 | # Running things...
99 |
100 | # enable save to disk by default
101 | (!@isdefined do_save) && (do_save = true)
102 | # enable execution by default
103 | (!@isdefined do_run) && (do_run = true)
104 |
105 | if do_run
106 | if !isempty(ARGS)
107 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save)
108 | else
109 | run_diffusion(; ns=256, do_save)
110 | end
111 | end
112 |
--------------------------------------------------------------------------------
/parts/mpi/solution/job_mpi_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C cpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_mpi_multinode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=4
9 | #SBATCH --ntasks=16
10 | #SBATCH --exclusive
11 |
12 | ml use /global/common/software/nersc/n9/julia/modules
13 | ml julia
14 |
15 | mpiexecjl --project=../../.. julia -e 'do_save=false; include("diffusion_2d_mpi.jl");'
16 |
--------------------------------------------------------------------------------
/parts/mpi/solution/job_mpi_singlenode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH -A ntrain1
4 | #SBATCH -C cpu
5 | #SBATCH -q regular
6 | #SBATCH --output=slurm_mpi_singlenode.out
7 | #SBATCH --time=00:05:00
8 | #SBATCH --nodes=1
9 | #SBATCH --ntasks=4
10 | #SBATCH --exclusive
11 |
12 | ml use /global/common/software/nersc/n9/julia/modules
13 | ml julia
14 |
15 | mpiexecjl --project=../../.. julia diffusion_2d_mpi.jl
16 |
--------------------------------------------------------------------------------
/parts/mpi/solution/multinode_results.txt:
--------------------------------------------------------------------------------
1 | # 1 node, 4 MPI ranks
2 | nprocs = 4, dims = [2, 2]
3 | Time = 6.5865e+00 s, T_eff = 8.25 GB/s
4 |
5 | # 2 nodes, 8 MPI ranks
6 | nprocs = 8, dims = [4, 2]
7 | Time = 6.5964e+00 s, T_eff = 8.24 GB/s
8 |
9 | # 3 nodes, 12 MPI ranks
10 | nprocs = 12, dims = [4, 3]
11 | Time = 6.5889e+00 s, T_eff = 8.25 GB/s
12 |
13 | # 4 nodes, 16 MPI ranks
14 | nprocs = 16, dims = [4, 4]
15 | Time = 6.6004e+00 s, T_eff = 8.24 GB/s
--------------------------------------------------------------------------------
/parts/mpi/solution/slurm_mpi_singlenode.out:
--------------------------------------------------------------------------------
1 | nprocs = 4, dims = [2, 2]
2 | Time = 1.2309e-02 s, T_eff = 7.67 GB/s
3 |
--------------------------------------------------------------------------------
/parts/mpi/solution/visualization_before.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/solution/visualization_before.png
--------------------------------------------------------------------------------
/parts/mpi/solution/visualization_desired.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/solution/visualization_desired.png
--------------------------------------------------------------------------------
/parts/mpi/visualize_mpi.jl:
--------------------------------------------------------------------------------
1 | # Visualisation script for the 2D MPI solver
2 | using CairoMakie
3 | using JLD2
4 |
5 | function vizme2D_mpi(nprocs)
6 | C = []
7 | lx = ly = 0.0
8 | ip = 1
9 | for ipx in 1:nprocs[1]
10 | for ipy in 1:nprocs[2]
11 | C_loc, lxy = load("out_$(ip-1).jld2", "C", "lxy")
12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2)
13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i
14 | if ip == 1
15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i)
16 | lx, ly = lxy
17 | end
18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc
19 | ip += 1
20 | end
21 | end
22 | xc, yc = LinRange.(0, (lx, ly), size(C))
23 | fig = Figure(; size=(500, 400), fontsize=14)
24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C")
25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1))
26 | cb = Colorbar(fig[1, 1][1, 2], hm)
27 | if isinteractive()
28 | display(fig)
29 | else
30 | save("visualization.png", fig)
31 | end
32 | return
33 | end
34 |
35 | nprocs = (2, 2) # nprocs (x, y) dim
36 | vizme2D_mpi(nprocs)
37 |
--------------------------------------------------------------------------------
/parts/multithreading/README.md:
--------------------------------------------------------------------------------
1 | # Diffusion 2D - Multithreading
2 |
3 | In this part, we want to use multithreading (shared-memory parallelism) to parallelize our Diffusion 2D example.
4 |
5 | The starting point is the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). The file [`diffusion_2d_threads.jl`](./diffusion_2d_threads.jl) in this folder is a slightly modified copy of this version. Specifically, we included the serial initialization of the arrays `C` and `C2` in the form of the function `init_arrays_threads` and left the computational kernel (`diffusion_step!`) mostly unimplemented. Note that there are a few code stubs (indicated by `TODO` comments) that you will implement in the tasks below.
6 |
7 | ## Task 1 - Multithreading `diffusion_step!`
8 |
9 | ### Part A
10 |
11 | Your first task is to take the diffusion kernel from `diffusion_2d_loop.jl` - reproduced below for your convenience - and use `@threads` to parallelize it. See the `TODO` comments inside the `diffusion_step!` function.
12 |
13 | You should implement two variants, one that uses static scheduling and another that uses dynamic scheduling. A variable `static` will be used to switch between the two cases (a generic sketch of both scheduling flavors is shown after the serial kernel below).
14 |
15 | (To test the correctness of your implementation, you can do an "eye test" and just look at the resulting plots.)
16 |
17 | **Question:**
18 | * Should you parallelize the inner or the outer loop?
19 | * (You can try both and compare the two in terms of performance if you are unsure.)
20 |
21 | **Serial kernel from diffusion_2d_loop.jl:**
22 | ```julia
23 | for iy in 1:size(C, 2)-2
24 | for ix in 1:size(C, 1)-2
25 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds +
26 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds)
27 | end
28 | end
29 | ```
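If you are unsure about the `@threads` syntax and the role of the `static` switch, here is a minimal, generic sketch. It uses a toy loop and a made-up function name (`fill_squares!`), deliberately *not* the diffusion kernel:

```julia
# Toy sketch of the two scheduling flavors; the boolean `static` switches
# between them, just like in diffusion_2d_threads.jl.
using Base.Threads: @threads

function fill_squares!(a; static=true)
    if static
        # static scheduling: fixed task -> thread mapping, no task migration
        @threads :static for i in eachindex(a)
            a[i] = i^2
        end
    else
        # dynamic scheduling (the default): tasks may migrate between threads
        @threads :dynamic for i in eachindex(a)
            a[i] = i^2
        end
    end
    return a
end

fill_squares!(zeros(Int, 16); static=false)
```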
30 |
31 | ### Part B
32 |
33 | Let's make a first rough performance comparison. Run your implementation using 8 Julia threads and using 1 Julia thread and compare the timings/`T_eff` ("strong scaling"). Perform this comparison for three values of `ns`, for example 512, 2048, and 6144.
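(In case you are wondering what `T_eff` is: the scripts report an effective memory throughput, computed in `print_perf` (see [`shared.jl`](./../shared.jl)) as $T_{eff} = 2 \cdot ns^2 \cdot 8\,\mathrm{B} \, / \, t_{it}$, i.e. two memory accesses (one read and one write) per grid point and time step, divided by the time per time step $t_{it}$, with the first 10 warm-up iterations excluded.)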
34 |
35 | Note that you don't have to implement the other `TODO`s in the file. The code should run just fine if you've implemented `diffusion_step!`.
36 |
37 | **How to run the code?**
38 |
39 | You can either perform the rough benchmark in an interactive Julia session or use the script `job_compare_threads_serial.sh`.
40 |
41 | * Interactive:
42 | * Set `do_visualize=false`.
43 | * Use `include("diffusion_2d_threads.jl")` to run the code (see the minimal session sketched after this list).
44 |
45 | * Script:
46 | * Either just run the script on the current node (`sh job_compare_threads_serial.sh`) or submit it as a job to SLURM (`sbatch job_compare_threads_serial.sh`). In the latter case, the output will end up in a file called `slurm_compare_threads_serial.out`.
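For the interactive route, a minimal session could look like the following sketch (assuming you've started Julia on a compute node with, e.g., `julia --project --threads 8`; the keyword values passed to `run_diffusion` are just examples):

```julia
# running the file once defines run_diffusion (and runs it with the default ns=256)
do_visualize = false
include("diffusion_2d_threads.jl")

# afterwards, call run_diffusion directly for other problem sizes
run_diffusion(; ns=2048, nt=100, do_visualize=false)
```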
47 |
48 | **Questions:**
49 | * What do you observe?
50 | * Are you happy with the performance improvement?
51 | * Consider taking the ratio of the timings (i.e. $t_{serial}$ / $t_{parallel}$) and comparing it to 8 (naively anticipating perfect scaling).
52 |
53 | ## Task 2 - Parallel initialization and thread pinning
54 |
55 | As mentioned before the hands-on, how we pin the Julia threads and whether we initialize the data (`C`, `C2`) serially or in parallel can heavily influence the performance of our code. Let's put this to the test!
56 |
57 | ### Part A
58 |
59 | Go ahead and parallelize the initialization of `C` and `C2` in the function `init_arrays_threads` (see the `TODO`s therein) in the same way as you've parallelized the kernel in `diffusion_step!` above.
60 |
61 | The variable `parallel_init` (`true` or `false`) is used to switch between parallel and serial initialization. Similarly, the variable `static` (`true` or `false`) is used to switch between static and dynamic scheduling.
62 |
63 | (To test the correctness of your implementation, you can do an "eye test" and just look at the resulting plots.)
64 |
65 | ### Part B
66 |
67 | Now, we want to systematically compare the performance of our code for
68 | * different combinations of `parallel_init` and `static`,
69 | * different values of `ns` (512, 2048, and 6144), and
70 | * different pinning schemes (`:cores`, `:sockets`, `:numa`)
71 |
72 | While you are more than welcome to play around with these degrees of freedom in an interactive Julia session running on a **compute node**, this will likely become rather cumbersome very quickly.
73 | (We still encourage you to experiment a bit with ThreadPinning's `threadinfo` and `pinthreads`; a small sketch follows below!)
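For reference, an interactive ThreadPinning session on a compute node might look like this minimal sketch:

```julia
using ThreadPinning

threadinfo()                 # where are the Julia threads currently running?
pinthreads(:cores)           # pin to CPU cores one after another
pinthreads(:sockets)         # or: alternate between sockets (round-robin)
pinthreads(:numa)            # or: alternate between NUMA domains (round-robin)
threadinfo(; groupby=:numa)  # show the placement grouped by NUMA domain
```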
74 |
75 | To simplify things, we've prepared the script `job_bench_threads.sh` for you, which you can simply submit to SLURM (`sbatch job_bench_threads.sh`). The output will end up in the file `slurm_bench_threads.out`.
76 |
77 | **Questions:**
78 | * First, compare the `static=true` results with increasing `ns` (that is, ignore the dynamic scheduling runs for now). Can you qualitatively explain the performance difference/similarity between the three pinning strategies? And maybe also why it changes with increasing `ns`?
79 | * Why does dynamic scheduling (most of the time) give worse performance than static scheduling?
80 | * The output also shows single-threaded timings. Consider the timing ratio ($t_{serial}$ / $t_{parallel}$) for the best performing cases. Is it an improvement over what you found above (i.e. closer to a factor of 8)?
81 |
--------------------------------------------------------------------------------
/parts/multithreading/diffusion_2d_threads.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - multithreading
2 | using Printf
3 | using CairoMakie
4 | include(joinpath(@__DIR__, "../shared.jl"))
5 |
6 | function init_arrays_threads(params)
7 | (; ns, cs, parallel_init, static) = params
8 | C = Matrix{Float64}(undef, ns, ns)
9 | C2 = Matrix{Float64}(undef, ns, ns)
10 | #
11 | # !! TODO !!
12 | #
13 | # Below, you see how the arrays C and C2 are initialized without multithreading.
14 | # Based off of this serial implementation implement two multithreaded variants
15 | # that use static or dynamic scheduling, respectively (see "TODO..." below).
16 | #
17 | if parallel_init
18 | # parallel initialization
19 | if static
20 | # static scheduling
21 | # TODO...
22 | else
23 | # dynamic scheduling
24 | # TODO...
25 | end
26 | else
27 | # serial initialization
28 | for iy in axes(C, 2)
29 | for ix in axes(C, 1)
30 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2)
31 | C2[ix, iy] = C[ix, iy] # element-wise copy
32 | end
33 | end
34 | end
35 | return C, C2
36 | end
37 |
38 | # convenience macros to avoid writing out the nested finite-difference expressions
39 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end
40 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end
41 |
42 | function diffusion_step!(params, C2, C)
43 | (; ds, dt, D, static) = params
44 | #
45 | # !! TODO !!
46 | #
47 | # We want to multithread the diffusion step (our computational kernel).
48 | # Based off of the serial kernel (see README.md or diffusion_2d_loop.jl) implement
49 | # two multithreaded variants that use static or dynamic scheduling, respectively
50 | # (see "TODO..." below).
51 | #
52 | if static
53 | # static scheduling
54 | # TODO...
55 | else
56 | # dynamic scheduling
57 | # TODO...
58 | end
59 | return nothing
60 | end
61 |
62 | function run_diffusion(; ns=64, nt=100, do_visualize=false, parallel_init=false, static=false)
63 | params = init_params(; ns, nt, do_visualize, parallel_init, static)
64 | C, C2 = init_arrays_threads(params)
65 | fig, plt = maybe_init_visualization(params, C)
66 | t_tic = 0.0
67 | # time loop
68 | for it in 1:nt
69 | # time after warmup (ignore first 10 iterations)
70 | (it == 11) && (t_tic = Base.time())
71 | # diffusion
72 | diffusion_step!(params, C2, C)
73 | C, C2 = C2, C # pointer swap
74 | # visualization
75 | maybe_update_visualization(params, fig, plt, C, it)
76 | end
77 | t_toc = (Base.time() - t_tic)
78 | print_perf(params, t_toc)
79 | return nothing
80 | end
81 |
82 | # Running things...
83 |
84 | # enable visualization by default
85 | (!@isdefined do_visualize) && (do_visualize = true)
86 | # enable execution by default
87 | (!@isdefined do_run) && (do_run = true)
88 |
89 | if do_run
90 | if !isempty(ARGS)
91 | run_diffusion(; ns=parse(Int, ARGS[1]), do_visualize)
92 | else
93 | run_diffusion(; ns=256, do_visualize)
94 | end
95 | end
96 |
--------------------------------------------------------------------------------
/parts/multithreading/imgs/stack_heap_threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/multithreading/imgs/stack_heap_threads.png
--------------------------------------------------------------------------------
/parts/multithreading/job_bench_threads.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=256
6 | #SBATCH --constraint=cpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_bench_threads.out
9 |
10 | # Load julia
11 | ml use /global/common/software/nersc/n9/julia/modules
12 | ml julia
13 |
14 | for i in 512 2048 6144
15 | do
16 | echo -e "\n\n#### Run $i"
17 |
18 | echo -e "-- single threaded"
19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
20 | echo -e ""
21 |
22 | julia --project --threads 8 bench_threads.jl $i # benchmark multithreaded variants
23 | done
--------------------------------------------------------------------------------
/parts/multithreading/job_compare_threads_serial.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=256
6 | #SBATCH --constraint=cpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_compare_threads_serial.out
9 |
10 | # Load julia
11 | ml use /global/common/software/nersc/n9/julia/modules
12 | ml julia
13 |
14 | for i in 512 2048 6144
15 | do
16 | echo -e "\n\n#### Run $i"
17 |
18 | echo -e "-- single threaded"
19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
20 | echo -e ""
21 |
22 | echo -e "-- multithreaded (8 threads)"
23 | julia --project --threads 8 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
24 | echo -e ""
25 | done
--------------------------------------------------------------------------------
/parts/multithreading/multithreading.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Multithreading (shared-memory parallelism)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Overview"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "* **Running Julia with multiple threads**\n",
22 | "\n",
23 | "* Where are the threads running?\n",
24 | " * ThreadPinning.jl\n",
25 | "\n",
26 | "* **Task-based multithreading**\n",
27 | " * dynamic and static scheduling\n",
28 | "\n",
29 | "* **\"Data pinning\"**\n",
30 | " * NUMA \"first-touch\" policy"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Running Julia with multiple threads"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "By default, Julia starts with a single *user thread*. We must tell it explicitly to start multiple user threads.\n",
45 | "\n",
46 | "* Environment variable: `export JULIA_NUM_THREADS=8`\n",
47 | "\n",
48 | "* Command line argument: `julia -t 8` or `julia --threads 8`\n",
49 | "\n",
50 | "* **VS Code:** Add `\"julia.NumThreads\": 8` to workspace settings (`Preferences: Open Workspace Settings (JSON)`)"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "**It is currently not really possible to change the number of threads at runtime!**"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "Threads.nthreads()"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Where are the threads running?"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "[ThreadPinning.jl](https://github.com/carstenbauer/ThreadPinning.jl) is the best tool for visualizing and controlling thread placement in Julia. (Disclaimer: I'm the main author 😉)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "using ThreadPinning\n",
90 | "\n",
91 | "threadinfo()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "### Pinning threads (i.e. controlling where they are running)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "#### Why?\n",
106 | "\n",
107 | "* To avoid double occupancy of CPU cores.\n",
108 | "\n",
109 | "* To reduce noise in benchmarks.\n",
110 | "\n",
111 | "* To address the complexity of the system topology, e.g. to use specific/all memory domains (NUMA).\n",
112 | "\n",
113 | "* ..."
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "#### How?\n",
121 | "\n",
122 | "`pinthreads(strategy)`\n",
123 | "* `:cputhreads`: pin to CPU threads (incl. \"hyperthreads\") one after another\n",
124 | "* `:cores`: pin to CPU cores one after another\n",
125 | "* `:numa`: alternate between NUMA domains (round-robin)\n",
126 | "* `:sockets`: alternate between sockets (round-robin)\n",
127 | "* `:affinitymask`: pin according to an external affinity mask (e.g. set by SLURM)\n",
128 | "\n",
129 | "(More? See my talk at JuliaCon2023 @ MIT: https://youtu.be/6Whc9XtlCC0)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "pinthreads(:cores) # try :cores or :sockets or :random\n",
139 | "threadinfo()"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "pinthreads(:numa)\n",
149 | "threadinfo(; groupby=:numa)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "#### Memory domains (NUMA)"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "NUMA = **n**on-**u**niform **m**emory **a**ccess"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "One (of two) AMD Milan CPUs in a Perlmutter node:\n",
171 | "\n",
172 | "<img src=\"./imgs/amd_milan_cpu_die.svg\">\n",
173 | "\n",
174 | "**Image source:** [AMD, High Performance Computing (HPC) Tuning Guide for AMD EPYCTM 7003 Series Processors](https://www.amd.com/system/files/documents/high-performance-computing-tuning-guide-amd-epyc7003-series-processors.pdf)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "# Other useful options for querying system information\n",
184 | "\n",
185 | "# using CpuId\n",
186 | "# cpuinfo()\n",
187 | "\n",
188 | "# using Hwloc\n",
189 | "# topology_graphical()"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "## Task-based multithreading\n",
197 | "\n",
198 | "<img src=\"./imgs/tasks_threads_cores.svg\">\n",
199 | "<img src=\"./imgs/stack_heap_threads.svg\">\n",
200 | "\n",
201 | "\n",
202 | "The user doesn't control threads but tasks that get scheduled on threads.\n",
203 | "\n",
204 | "**Advantages:** 👍\n",
205 | "* high-level abstraction\n",
206 | "* nestability / composability\n",
207 | "\n",
208 | "**Disadvantages:** 👎\n",
209 | "* scheduling overhead\n",
210 | "* uncertain and potentially suboptimal task → thread assignment\n",
211 | " * scheduler has limited information (e.g. about the system topology)\n",
212 | " * task migration"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "### Dynamic scheduling: `@threads :dynamic for ... in ...`"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "* **Splits up the iteration space into `nthreads()` contiguous chunks**\n",
227 | "\n",
228 | "* Creates a task for each of them and hands them off to the dynamic scheduler (essentially `@spawn`s each chunk)."
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "using Base.Threads: @threads, threadid, nthreads"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# implicitly creates nthreads() many tasks, each of which handles 2 iterations\n",
247 | "@threads :dynamic for i in 1:2*nthreads()\n",
248 | " println(\"Running iteration \", i, \" on thread \", threadid())\n",
249 | "end"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "### Static scheduling: `@threads :static for ... in ...`"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "* `:static` option to opt-out of dynamic scheduling\n",
264 | "\n",
265 | "* Statically **\"pins\" tasks to threads**\n",
266 | " * task 1 → thread 1, task 2 → thread 2, and so on.\n",
267 | "\n",
268 | "Pro 👍\n",
269 | " * **fixed task-thread mapping** (no task migration)\n",
270 | " * very little overhead\n",
271 | " \n",
272 | "Con 👎\n",
273 | " * not composable / nestable"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "@threads :static for i in 1:2*nthreads()\n",
283 | " println(\"Running iteration \", i, \" on thread \", threadid());\n",
284 | "end"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "(For `@threads :static`, every thread handles precisely two iterations!)"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "## \"Data pinning\" (NUMA revisited)"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Implicitly → **NUMA \"first-touch\" policy**\n",
306 | "\n",
307 | "Explicitly → [NUMA.jl](https://github.com/JuliaPerf/NUMA.jl)"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "### NUMA \"first-touch\" policy\n",
315 | "\n",
316 | "Data is (typically) placed in the **NUMA domain that is closest to the thread/CPU core** that is \"touching\" the data.\n",
317 | "\n",
318 | "```julia\n",
319 | "x = Vector{Float64}(undef, 10) # allocation, no \"touch\" yet\n",
320 | "rand!(x) # first touch == first write\n",
321 | "```"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "pinthreads(:numa)\n",
331 | "threadinfo(; groupby=:numa)"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "### Array initialization: serial vs parallel"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "**Different parts of an array can be placed in different NUMA domains!**\n",
346 | "\n",
347 | "Data is managed in terms of memory pages (\"unit of data\")."
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "#### Serial\n",
355 | "\n",
356 | "```julia\n",
357 | "x = Vector{Float64}(undef, 100) # allocation, no \"touch\" yet\n",
358 | "rand!(x) # first touch == first write\n",
359 | "```\n",
360 | "\n",
361 | "The location of the \"main\" thread determines the NUMA domain of the entire array!\n",
362 | "\n",
363 | "If we later access the data in parallel, all threads must read from the same NUMA domain → competition for the memory bus → potential bottleneck."
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "#### Parallel\n",
371 | "\n",
372 | "```julia\n",
373 | "pinthreads(:numa) # pin threads to different NUMA domains\n",
374 | "x = Vector{Float64}(undef, 100) # allocation, no \"touch\" yet\n",
375 | "@threads :static for i in eachindex(x) # parallel iteration\n",
376 | " x[i] = rand() # first touch == first write\n",
377 | "end\n",
378 | "```\n",
379 | "\n",
380 | "Different threads - running in different NUMA domains - touch different parts of the array → these parts will (likely) be placed in different NUMA domains.\n",
381 | "\n",
382 | "If we later access the data in parallel, all threads can read their part of the array from their local NUMA domain → no bottleneck."
383 | ]
384 | },
385 | {
386 | "cell_type": "markdown",
387 | "metadata": {},
388 | "source": [
389 | "Crucial point: **How you initialize your data influences the performance of your computational kernel!** (non-local effect)\n",
390 | "\n",
391 | "**→ Hands-on** (see [README.md](README.md))"
392 | ]
393 | }
394 | ],
395 | "metadata": {
396 | "kernelspec": {
397 | "display_name": "Julia 1.10.4",
398 | "language": "julia",
399 | "name": "julia-1.10"
400 | },
401 | "language_info": {
402 | "file_extension": ".jl",
403 | "mimetype": "application/julia",
404 | "name": "julia",
405 | "version": "1.10.4"
406 | }
407 | },
408 | "nbformat": 4,
409 | "nbformat_minor": 2
410 | }
411 |
--------------------------------------------------------------------------------
/parts/multithreading/solution/bench_threads.jl:
--------------------------------------------------------------------------------
1 | # Script for benchmarking and comparing the multithreaded variants.
2 | # - Supposed to be run on an entire (exclusive) compute node.
3 | # - Takes `ns` as (the only) input argument.
4 | #
5 | using ThreadPinning
6 |
7 | do_visualize = false
8 | do_run = false
9 | codefile = joinpath(@__DIR__, "diffusion_2d_threads.jl")
10 | include(codefile)
11 |
12 | ns = parse(Int, ARGS[1])
13 | nt = 100
14 |
15 | println("-- ns=$ns, nt=$nt, SERIAL initialization, STATIC scheduling")
16 | for pin in (:cores, :sockets, :numa)
17 | println("pinthreads($pin)")
18 | pinthreads(pin)
19 | run_diffusion(; ns, nt, do_visualize, parallel_init=false, static=true)
20 | end
21 |
22 | println("\n-- ns=$ns, nt=$nt, PARALLEL initialization, STATIC scheduling")
23 | for pin in (:cores, :sockets, :numa)
24 | println("pinthreads($pin)")
25 | pinthreads(pin)
26 | run_diffusion(; ns, nt, do_visualize, parallel_init=true, static=true)
27 | end
28 |
29 | println("\n-- ns=$ns, nt=$nt, PARALLEL initialization, DYNAMIC scheduling")
30 | for pin in (:cores, :sockets, :numa)
31 | println("pinthreads($pin)")
32 | pinthreads(pin)
33 | run_diffusion(; ns, nt, do_visualize, parallel_init=true, static=false)
34 | end
35 |
--------------------------------------------------------------------------------
/parts/multithreading/solution/diffusion_2d_threads.jl:
--------------------------------------------------------------------------------
1 | # 2D linear diffusion solver - multithreading
2 | using Printf
3 | using CairoMakie
4 | include(joinpath(@__DIR__, "../../shared.jl"))
5 |
6 | function init_arrays_threads(params)
7 | (; ns, cs, parallel_init, static) = params
8 | C = Matrix{Float64}(undef, ns, ns)
9 | C2 = Matrix{Float64}(undef, ns, ns)
10 | if parallel_init
11 | # parallel initialization
12 | if static
13 | # static scheduling
14 | Threads.@threads :static for iy in axes(C, 2)
15 | for ix in axes(C, 1)
16 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2)
17 | C2[ix, iy] = C[ix, iy] # element-wise copy
18 | end
19 | end
20 | else
21 | # dynamic scheduling
22 | Threads.@threads :dynamic for iy in axes(C, 2)
23 | for ix in axes(C, 1)
24 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2)
25 | C2[ix, iy] = C[ix, iy] # element-wise copy
26 | end
27 | end
28 | end
29 | else
30 | # serial initialization
31 | for iy in axes(C, 2)
32 | for ix in axes(C, 1)
33 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2)
34 | C2[ix, iy] = C[ix, iy] # element-wise copy
35 | end
36 | end
37 | end
38 | return C, C2
39 | end
40 |
41 | # convenience macros to avoid writing out the nested finite-difference expressions
42 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end
43 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end
44 |
45 | function diffusion_step!(params, C2, C)
46 | (; ds, dt, D, static) = params
47 | if static
48 | # static scheduling
49 | Threads.@threads :static for iy in 1:size(C, 2)-2
50 | for ix in 1:size(C, 1)-2
51 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds +
52 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds)
53 | end
54 | end
55 | else
56 | # dynamic scheduling
57 | Threads.@threads :dynamic for iy in 1:size(C, 2)-2
58 | for ix in 1:size(C, 1)-2
59 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds +
60 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds)
61 | end
62 | end
63 | end
64 | return nothing
65 | end
66 |
67 | function run_diffusion(; ns=64, nt=100, do_visualize=false, parallel_init=false, static=false)
68 | params = init_params(; ns, nt, do_visualize, parallel_init, static)
69 | C, C2 = init_arrays_threads(params)
70 | fig, plt = maybe_init_visualization(params, C)
71 | t_tic = 0.0
72 | # time loop
73 | for it in 1:nt
74 | # time after warmup (ignore first 10 iterations)
75 | (it == 11) && (t_tic = Base.time())
76 | # diffusion
77 | diffusion_step!(params, C2, C)
78 | C, C2 = C2, C # pointer swap
79 | # visualization
80 | maybe_update_visualization(params, fig, plt, C, it)
81 | end
82 | t_toc = (Base.time() - t_tic)
83 | print_perf(params, t_toc)
84 | return nothing
85 | end
86 |
87 | # Running things...
88 |
89 | # enable visualization by default
90 | (!@isdefined do_visualize) && (do_visualize = true)
91 | # enable execution by default
92 | (!@isdefined do_run) && (do_run = true)
93 |
94 | if do_run
95 | if !isempty(ARGS)
96 | run_diffusion(; ns=parse(Int, ARGS[1]), do_visualize)
97 | else
98 | run_diffusion(; ns=256, do_visualize)
99 | end
100 | end
101 |
--------------------------------------------------------------------------------
/parts/multithreading/solution/job_bench_threads.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=256
6 | #SBATCH --constraint=cpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_bench_threads.out
9 |
10 | # Load julia
11 | ml use /global/common/software/nersc/n9/julia/modules
12 | ml julia
13 |
14 | for i in 512 2048 6144
15 | do
16 | echo -e "\n\n#### Run $i"
17 |
18 | echo -e "-- single threaded"
19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
20 | echo -e ""
21 |
22 | julia --project --threads 8 bench_threads.jl $i # benchmark multithreaded variants
23 | done
--------------------------------------------------------------------------------
/parts/multithreading/solution/job_compare_threads_serial.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=00:05:00
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=256
6 | #SBATCH --constraint=cpu
7 | #SBATCH --account=ntrain1
8 | #SBATCH --output=slurm_compare_threads_serial.out
9 |
10 | # Load julia
11 | ml use /global/common/software/nersc/n9/julia/modules
12 | ml julia
13 |
14 | for i in 512 2048 6144
15 | do
16 | echo -e "\n\n#### Run $i"
17 |
18 | echo -e "-- single threaded"
19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
20 | echo -e ""
21 |
22 | echo -e "-- multithreaded (8 threads)"
23 | julia --project --threads 8 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i
24 | echo -e ""
25 | done
--------------------------------------------------------------------------------
/parts/multithreading/solution/slurm_bench_threads.out:
--------------------------------------------------------------------------------
1 |
2 |
3 | #### Run 512
4 | -- single threaded
5 | Time = 4.6068e-02 s, T_eff = 8.19 GB/s
6 |
7 | -- ns=512, nt=100, SERIAL initialization, STATIC scheduling
8 | pinthreads(cores)
9 | Time = 6.8419e-03 s, T_eff = 55.17 GB/s
10 | pinthreads(sockets)
11 | Time = 6.9280e-03 s, T_eff = 54.49 GB/s
12 | pinthreads(numa)
13 | Time = 7.0949e-03 s, T_eff = 53.21 GB/s
14 |
15 | -- ns=512, nt=100, PARALLEL initialization, STATIC scheduling
16 | pinthreads(cores)
17 | Time = 6.8409e-03 s, T_eff = 55.18 GB/s
18 | pinthreads(sockets)
19 | Time = 6.7821e-03 s, T_eff = 55.66 GB/s
20 | pinthreads(numa)
21 | Time = 7.6101e-03 s, T_eff = 49.60 GB/s
22 |
23 | -- ns=512, nt=100, PARALLEL initialization, DYNAMIC scheduling
24 | pinthreads(cores)
25 | Time = 6.6450e-03 s, T_eff = 56.81 GB/s
26 | pinthreads(sockets)
27 | Time = 1.1217e-02 s, T_eff = 33.65 GB/s
28 | pinthreads(numa)
29 | Time = 9.5861e-03 s, T_eff = 39.38 GB/s
30 |
31 |
32 | #### Run 2048
33 | -- single threaded
34 | Time = 7.3495e-01 s, T_eff = 8.22 GB/s
35 |
36 | -- ns=2048, nt=100, SERIAL initialization, STATIC scheduling
37 | pinthreads(cores)
38 | Time = 1.4753e-01 s, T_eff = 40.94 GB/s
39 | pinthreads(sockets)
40 | Time = 9.4935e-02 s, T_eff = 63.62 GB/s
41 | pinthreads(numa)
42 | Time = 9.5550e-02 s, T_eff = 63.21 GB/s
43 |
44 | -- ns=2048, nt=100, PARALLEL initialization, STATIC scheduling
45 | pinthreads(cores)
46 | Time = 1.7262e-01 s, T_eff = 34.99 GB/s
47 | pinthreads(sockets)
48 | Time = 1.0453e-01 s, T_eff = 57.78 GB/s
49 | pinthreads(numa)
50 | Time = 1.0394e-01 s, T_eff = 58.11 GB/s
51 |
52 | -- ns=2048, nt=100, PARALLEL initialization, DYNAMIC scheduling
53 | pinthreads(cores)
54 | Time = 1.4637e-01 s, T_eff = 41.26 GB/s
55 | pinthreads(sockets)
56 | Time = 2.4384e-01 s, T_eff = 24.77 GB/s
57 | pinthreads(numa)
58 | Time = 1.2115e-01 s, T_eff = 49.85 GB/s
59 |
60 |
61 | #### Run 6144
62 | -- single threaded
63 | Time = 6.5916e+00 s, T_eff = 8.25 GB/s
64 |
65 | -- ns=6144, nt=100, SERIAL initialization, STATIC scheduling
66 | pinthreads(cores)
67 | Time = 2.1146e+00 s, T_eff = 25.71 GB/s
68 | pinthreads(sockets)
69 | Time = 1.9464e+00 s, T_eff = 27.93 GB/s
70 | pinthreads(numa)
71 | Time = 1.5678e+00 s, T_eff = 34.67 GB/s
72 |
73 | -- ns=6144, nt=100, PARALLEL initialization, STATIC scheduling
74 | pinthreads(cores)
75 | Time = 2.1160e+00 s, T_eff = 25.69 GB/s
76 | pinthreads(sockets)
77 | Time = 9.7413e-01 s, T_eff = 55.80 GB/s
78 | pinthreads(numa)
79 | Time = 8.2886e-01 s, T_eff = 65.58 GB/s
80 |
81 | -- ns=6144, nt=100, PARALLEL initialization, DYNAMIC scheduling
82 | pinthreads(cores)
83 | Time = 2.1081e+00 s, T_eff = 25.79 GB/s
84 | pinthreads(sockets)
85 | Time = 1.5051e+00 s, T_eff = 36.11 GB/s
86 | pinthreads(numa)
87 | Time = 1.6027e+00 s, T_eff = 33.92 GB/s
88 |
--------------------------------------------------------------------------------
/parts/multithreading/solution/slurm_compare_threads_serial.out:
--------------------------------------------------------------------------------
1 |
2 |
3 | #### Run 512
4 | -- single threaded
5 | Time = 4.6421e-02 s, T_eff = 8.13 GB/s
6 |
7 | -- multithreaded (8 threads)
8 | Time = 1.6735e-02 s, T_eff = 22.56 GB/s
9 |
10 |
11 |
12 | #### Run 2048
13 | -- single threaded
14 | Time = 7.3168e-01 s, T_eff = 8.25 GB/s
15 |
16 | -- multithreaded (8 threads)
17 | Time = 5.6353e-01 s, T_eff = 10.72 GB/s
18 |
19 |
20 |
21 | #### Run 6144
22 | -- single threaded
23 | Time = 6.5959e+00 s, T_eff = 8.24 GB/s
24 |
25 | -- multithreaded (8 threads)
26 | Time = 3.2809e+00 s, T_eff = 16.57 GB/s
27 |
28 |
--------------------------------------------------------------------------------
/parts/shared.jl:
--------------------------------------------------------------------------------
1 | ## PARAMETER INITIALIZATION
2 | function init_params(; ns=64, nt=100, kwargs...)
3 | L = 10.0 # physical domain length
4 | D = 1.0 # diffusion coefficient
5 | ds = L / ns # grid spacing
6 | dt = ds^2 / D / 8.2 # time step
7 | cs = range(start=ds / 2, stop=L - ds / 2, length=ns) .- 0.5 * L # vector of coord points
8 | nout = floor(Int, nt / 5) # plotting frequency
9 | return (; L, D, ns, nt, ds, dt, cs, nout, kwargs...)
10 | end
11 |
12 | function init_params_mpi(; dims, coords, ns=64, nt=100, kwargs...)
13 | L = 10.0 # physical domain length
14 | D = 1.0 # diffusion coefficient
15 | nx_g = dims[1] * (ns - 2) + 2 # global number of grid points along dim 1
16 | ny_g = dims[2] * (ns - 2) + 2 # global number of grid points along dim 2
17 | dx = L / nx_g # grid spacing
18 | dy = L / ny_g # grid spacing
19 | dt = min(dx, dy)^2 / D / 8.2 # time step
20 | x0 = coords[1] * (ns - 2) * dx # coords shift to get global coords on local process
21 | y0 = coords[2] * (ns - 2) * dy # coords shift to get global coords on local process
22 | xcs = LinRange(x0 + dx / 2, x0 + ns * dx - dx / 2, ns) .- 0.5 .* L # local vector of global coord points
23 | ycs = LinRange(y0 + dy / 2, y0 + ns * dy - dy / 2, ns) .- 0.5 .* L # local vector of global coord points
24 | return (; L, D, ns, nt, dx, dy, dt, xcs, ycs, kwargs...)
25 | end
26 |
27 | function init_params_gpu(; ns=64, nt=100, kwargs...)
28 | L = 10.0 # physical domain length
29 | D = 1.0 # diffusion coefficient
30 | ds = L / ns # grid spacing
31 | dt = ds^2 / D / 8.2 # time step
32 | cs = range(start=ds / 2, stop=L - ds / 2, length=ns) .- 0.5 * L # vector of coord points
33 | nout = floor(Int, nt / 5) # plotting frequency
34 | nthreads = 32, 8 # number of threads per block
35 | nblocks = cld.(ns, nthreads) # number of blocks
36 | return (; L, D, ns, nt, ds, dt, cs, nout, nthreads, nblocks, kwargs...)
37 | end
38 |
39 | function init_params_gpu_mpi(; dims, coords, ns=64, nt=100, kwargs...)
40 | L = 10.0 # physical domain length
41 | D = 1.0 # diffusion coefficient
42 | nx_g = dims[1] * (ns - 2) + 2 # global number of grid points along dim 1
43 | ny_g = dims[2] * (ns - 2) + 2 # global number of grid points along dim 2
44 | dx = L / nx_g # grid spacing
45 | dy = L / ny_g # grid spacing
46 | dt = min(dx, dy)^2 / D / 8.2 # time step
47 | x0 = coords[1] * (ns - 2) * dx # coords shift to get global coords on local process
48 | y0 = coords[2] * (ns - 2) * dy # coords shift to get global coords on local process
49 | xcs = LinRange(x0 + dx / 2, x0 + ns * dx - dx / 2, ns) .- 0.5 * L # local vector of global coord points
50 | ycs = LinRange(y0 + dy / 2, y0 + ns * dy - dy / 2, ns) .- 0.5 * L # local vector of global coord points
51 | nthreads = 32, 8 # number of threads per block
52 | nblocks = cld.(ns, nthreads) # number of blocks
53 | return (; L, D, ns, nt, dx, dy, dt, xcs, ycs, nthreads, nblocks, kwargs...)
54 | end
55 |
56 | ## ARRAY INITIALIZATION
57 | function init_arrays_with_flux(params)
58 | (; cs, ns) = params
59 | C = @. exp(-cs^2 - (cs')^2)
60 | qx = zeros(ns - 1, ns - 2)
61 | qy = zeros(ns - 2, ns - 1)
62 | return C, qx, qy
63 | end
64 |
65 | function init_arrays(params)
66 | (; cs) = params
67 | C = @. exp(-cs^2 - (cs')^2)
68 | C2 = copy(C)
69 | return C, C2
70 | end
71 |
72 | function init_arrays_mpi(params)
73 | (; xcs, ycs) = params
74 | C = @. exp(-xcs^2 - (ycs')^2)
75 | C2 = copy(C)
76 | return C, C2
77 | end
78 |
79 | function init_arrays_gpu(params)
80 | (; cs) = params
81 | C = CuArray(@. exp(-cs^2 - (cs')^2))
82 | C2 = copy(C)
83 | return C, C2
84 | end
85 |
86 | function init_arrays_gpu_mpi(params)
87 | (; xcs, ycs) = params
88 | C = CuArray(@. exp(-xcs^2 - (ycs')^2))
89 | C2 = copy(C)
90 | return C, C2
91 | end
92 |
93 | ## VISUALIZATION & PRINTING
94 | function maybe_init_visualization(params, C)
95 | if params.do_visualize
96 | fig = Figure(; size=(500, 400), fontsize=14)
97 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C")
98 | plt = heatmap!(ax, params.cs, params.cs, Array(C); colormap=:turbo, colorrange=(0, 1))
99 | cb = Colorbar(fig[1, 1][1, 2], plt)
100 | display(fig)
101 | return fig, plt
102 | end
103 | return nothing, nothing
104 | end
105 |
106 | function maybe_update_visualization(params, fig, plt, C, it)
107 | if params.do_visualize && (it % params.nout == 0)
108 | plt[3] = Array(C)
109 | display(fig)
110 | end
111 | return nothing
112 | end
113 |
114 | function print_perf(params, t_toc)
115 | (; ns, nt) = params
116 | @printf("Time = %1.4e s, T_eff = %1.2f GB/s \n", t_toc, round((2 / 1e9 * ns^2 * sizeof(Float64)) / (t_toc / (nt - 10)), sigdigits=6))
117 | return nothing
118 | end
119 |
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | #=
3 | $(dirname "$0")/julia_wrapper.sh $0
4 | exit
5 | # =#
6 |
7 | @info("Preparing .bashrc")
8 | bashrc = joinpath(ENV["HOME"], ".bashrc")
9 | str = """\n
10 | # --- JULIACON24-HPCWORKSHOP ---
11 | export JULIA_DEPOT_PATH=\$SCRATCH/.julia:/global/common/software/ntrain1/.julia
12 | export PATH=\$SCRATCH/.julia/bin:\$PATH
13 | # auto-load the Julia module
14 | ml use /global/common/software/nersc/n9/julia/modules
15 | ml julia\n
16 | """
17 | open(bashrc; append=true) do f
18 | write(f, str)
19 | end
20 | @info("Done!")
21 |
22 | @info("Instantiating Julia environment")
23 | empty!(DEPOT_PATH)
24 | push!(DEPOT_PATH, joinpath(ENV["SCRATCH"], ".julia"))
25 | push!(DEPOT_PATH, "/global/common/software/ntrain1/.julia")
26 | using Pkg
27 | Pkg.activate(@__DIR__)
28 | Pkg.instantiate()
29 | @info("Done!")
30 |
31 | using MPI
32 | MPI.install_mpiexecjl(force=true)
33 |
34 | @info("Installing Jupyter kernel")
35 | Pkg.build("IJulia") # to be safe
36 | using IJulia
37 | IJulia.installkernel("JuliaCon24 HPC Workshop"; env=Dict("JULIA_NUM_THREADS" => "8", "JULIA_PROJECT" => @__DIR__))
38 | @info("Done!")
39 |
--------------------------------------------------------------------------------