├── .gitignore ├── LICENSE ├── Manifest.toml ├── Project.toml ├── README.md ├── help ├── NERSC Education Resources.pdf ├── README.md ├── cpunode.sh ├── gpunode.sh ├── jupyter-kernel │ ├── install.sh │ └── julia-tutorial │ │ ├── kernel-helper.sh │ │ ├── kernel.json │ │ ├── logo-32x32.png │ │ └── logo-64x64.png ├── perlmutter_cheatsheet.md └── vscode_cheatsheet.md ├── imgs └── julia_hpc_workshop.png ├── julia_wrapper.sh ├── onboarding ├── README.md ├── intro.pdf ├── julia_vscode_on_perlmutter.pdf └── overview.pdf ├── parts ├── diffusion_2d │ ├── README.md │ ├── diffusion_2d.ipynb │ ├── diffusion_2d.jl │ ├── diffusion_2d_loop.jl │ └── imgs │ │ ├── initial.png │ │ └── stagg_2D.png ├── distributed │ └── explanation │ │ ├── 01_distributed.ipynb │ │ ├── 01_distributed.slides.html │ │ ├── 02_dagger.ipynb │ │ ├── 02_dagger.slides.html │ │ └── Project.toml ├── gpu │ ├── README.md │ ├── advanced │ │ ├── closest_device.jl │ │ ├── job_gpu_mpi_multinode.sh │ │ └── job_gpu_mpi_singlenode.sh │ ├── diffusion_2d_cuda.jl │ ├── diffusion_2d_cuda_mpi.jl │ ├── get_gpu_compute_node_interactive.sh │ ├── gpu.ipynb │ ├── imgs │ │ ├── cpu_gpu_evo.png │ │ ├── cuda_grid.png │ │ └── frontier.png │ ├── job_bench_gpu.sh │ ├── job_gpu_mpi_multinode.sh │ ├── job_gpu_mpi_singlenode.sh │ ├── multigpu.jl │ ├── slurm │ │ ├── hello.jl │ │ ├── job_hello_multinode.sh │ │ └── job_hello_singlenode.sh │ ├── solution │ │ ├── diffusion_2d_cuda.jl │ │ ├── diffusion_2d_cuda_mpi.jl │ │ ├── job_bench_gpu.sh │ │ ├── job_gpu_mpi_multinode.sh │ │ ├── job_gpu_mpi_singlenode.sh │ │ └── visualize_mpi.jl │ ├── visualize.jl │ └── visualize_mpi.jl ├── mpi │ ├── README.md │ ├── diffusion_2d_mpi.jl │ ├── explanation │ │ ├── 01_mpi+jupyter.ipynb │ │ ├── 01_mpi+jupyter.slides.html │ │ ├── 02_comms.ipynb │ │ ├── 02_comms.slides.html │ │ ├── 03_halo.ipynb │ │ ├── 03_halo.slides.html │ │ ├── Project.toml │ │ ├── advanced │ │ │ └── 00_gpu_select.ipynb │ │ ├── diffusion_2d_halo_exchange.pdf │ │ ├── diffusion_2d_halo_exchange.png │ │ └── l8_1D_global_grid.png │ ├── get_compute_node_interactive.sh │ ├── job_mpi_multinode.sh │ ├── job_mpi_singlenode.sh │ ├── solution │ │ ├── diffusion_2d_mpi.jl │ │ ├── job_mpi_multinode.sh │ │ ├── job_mpi_singlenode.sh │ │ ├── multinode_results.txt │ │ ├── slurm_mpi_singlenode.out │ │ ├── visualization_before.png │ │ └── visualization_desired.png │ ├── visualize_mpi.ipynb │ └── visualize_mpi.jl ├── multithreading │ ├── README.md │ ├── diffusion_2d_threads.jl │ ├── imgs │ │ ├── amd_milan_cpu_die.svg │ │ ├── stack_heap_threads.png │ │ ├── stack_heap_threads.svg │ │ ├── tasks_threads_cores.svg │ │ └── topo.svg │ ├── job_bench_threads.sh │ ├── job_compare_threads_serial.sh │ ├── multithreading.ipynb │ └── solution │ │ ├── bench_threads.jl │ │ ├── diffusion_2d_threads.jl │ │ ├── job_bench_threads.sh │ │ ├── job_compare_threads_serial.sh │ │ ├── slurm_bench_threads.out │ │ └── slurm_compare_threads_serial.out └── shared.jl └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # misc 2 | .DS_Store 3 | .vscode 4 | .ipynb_checkpoints 5 | 6 | # julia 7 | # Manifest.toml 8 | 9 | # output 10 | out* 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 JuliaHPC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to 
deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" 3 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 4 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 5 | CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" 6 | CpuId = "adafc99b-e345-5852-983c-f28acb93d879" 7 | Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d" 8 | IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a" 9 | JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" 10 | MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" 11 | NUMA = "292f1341-b53f-425a-80e5-3597ad0961bf" 12 | ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JuliaCon24 Workshop: Hands-on with Julia for HPC on GPUs and CPUs 2 | 3 | 4 | 5 |
6 | 7 | **Instructors:** [Carsten Bauer](https://github.com/carstenbauer), [Ludovic Räss](https://github.com/luraess), [Ivan Utkin](https://github.com/utkinis), and [Johannes Blaschke](https://github.com/JBlaschke) (remote). 8 | 9 | **Where:** TU-Eindhoven 0.244 10 | 11 | **When:** July 9th, 1:30 PM (CEST) 12 | 13 | **More:** https://pretalx.com/juliacon2024/talk/NTQZJJ/ 14 | 15 | ## Schedule 16 | 17 | * **Onboarding** 18 | * [Introduction](./onboarding/intro.pdf) 19 | * [NERSC overview](./onboarding/overview.pdf) 20 | * [Julia + VS Code on Perlmutter](./onboarding/julia_vscode_on_perlmutter.pdf) 21 | 22 | * **Introducing the example** 23 | * [2D linear diffusion solver](./parts/diffusion_2d) 24 | 25 | * **Parallelization on Perlmutter** 26 | * [Multithreading](./parts/multithreading) 27 | 28 | (short break) 29 | * [MPI parallelization](./parts/mpi) 30 | * [GPU acceleration](./parts/gpu) 31 | 32 | ## Prepare for the workshop 33 | 34 | To begin with, make sure that you have [VS Code](https://code.visualstudio.com/download) installed on your laptop. 35 | 36 | ### VS Code → Perlmutter (via SSH) 37 | 38 | 1) In VS Code, press `F1` and run the `Remote-SSH: Open SSH Host...` command. 39 | - If the command isn't available, make sure that [Remote - SSH extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) is installed (but it should be available out of the box). 40 | 2) Enter `trainXY@perlmutter.nersc.gov` (with `trainXY` replaced by your training account) and press enter. 41 | 3) In the popup input box, enter your password and press enter. 42 | 43 | After a second or two, you should have VS Code running on a Perlmutter login node! 🎉 44 | 45 | 46 | ### On Perlmutter 47 | 1. Clone the workshop materials into `$SCRATCH/juliacon24-hpcworkshop`by running the following command. 48 | 49 | git clone https://github.com/JuliaHPC/juliacon24-hpcworkshop $SCRATCH/juliacon24-hpcworkshop 50 | 51 | * **You will always work in this folder (`$SCRATCH/juliacon24-hpcworkshop`) during the workshop.** 52 | 2. Run the following commands: 53 | 54 | cd $SCRATCH/juliacon24-hpcworkshop 55 | ./setup.sh 56 | 57 |
58 | What does this do? (click me if you're curious) 59 | 60 | * The setup script 61 | * modifies your `$HOME/.bashrc` to 62 | * permanently put your Julia depot onto the parallel file system (`$SCRATCH/.julia`) 63 | * auto-load the Julia module when you login (such that the `julia` command is available) 64 | * make `mpiexecjl` available (i.e. modify `$PATH`) 65 | * instantiates the Julia environment 66 | * installs MPI.jl's `mpiexecjl` wrapper 67 | * installs a Jupyter kernel (for NERSC's Jupyter hub) 68 | 69 |
70 | 71 | 3. **!! Before you proceed, restart VS Code !!** 72 | * Close it fully, open it again, and connect to Perlmutter again (see above). Otherwise the `.bashrc` changes won't be in effect. 73 | 74 | 4. Let's now turn to the Julia VS Code extension. 75 | 76 | 1) Installing the extension 77 | - Open the extensions view (press `CTRL/CMD + SHIFT + X`). 78 | - Search for `julia`. 79 | - Click on `install`. 80 | 2) Pointing it to `julia_wrapper.sh` 81 | - Open the VS Code Settings (press `CTRL/CMD + ,`). 82 | - Click on the tab `Remote [SSH: perlmutter.nersc.gov]`. 83 | - Search for `Julia executable`. 84 | - Insert `/pscratch/sd/t/trainXY/juliacon24-hpcworkshop/julia_wrapper.sh` - with `trainXY` replaced by your training account name - into the text field under `Julia: Executable Path`. 85 | 3) If `ALT/OPTION + J` followed by `ALT/OPTION + O` (**or** pressing `F1` and executing the `Julia: Start REPL` command) successfully spins up the integrated Julia REPL, you know that the setup is working! 🎉 86 | 87 | 5. Finally, you should open the workshop directory in VS Code. 88 | * In the VS Code terminal, run `cd $SCRATCH/juliacon24-hpcworkshop` followed by `code -r .` 89 | * Manual alternative: Click on the green button "Open Folder" (or press `CTRL/CMD + O`) and enter `/pscratch/sd/t/trainXY/juliacon24-hpcworkshop` - **with `trainXY` replaced by your training account name**. 90 | 91 | ## Help? 92 | 93 | ### Cheatsheets 94 | 95 | * [Perlmutter cheatsheet](./help/perlmutter_cheatsheet.md) 96 | * [VS Code cheatsheet](./help/vscode_cheatsheet.md) 97 | 98 | ### VS Code isn't working for me, what should I do? 99 | 100 | As a fallback, you can also try to use Jupyter under https://jupyter.nersc.gov. Just make sure to use the `JuliaCon24 HPC Workshop 1.10.4` kernel (open a notebook and select the kernel in the top right corner). 101 | 102 | ## Applying for a NERSC Training Account 103 | 104 | To get the most out of the workshop, you need to apply for a NERSC training account **before the workshop (as early as possible)**! The reason for this is that everyone who applies for an account has to be checked, which can take some time (between a few minutes and a week) depending on their personal background (e.g. nationality and affiliation). 105 | 106 | **Please only apply for an account if you 1) have a workshop ticket and 2) really plan to participate in the JuliaCon 2024 workshop on Tuesday, July 9 in person!** 107 | 108 | ### Sign up for an account 109 | 110 | To apply for an account: 111 | 1. Go to https://iris.nersc.gov/train 112 | 2. Fill out the application form with your details and **use the training code that you've received by email**. 113 | 3. Iris will display your training account's login credentials **only once**. **Take a screenshot of your login credentials**, as you will not be able to change or recover them after you close this tab! 114 | 4. You can already start experimenting once your account has been approved. Your training account will be available until July 14th (end of JuliaCon). Accounts get deleted afterwards, so remember to **back up your data** before July 14th. 115 | 116 | **If your institution is not listed in the drop down menu at https://iris.nersc.gov/train:** Please choose "Training Account Only - Org Not Listed", and put your organization name in the "Department" field.
117 | -------------------------------------------------------------------------------- /help/NERSC Education Resources.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/NERSC Education Resources.pdf -------------------------------------------------------------------------------- /help/README.md: -------------------------------------------------------------------------------- 1 | * The files `cpunode.sh` and `gpunode.sh` can be used to get an interactive shell on a (full) CPU/GPU node on Perlmutter (e.g. `sh cpunode.sh`). 2 | * The `perlmutter_cheatsheet.md` collects a bunch of useful information and commands for Perlmutter, like example job scripts. 3 | * The folder `jupyter-kernel` is only for backup purposes and shouldn't be needed. 4 | -------------------------------------------------------------------------------- /help/cpunode.sh: -------------------------------------------------------------------------------- 1 | # Request an entire CPU node for interactive usage (you'll end up with a shell on the compute node) 2 | # Run as: sh cpunode.sh 3 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint cpu --account=ntrain1 4 | -------------------------------------------------------------------------------- /help/gpunode.sh: -------------------------------------------------------------------------------- 1 | # Request an entire GPU node for interactive usage (you'll end up with a shell on the compute node) 2 | # Run as: sh gpunode.sh 3 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account=ntrain1 4 | -------------------------------------------------------------------------------- /help/jupyter-kernel/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | kernel_dir=${HOME}/.local/share/jupyter/kernels 4 | mkdir -p $kernel_dir 5 | cp -r ${SCRATCH}/juliacon24-hpcworkshop/help/jupyter-kernel/julia-tutorial $kernel_dir 6 | -------------------------------------------------------------------------------- /help/jupyter-kernel/julia-tutorial/kernel-helper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | module load PrgEnv-gnu 4 | module load cray-hdf5-parallel 5 | module load python 6 | 7 | module use /global/common/software/nersc/julia_hpc_24/modules/ 8 | module use /global/common/software/nersc/n9/julia/modules/ 9 | module load adios2 julia 10 | 11 | readarray -t ijulia_boostrap < <(julia /global/cfs/cdirs/nstaff/blaschke/julia/kernels/bootstrap.jl) 12 | 13 | echo "Check-and-install returned following output:" 14 | _IFS=$IFS 15 | IFS=$'\n' 16 | for each in ${ijulia_boostrap[*]} 17 | do 18 | echo $each 19 | done 20 | IFS=$_IFS 21 | 22 | JULIA_EXEC=$(which julia) 23 | KERNEL="${ijulia_boostrap[-1]}" 24 | export JULIA_NUM_THREADS=8 25 | 26 | echo "Connecting using JULIA_EXEC=$JULIA_EXEC and KERNEL=$KERNEL" 27 | exec $JULIA_EXEC -i --startup-file=yes --color=yes $KERNEL "$@" 28 | -------------------------------------------------------------------------------- /help/jupyter-kernel/julia-tutorial/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_name": "JuliaCon24 HPC Workshop", 3 | "argv": [ 4 | "{resource_dir}/kernel-helper.sh", 5 | "{connection_file}" 6 | ], 7 | "language": "julia", 8 | "env": {}, 9 | 
"interrupt_mode": "signal" 10 | } 11 | -------------------------------------------------------------------------------- /help/jupyter-kernel/julia-tutorial/logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/jupyter-kernel/julia-tutorial/logo-32x32.png -------------------------------------------------------------------------------- /help/jupyter-kernel/julia-tutorial/logo-64x64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/help/jupyter-kernel/julia-tutorial/logo-64x64.png -------------------------------------------------------------------------------- /help/perlmutter_cheatsheet.md: -------------------------------------------------------------------------------- 1 | # Perlmutter Cheatsheet 2 | 3 | ## Managing jobs 4 | 5 | ### Submitting a job 6 | `sbatch job_script.sh` 7 | 8 | ### List your submitted jobs 9 | 10 | `sqs` or maybe even `watch -n 10 'sqs'` 11 | 12 | ### Canceling a job 13 | 14 | `scancel ` where `` is the id of the job (can be found with `squeue`, see above). 15 | 16 | ## Interactive sessions on compute nodes 17 | 18 | ### CPU 19 | ```bash 20 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint cpu --account=ntrain1 21 | ``` 22 | (see the file `cpunode.sh` which you can simply run with `sh cpunode.sh`) 23 | 24 | ### GPU 25 | ```bash 26 | salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account=ntrain1 27 | ``` 28 | (see the file `gpunode.sh` which you can simply run with `sh gpunode.sh`) 29 | 30 | ## Examplatory job scripts 31 | 32 | ### CPU (full node) 33 | ```bash 34 | #!/bin/bash 35 | #SBATCH --time=00:05:00 36 | #SBATCH --nodes=1 37 | #SBATCH --ntasks-per-node=1 38 | #SBATCH --cpus-per-task=256 39 | #SBATCH --constraint=cpu 40 | #SBATCH --account=ntrain1 41 | 42 | # Load julia 43 | ml use /global/common/software/nersc/n9/julia/modules 44 | ml julia 45 | 46 | julia --project --threads 8 whatever.jl 47 | ``` 48 | 49 | ### MPI 50 | 51 | * "tasks" in SLURM correspond to MPI ranks 52 | * **If you want more than 8 nodes, you need to specify `#SBATCH --qos=regular`.** 53 | 54 | ```bash 55 | #!/bin/bash 56 | #SBATCH --time=00:10:00 57 | #SBATCH --nodes=9 58 | #SBATCH --ntasks-per-node=1 59 | #SBATCH --constraint=cpu 60 | #SBATCH --account=ntrain1 61 | #SBATCH --qos=regular 62 | 63 | # Load julia 64 | ml use /global/common/software/nersc/n9/julia/modules 65 | ml julia 66 | 67 | mpiexecjl --project -n 9 julia mpicode.jl 68 | ``` 69 | ### MPI GPU 70 | 71 | ```bash 72 | #!/bin/bash 73 | #SBATCH -A ntrain1 74 | #SBATCH -C gpu 75 | #SBATCH -q regular 76 | #SBATCH --output=slurm_gpu_mpi_multinode.out 77 | #SBATCH --time=00:05:00 78 | #SBATCH --nodes=4 79 | #SBATCH --ntasks=16 80 | #SBATCH --gpus-per-node=4 81 | #SBATCH --exclusive 82 | #SBATCH --gpu-bind=none 83 | 84 | # pin to closest NIC to GPU 85 | export MPICH_OFI_NIC_POLICY=GPU 86 | 87 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool 88 | export JULIA_CUDA_MEMORY_POOL=none 89 | 90 | ml use /global/common/software/nersc/n9/julia/modules 91 | ml julia 92 | 93 | mpiexecjl --project=../.. 
julia gpu_mpicode.jl 94 | 95 | ``` 96 | -------------------------------------------------------------------------------- /help/vscode_cheatsheet.md: -------------------------------------------------------------------------------- 1 | # VS Code Cheatsheet 2 | 3 | ## SSH → Perlmutter 4 | 5 | 1) In VS Code, press `F1` and run the `Remote-SSH: Open SSH Host...` command. 6 | - If the command isn't available, make sure that [Remote - SSH extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) is installed (but it should be available out of the box). 7 | 2) Enter `trainXY@perlmutter.nersc.gov` (with `trainXY` replaced by your training account) and press enter. 8 | 3) In the popup input box, enter your password and press enter. 9 | 10 | After a second or two, you should have VS Code running on a Perlmutter login node! 🎉 11 | 12 | ## Basics 13 | 14 | * Run a command: Press `F1` or `CTRL/CMD + SHIFT + P` 15 | 16 | * Open a terminal: `` Ctrl + ` `` 17 | 18 | * Open a folder from the terminal: `code -r .` 19 | 20 | * Search for a file: `CTRL/CMD + T` 21 | 22 | * Search for a function in a file: `CTRL/CMD + R` 23 | 24 | ## Julia 25 | 26 | * Open the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + O` 27 | 28 | * Restart the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + R` 29 | 30 | * Kill the REPL: `ALT/OPTION + J` followed by `ALT/OPTION + K` 31 | 32 | * Change the Julia environment: `ALT/OPTION + J` followed by `ALT/OPTION + E` -------------------------------------------------------------------------------- /imgs/julia_hpc_workshop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/imgs/julia_hpc_workshop.png -------------------------------------------------------------------------------- /julia_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Making module / ml available 4 | # ------------------------------------------------------------ 5 | export MODULEPATH=/global/common/software/nersc/n9/julia/modules:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/comnet/gnu/12.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/compiler/gnu/12.0:/opt/cray/pe/lmod/modulefiles/mpi/gnu/12.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-milan/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/global/common/software/nersc/pe/modulefiles_hotfixes:/opt/nersc/pe/modulefiles:/usr/share/lmod/lmod/modulefiles/Core:/opt/cray/modulefiles 6 | source /usr/share/lmod/lmod/init/profile 7 | export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-milan:craype-network-ofi:perftools-base:xpmem:PrgEnv-gnu:cpe:gpu 8 | module --initial_load restore 9 | # ------------------------------------------------------------ 10 | 11 | # Load julia 12 | ml use /global/common/software/nersc/n9/julia/modules 13 | ml julia 14 | 15 | # Pass on all arguments to julia 16 | exec julia "${@}" -------------------------------------------------------------------------------- /onboarding/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started at NERSC 2 | 3 | This place we keep onboarding instructions. 
Please also refer to the [cheat sheets](../help/). 4 | 5 | Also, if you're interested in applying for a NERSC account, please take a look at [Rebecca's slides](../help/NERSC%20Education%20Resources.pdf). 6 | 7 | ## Important: Before you go your own way 8 | 9 | We've taken some shortcuts in order to help you become productive quickly. If you're using NERSC for more than just training purposes, please consider the following: 10 | 11 | 1. We put your software environment on `$SCRATCH` -- this is a temporary place. For production software please use: 12 | - Containers: https://docs.nersc.gov/development/containers/ 13 | - `/global/common/software/$YOUR_PROJECT_ID` 14 | - `$HOME` or `$CFS` for your source code 15 | 16 | 2. The [setup script](../setup.sh) configures your `.bashrc`. Please understand these changes and configure your user environment in a way that works for you. **Make sure that important job scripts and software environments don't rely on your shell configuration.** 17 | 18 | 3. We put our shared code into a `shared.jl` and include this in our Julia programs. This is fine for small-scale runs (a couple dozen nodes). Ideally, though, you want to be able to precompile; for this to work, package your program up [as Julia packages](https://pkgdocs.julialang.org/v1/creating-packages/). 19 | -------------------------------------------------------------------------------- /onboarding/intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/intro.pdf -------------------------------------------------------------------------------- /onboarding/julia_vscode_on_perlmutter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/julia_vscode_on_perlmutter.pdf -------------------------------------------------------------------------------- /onboarding/overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/onboarding/overview.pdf -------------------------------------------------------------------------------- /parts/diffusion_2d/README.md: -------------------------------------------------------------------------------- 1 | # 2D Linear Diffusion Solver 2 | 3 | In this part, we introduce the Diffusion 2D example we will use throughout the workshop to exemplify various HPC concepts in Julia, namely: 4 | - [Multithreading](./../multithreading/) 5 | - [Distributed computing](./../mpi/) 6 | - [GPU acceleration](./../gpu/) 7 | 8 | The script [`diffusion_2d.jl`](./diffusion_2d.jl) provides the starting point: a vectorised 2D linear diffusion solver computing diffusive fluxes and their divergence in a `compute_flux!` and `diffusion_step!` function, respectively. 9 | 10 | The follow-up script, [`diffusion_2d_loop.jl`](./diffusion_2d_loop.jl), implements a serial loop version of the previous script that we will use as a starting point for all our further experiments. 11 | 12 | ## Warm-up Task - running 2D diffusion 13 | 14 | Your very first task is to get familiar with the script structure and the generated output.
Run the [`diffusion_2d-jl`](diffusion_2d.jl) script, verifying that plotting works and assess the reported effective memory throughput `T_eff` (in the REPL). 15 | 16 | Repeat the same for the [`diffusion_2d_loop.jl`](diffusion_2d_loop.jl) script. 17 | -------------------------------------------------------------------------------- /parts/diffusion_2d/diffusion_2d.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introducing the example: Diffusion 2D" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Overview" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "* **The brief physics intro**\n", 22 | " * 2D explicit diffusion using the finite-difference method\n", 23 | "\n", 24 | "* **The code structure overview**\n", 25 | " * Compute and \"main\" functions\n", 26 | " * [`shared.jl`](./../shared.jl) (included) script\n", 27 | "\n", 28 | "* **The output**\n", 29 | " * Visualisation\n", 30 | " * `Time` and `T_eff` - Performance reporting in the REPL\n", 31 | "\n", 32 | "* **The serial loop version**\n", 33 | " * Macros, \"race\" conditions\n", 34 | " * Moving to a single compute function" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## The brief physics intro" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "During this workshop, we will use a 2D diffusion solver to investigate how Julia allows us to tackle various HPC concepts in a concise fashion, without trading performance.\n", 49 | "\n", 50 | "We will solve the 2D diffusion equation for a quantity $C$,\n", 51 | "$$\n", 52 | "\\frac{∂C}{∂t} = -∇ ⋅ q~,\n", 53 | "$$\n", 54 | "where $q$ represents the diffusive flux:\n", 55 | "$$\n", 56 | "q = -D \\; ∇C~,\n", 57 | "$$\n", 58 | "and where $D$ stands for the diffusion coefficient.\n", 59 | "\n", 60 | "\n", 61 | "We will solve this partial differential equation (PDE) using the finite-difference method and an explicit forward Euler time integrator on a regular staggered Cartesian grid.\n", 62 | "\n", 63 | "\n", 64 | "\n", 65 | "The 2D domain is of size $L=10$ and the scalar linear diffusion coefficient $D=1$. We use a constant grid size `ds = L / ns`, where `ns` represent the number of finite-difference cells in both $x$ and $y$ dimension.\n", 66 | "\n", 67 | "As initial condition, we define a Gaussian perturbation centred in the middle of the domain of amplitude and standard deviation equal to 1.\n", 68 | "\n", 69 | "" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## The code structure overview" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Let's have a look at the code structure. We'll first look at the [`diffusion_2d.jl`](diffusion_2d.jl) script. It contains:\n", 84 | "- 2 compute functions implementing the spatial and temporal discretisation of the PDE;\n", 85 | "\n", 86 | "- a \"main\" function to run the code;\n", 87 | "\n", 88 | "- an include statement for [`shared.jl`](./../shared.jl), mostly containing parameters and arrays initialisation, and visualisation." 
89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## The output" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Graphics\n", 103 | "\n", 104 | "The visualisation renders the evolution of the distribution of the diffusing quantity $C$ throughout the simulation at frequency intervals defined by `nout = nt / 5`." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Timing and performance\n", 112 | "\n", 113 | "Besides plotting, the code also reports performance using wall-time and effective memory throughput as metric and prints in the REPL." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "#### Wall time\n", 121 | "\n", 122 | "The first performance metric is wall time, or total runtime. It is computed using a simple custom tic-toc approach, making sure to skip the first 10 iterations to account for \"warm-up\". For any performance assessment, we should make sure to run the code for at least 1 second in order to ensure reliable results.\n", 123 | "\n", 124 | "#### Effective memory throughput\n", 125 | "\n", 126 | "The second metric is the effective memory throughput $T_\\mathrm{eff}$ (`T_eff` in the REPL). It defines as the **non-redundant** memory access per iteration divided by the time per iteration $t_\\mathrm{it}$ (in sec.):\n", 127 | "$$\n", 128 | "T_\\mathrm{eff} = \\frac{A_\\mathrm{eff}}{t_\\mathrm{it}}~,\n", 129 | "$$\n", 130 | "where $A_\\mathrm{eff} = n_\\mathrm{IO} ~ n_s^2 ~ s_\\mathrm{DAT} ~ 10^{-9}$ is the effective memory access (in GB).\n", 131 | "\n", 132 | "In our example, $n_\\mathrm{IO} = 2$ as we only need to read old values of $C$ and write them back to solve the diffusion PDE. $s_\\mathrm{DAT} = 8$ as we are running double precision floating point arithmetic.\n", 133 | "\n", 134 | "$T_\\mathrm{eff}$ provides an idea on how far from the performance of memory copy only memory-bounded codes are, under various assumptions. Refer to [Räss et al. (2022)](https://doi.org/10.5194/gmd-15-5757-2022) for details.\n", 135 | "\n", 136 | "We will further use this metric in the GPU computing part." 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## The serial loop version\n", 144 | "\n", 145 | "The final step to look at, before we start our deep dive, is the serial loop version of the 2D diffusion code. If we now open the [`diffusion_2d_loop.jl`](diffusion_2d_loop.jl) script aside the vectorized one ([`diffusion_2d.jl`](diffusion_2d.jl)), we can diff them \"by eye\" to see the major change being the change in the `diffusion_step!` function.\n", 146 | "\n", 147 | "In a nutshell:\n", 148 | "- we do no longer explicitly assign flux computation results to temporary variable in global memory (previously `qx` and `qy`);\n", 149 | "\n", 150 | "- we introduce a nested loop of spacial dimensions respecting a **column major order**;\n", 151 | "\n", 152 | "- we introduce a temporary second array `C2` to not read and write from the same array in order to avoid race conditions;\n", 153 | "\n", 154 | "- we use `@inbounds` upon having verified the correctness of the results to skip bound-checking." 
155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "**→ Hands-on** (see [README.md](README.md))" 162 | ] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Julia 1.10.4", 168 | "language": "julia", 169 | "name": "julia-1.10" 170 | }, 171 | "language_info": { 172 | "file_extension": ".jl", 173 | "mimetype": "application/julia", 174 | "name": "julia", 175 | "version": "1.10.4" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /parts/diffusion_2d/diffusion_2d.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - serial, vectorized 2 | using Printf 3 | using CairoMakie 4 | include(joinpath(@__DIR__, "../shared.jl")) 5 | 6 | function compute_flux!(params, qx, qy, C) 7 | (; D, ds) = params 8 | @views qx .= .-D .* diff(C[:, 2:end-1], dims=1) ./ ds 9 | @views qy .= .-D .* diff(C[2:end-1, :], dims=2) ./ ds 10 | return 11 | end 12 | 13 | function diffusion_step!(params, C, qx, qy) 14 | (; ds, dt) = params 15 | @views C[2:end-1, 2:end-1] .-= dt .* (diff(qx, dims=1) ./ ds .+ diff(qy, dims=2) ./ ds) 16 | return 17 | end 18 | 19 | function run_diffusion(; ns=64, nt=100, do_visualize=false) 20 | params = init_params(; ns, nt, do_visualize) 21 | C, qx, qy = init_arrays_with_flux(params) 22 | fig, plt = maybe_init_visualization(params, C) 23 | t_tic = 0.0 24 | # time loop 25 | for it in 1:nt 26 | # time after warmup (ignore first 10 iterations) 27 | (it == 11) && (t_tic = Base.time()) 28 | # diffusion 29 | compute_flux!(params, qx, qy, C) 30 | diffusion_step!(params, C, qx, qy) 31 | # visualization 32 | maybe_update_visualization(params, fig, plt, C, it) 33 | end 34 | t_toc = (Base.time() - t_tic) 35 | print_perf(params, t_toc) 36 | return nothing 37 | end 38 | 39 | # Running things... 
40 | 41 | # enable visualization by default 42 | (!@isdefined do_visualize) && (do_visualize = true) 43 | # enable execution by default 44 | (!@isdefined do_run) && (do_run = true) 45 | 46 | if do_run 47 | run_diffusion(; ns=256, nt=500, do_visualize) 48 | end 49 | -------------------------------------------------------------------------------- /parts/diffusion_2d/diffusion_2d_loop.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - serial, loop version 2 | using Printf 3 | using CairoMakie 4 | include(joinpath(@__DIR__, "../shared.jl")) 5 | 6 | # convenience macros simply to avoid writing nested finite-difference expression 7 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end 8 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end 9 | 10 | function diffusion_step!(params, C2, C) 11 | (; ds, dt, D) = params 12 | # respect column major order 13 | for iy in 1:size(C, 2)-2 14 | for ix in 1:size(C, 1)-2 15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds + 16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds) 17 | end 18 | end 19 | return nothing 20 | end 21 | 22 | function run_diffusion(; ns=64, nt=100, do_visualize=false) 23 | params = init_params(; ns, nt, do_visualize) 24 | C, C2 = init_arrays(params) 25 | fig, plt = maybe_init_visualization(params, C) 26 | t_tic = 0.0 27 | # time loop 28 | for it in 1:nt 29 | # time after warmup (ignore first 10 iterations) 30 | (it == 11) && (t_tic = Base.time()) 31 | # diffusion 32 | diffusion_step!(params, C2, C) 33 | C, C2 = C2, C # pointer swap 34 | # visualization 35 | maybe_update_visualization(params, fig, plt, C, it) 36 | end 37 | t_toc = (Base.time() - t_tic) 38 | print_perf(params, t_toc) 39 | return nothing 40 | end 41 | 42 | # Running things... 
43 | 44 | # enable visualization by default 45 | (!@isdefined do_visualize) && (do_visualize = true) 46 | # enable execution by default 47 | (!@isdefined do_run) && (do_run = true) 48 | 49 | if do_run 50 | run_diffusion(; ns=256, do_visualize) 51 | end 52 | -------------------------------------------------------------------------------- /parts/diffusion_2d/imgs/initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/diffusion_2d/imgs/initial.png -------------------------------------------------------------------------------- /parts/diffusion_2d/imgs/stagg_2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/diffusion_2d/imgs/stagg_2D.png -------------------------------------------------------------------------------- /parts/distributed/explanation/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e" 3 | Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" 4 | DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94" 5 | NetworkInterfaceControllers = "6f74fd91-2978-43ad-8164-3af8c0ec0142" 6 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" 7 | -------------------------------------------------------------------------------- /parts/gpu/README.md: -------------------------------------------------------------------------------- 1 | # Diffusion 2D - GPU acceleration 2 | 3 | In this part, we want to use GPU computing and multi-GPUs (distributed memory parallelization) to accelerate our Diffusion 2D example. 4 | 5 | The starting point is the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). The file [`diffusion_2d_cuda.jl`](./diffusion_2d_cuda.jl) in this folder is a slightly modified copy of this version. Specifically, we included the gpu initialization of the arrays `C` and `C2` in form of the function `init_arrays_gpu` and left the computational kernel (`diffusion_step_kernel!`) and the wrapper function (`diffusion_step!`) mostly unimplemented. 6 | 7 | In a second step, we will merge the CUDA and MPI codes in order to achieve a multi-GPU diffusion solver. For this task, the starting point is the [`diffusion_2d_mpi.jl`](./../mpi/diffusion_2d_mpi.jl) script. The file [`diffusion_2d_cuda_mpi.jl`](./diffusion_2d_cuda_mpi.jl) in this folder is a slightly modified copy of this version. Specifically, we included the gpu mpi initialization of the arrays `C` and `C2` in form of the function `init_arrays_gpu_mpi` and left the `update_halo!` and `init_bufs` functions mostly unimplemented. We also did not yet implement the GPU selection from local MPI rank. 8 | 9 | Note that there are few code stubs (indicated by `TODO` comments) that you will implement in the tasks below. 10 | 11 | Recall that on the GPU, you need to explicitly specify the data type to be `Float64` as CUDA.jl defaults to `Float32`. 12 | 13 | ## Reminder 14 | 15 | Remember that, on Perlmutter, **you can't run GPU or MPI processes on a login node**. You have two options to work on a compute node: 16 | 17 | 1) **Interactive session**: You can try to get an interactive session on a compute node by running `sh get_gpu_compute_node_interactive.sh` (but unfortunately, we don't have a node for everyone). 
**If you can get one**, you can: 18 | - single GPU script: launch Julia from the interactive session and run the single GPU script. Alternatively, you can run `sh job_bench_gpu.sh`. 19 | - multi-GPU: run the GPU MPI code via `mpiexecjl --project -n 4 julia diffusion_2d_cuda_mpi.jl`. Alternatively, you can run `sh job_gpu_mpi_singlenode.sh`. 20 | 21 | 2) **Compute job**: You can always submit a job that runs the code: `sbatch job_gpu_mpi_singlenode.sh`. The output will land in `slurm_gpu_mpi_singlenode.out`. Check out the [Perlmutter cheatsheet](../../help/perlmutter_cheatsheet.md) to learn more about jobs. 22 | 23 | ## Task 1 - CUDA `diffusion_step_kernel!` 24 | 25 | ### Part A 26 | 27 | Your first task is to take the diffusion kernel from `diffusion_2d_loop.jl` and replace the nested loop over spatial dimensions by "vectorized" CUDA indices. See the `TODO` comments inside the `diffusion_step_kernel!` function. Make sure to correctly handle the ranges where the computation should occur, given that we do not want to update the boundary cells of the `C2` array. 28 | 29 | Then you should complete the wrapper function `diffusion_step!` that we use to call the GPU kernel (which allows us to have the same function call signature in the `run_diffusion` function). Use the appropriate CUDA launch parameters. 30 | 31 | Note that the number of threads and blocks used to execute the kernel is defined in `init_params_gpu` from [`shared.jl`](./../shared.jl) as: 32 | ```julia 33 | nthreads = 32, 8 # number of threads per block 34 | nblocks = cld.(ns, nthreads) # number of blocks 35 | ``` 36 | 37 | **Question:** 38 | * How did you implement the appropriate range selection? 39 | 40 | ### Part B 41 | 42 | Let's make a rough performance benchmark. Run your implementation on a single Nvidia A100 GPU and compare timings/`T_eff` ("strong scaling"). Perform this comparison for five values of `ns`, for example 512, 2048, 4096, 8192 and 16384. 43 | 44 | **How to run the code?** 45 | 46 | You can either perform the rough benchmark in an interactive Julia session or use the script `job_bench_gpu.sh`. 47 | 48 | * Interactive: 49 | * Set `do_save=false`. 50 | * Use `include("diffusion_2d_cuda.jl")` to run the code. 51 | 52 | * Script: 53 | * Either just run the script on the current node (`sh job_bench_gpu.sh`) or submit it as a job to SLURM (`sbatch job_bench_gpu.sh`). In the latter case, the output will end up in a file called `slurm_bench_gpu.out`. 54 | 55 | **Questions:** 56 | * What do you observe? 57 | * What about the performance as a function of `ns`? 58 | * How does it compare to the peak memory throughput of the Nvidia A100 (memcopy only)? 59 | 60 | ## Task 2 - Multi-GPUs 61 | 62 | In this second task, we will see how to combine GPUs and MPI in order to achieve distributed memory parallelization on multiple GPUs. This step is the gateway to running Julia at scale on the latest GPU-accelerated supercomputers such as NERSC's Perlmutter. 63 | 64 | We will first make the required changes to the code (Part A), test our implementation (Part B) and perform a weak scaling test (Part C). 65 | 66 | ### Part A 67 | 68 | Complete the `update_halo!` and `init_bufs` functions, taking inspiration from the CPU MPI script and making sure to use the correct data type for the GPU buffers (see the `TODO`s therein). 69 | 70 | Then, in the `run_diffusion` function, we need to implement a procedure to map the GPUs of each node to the MPI processes running on that same node. There are various ways to achieve this.
Here, we will use an MPI shared-memory communicator to detect all ranks on the same node: 71 | 1. We can use `MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)` from MPI.jl, passing the existing communicator `comm` and the global rank `me`, to retrieve the node-local communicator `comm_l`. 72 | 2. We then need to retrieve the rank from `comm_l`, which gives us the node-local rank `me_l`. 73 | 3. We can then use it to select the GPU device via `gpu_id = CUDA.device!(me_l)` (a minimal sketch of this procedure is given at the end of this README). 74 | 75 | ### Part B 76 | 77 | We will now run the GPU MPI code on a single node using all 4 Nvidia A100 GPUs on that node, assess whether the GPU selection works, and check the correctness of the implementation by doing an "eye test" on the plotting results. 78 | 79 | **How to run the code?** 80 | 81 | You can run the GPU MPI script by submitting it as a job to SLURM (`sbatch job_gpu_mpi_singlenode.sh`). The output will end up in a file called `slurm_gpu_mpi_singlenode.out`. 82 | 83 | Then, try running the same job but this time on 4 nodes, using 1 GPU per node. You can achieve this by using the `job_gpu_mpi_multinode.sh` script we prepared for you. 84 | 85 | **How to visualize the results?** 86 | 87 | The code will save one output file per rank, with the rank ID in the filename, e.g. `out_$(me).jld2`. 88 | 89 | You can run the [`visualize_mpi.jl`](./visualize_mpi.jl) script in order to visualise the results. The visualization script defines the `vizme2D_mpi(nprocs)` function, which takes `nprocs` as argument, defaulting to `(2, 2)`, our default MPI topology. 90 | 91 | **Questions:** 92 | * Do you observe correct diffusion results, for both the singlenode and multinode configurations? 93 | * Is each MPI rank accessing a different GPU on its node? 94 | 95 | ### Part C 96 | 97 | As a last step, we will perform a weak scaling test to assess the parallel efficiency of our implementation. For this, we should set the spatial resolution `ns` to the value that showed the best performance in the strong scaling experiment from Task 1, possibly adapting `nt` such that the code does not run much longer than 1 second, and setting `do_save = false`. 98 | 99 | Then run the GPU MPI script on one MPI rank (thus one GPU) in order to assess the baseline performance. Once this is done, increase the number of MPI ranks while keeping the same local problem size, so that the global problem scales linearly with the computing resources. Performance tests could be run for 1, 4, 9, 16, (64) ranks. Parallel efficiency can be reported by normalising the $T_\mathrm{eff}$ or wall-time obtained for runs with > 1 rank by the single-rank performance (see the short helper sketched below). 100 | 101 | **Questions:** 102 | * What parallel efficiency do you observe? 103 | * If it drops, what workaround could one implement?
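
For the parallel-efficiency reporting in Part C, a few lines of Julia are enough. The sketch below is illustrative only: the `Teff` values are placeholders, not measurements; insert your own results.

```julia
# Weak-scaling parallel efficiency: normalise the per-rank T_eff by the single-rank baseline.
# The values below (GB/s) are placeholders for illustration - replace them with your measurements.
Teff = Dict(1 => 1300.0, 4 => 1250.0, 16 => 1200.0)
efficiency(nranks) = Teff[nranks] / Teff[1]
for n in sort(collect(keys(Teff)))
    println("ranks = $n  =>  parallel efficiency = $(round(100 * efficiency(n); digits=1)) %")
end
```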
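
And, as a reference for Task 2, Part A, here is a minimal sketch of the node-local GPU selection described above. It follows the variable names used in the exercise (`comm`, `me`, `comm_l`, `me_l`, `gpu_id`); in `diffusion_2d_cuda_mpi.jl` the three numbered steps would live inside `run_diffusion`, and they are shown here as a standalone snippet only for clarity (one possible approach, not the only one).

```julia
# Minimal sketch: map each MPI rank to one GPU of its node.
using MPI, CUDA

MPI.Init()
comm = MPI.COMM_WORLD
me   = MPI.Comm_rank(comm)

# 1. split the global communicator into node-local ("shared memory") groups
comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)
# 2. node-local rank of this MPI process
me_l = MPI.Comm_rank(comm_l)
# 3. bind this rank to the corresponding GPU of its node
gpu_id = CUDA.device!(me_l)
println("$(gpu_id), out of: $(ndevices())")
```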
104 | -------------------------------------------------------------------------------- /parts/gpu/advanced/closest_device.jl: -------------------------------------------------------------------------------- 1 | using CpuId, MPI, CUDA, Hwloc, AbstractTrees 2 | 3 | import AbstractTrees: PreOrderDFS 4 | import Hwloc: hwloc_pci_class_string 5 | 6 | import Base: filter, Fix1 7 | filter(f::Function)::Function = Fix1(filter, f) 8 | 9 | const cpucycle_mask = ( 10 | (1 << (64 - leading_zeros(CpuId.cputhreads()))) - 1 11 | ) % UInt32 12 | 13 | cpucycle_coreid() = Int(cpucycle_id()[2] & cpucycle_mask) 14 | 15 | function get_device_attributes() 16 | attr = Dict{Tuple{Int32, Int32}, Int32}() 17 | for i in 0:(ndevices()-1) 18 | d = CuDevice(i) 19 | attr[( 20 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID), 21 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) 22 | )] = d 23 | end 24 | attr 25 | end 26 | 27 | function tag_subtree!(tree_node, val) 28 | for n in collect(AbstractTrees.PreOrderDFS(tree_node)) 29 | n.tag = val 30 | end 31 | end 32 | 33 | function distance_to_core!(node, target_index) 34 | # shield re-entrance when iterating 35 | node.tag = 1 36 | 37 | if node.type == :PU 38 | # println("Checking: $(nodevalue(node).os_index)") 39 | if nodevalue(node).os_index == target_index 40 | return true, 0 41 | end 42 | end 43 | 44 | for child in node.children 45 | if child.tag == 1 46 | continue 47 | end 48 | 49 | found, dist = distance_to_core!(child, target_index) 50 | if found 51 | return true, dist + 1 52 | end 53 | end 54 | 55 | if node.parent != nothing 56 | found, dist = distance_to_core!(node.parent, target_index) 57 | if found 58 | return true, dist + 1 59 | end 60 | end 61 | 62 | return false, typemax(Int) 63 | end 64 | 65 | function distance_to_core(root, node, target_index) 66 | tag_subtree!(root, 0) 67 | found, dist = distance_to_core!(node, target_index) 68 | tag_subtree!(root, 0) 69 | return found, dist 70 | end 71 | 72 | sys_devs = children(gettopology()) 73 | pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device) 74 | gpu_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == "3D") 75 | 76 | function get_device_distances(core) 77 | attr = get_device_attributes() 78 | dist = Dict{Int32, Int32}() 79 | dev = Dict{Int32, Int32}() 80 | for d in gpu_devs 81 | idx = attr[(nodevalue(d).attr.bus, nodevalue(d).attr.dev)] 82 | found, dev_d = distance_to_core(sys_devs, d, core) 83 | if found 84 | dist[idx] = dev_d 85 | dev[dev_d] = idx 86 | end 87 | end 88 | dist, dev 89 | end 90 | 91 | dist, dev = get_device_distances(cpucycle_coreid()) 92 | closest_dev = dev[dev |> keys |> minimum] 93 | println(closest_dev) 94 | -------------------------------------------------------------------------------- /parts/gpu/advanced/job_gpu_mpi_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=4 4 | #SBATCH --ntasks-per-node=4 5 | #SBATCH --constraint=gpu 6 | #SBATCH --account=ntrain1 7 | #SBATCH --output=slurm_gpu_mpi_multinode.out 8 | #SBATCH --qos=regular 9 | 10 | # pin to closest NIC to GPU 11 | export MPICH_OFI_NIC_POLICY=GPU 12 | 13 | # Load julia 14 | ml use /global/common/software/nersc/n9/julia/modules 15 | ml julia 16 | 17 | mpiexecjl -G 16 -c 32 --project julia closest_device.jl 18 | -------------------------------------------------------------------------------- /parts/gpu/advanced/job_gpu_mpi_singlenode.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=4 5 | #SBATCH --constraint=gpu 6 | #SBATCH --account=ntrain1 7 | #SBATCH --output=slurm_gpu_mpi_singlenode.out 8 | #SBATCH --qos=regular 9 | 10 | # pin to closest NIC to GPU 11 | export MPICH_OFI_NIC_POLICY=GPU 12 | 13 | # Load julia 14 | ml use /global/common/software/nersc/n9/julia/modules 15 | ml julia 16 | 17 | mpiexecjl -G 4 -c 32 --project julia closest_device.jl 18 | -------------------------------------------------------------------------------- /parts/gpu/diffusion_2d_cuda.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - GPU cuda version 2 | using Printf 3 | using JLD2 4 | using CUDA 5 | include(joinpath(@__DIR__, "../shared.jl")) 6 | 7 | # convenience macros simply to avoid writing nested finite-difference expression 8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(ds))) end 9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(ds))) end 10 | 11 | function diffusion_step_kernel!(params, C2, C) 12 | (; ds, dt, D) = params 13 | # 14 | # !! TODO !! 15 | # 16 | # We want to replace the nested loop over spatial dimensions by "vecotized" CUDA indices. 17 | # Based off of the serial kernel (see README.md or diffusion_2d_loop.jl) implement 18 | # the CUDA variant using CUDA.jl taking care the handle to range in an appropriate 19 | # manner (see "TODO..." below). 20 | # 21 | ix = # TODO # CUDA vectorised unique index 22 | iy = # TODO # CUDA vectorised unique index 23 | if # TODO select correct range 24 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(ds) + 25 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(ds)) 26 | end 27 | return nothing 28 | end 29 | 30 | function diffusion_step!(params, C2, C) 31 | (; nthreads, nblocks) = params 32 | # 33 | # !! TODO !! 34 | # 35 | # Complete the CPU wrapper function calling the `diffusion_step_kernel!` 36 | # using the `@cuda` macro and appropriate launch parameters (see "TODO..." below). 37 | # 38 | @cuda # TODO 39 | return nothing 40 | end 41 | 42 | function run_diffusion(; ns=64, nt=100, do_save=false) 43 | params = init_params_gpu(; ns, nt, do_save) 44 | C, C2 = init_arrays_gpu(params) 45 | t_tic = 0.0 46 | # Time loop 47 | for it in 1:nt 48 | # time after warmup (ignore first 10 iterations) 49 | (it == 11) && (t_tic = Base.time()) 50 | # diffusion 51 | diffusion_step!(params, C2, C) 52 | C, C2 = C2, C # pointer swap 53 | end 54 | # synchronize the gpu before querying the final time 55 | # TODO # Add synchronization 56 | t_toc = (Base.time() - t_tic) 57 | print_perf(params, t_toc) 58 | do_save && jldsave(joinpath(@__DIR__, "out_gpu.jld2"); C = Array(C), l = params.L) 59 | return nothing 60 | end 61 | 62 | # Running things... 
63 | 64 | # enable visualization by default 65 | (!@isdefined do_save) && (do_save = true) 66 | # enable execution by default 67 | (!@isdefined do_run) && (do_run = true) 68 | 69 | if do_run 70 | if !isempty(ARGS) 71 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save) 72 | else 73 | run_diffusion(; ns=256, do_save) 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /parts/gpu/diffusion_2d_cuda_mpi.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - GPU MPI 2 | using Printf 3 | using JLD2 4 | using CUDA 5 | using MPI 6 | include(joinpath(@__DIR__, "../shared.jl")) 7 | 8 | # convenience macros simply to avoid writing nested finite-difference expression 9 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(dx))) end 10 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(dy))) end 11 | 12 | function diffusion_step_kernel!(params, C2, C) 13 | (; dx, dy, dt, D) = params 14 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index 15 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index 16 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2 17 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(dx) + 18 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(dy)) 19 | end 20 | return nothing 21 | end 22 | 23 | function diffusion_step!(params, C2, C) 24 | (; nthreads, nblocks) = params 25 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C) 26 | return nothing 27 | end 28 | 29 | # MPI functions 30 | @views function update_halo!(A, bufs, neighbors, comm) 31 | # 32 | # !! TODO !! 33 | # 34 | # We want to replace use the `update_halo!` function defined in the CPU MPI script 35 | # and use it here. Since we are using GPU-aware MPI, we can directly re-use the 36 | # function since MPI communication will take care of exchanging halo values living 37 | # in GPU memory. 38 | # 39 | return 40 | end 41 | 42 | function init_bufs(A) 43 | # 44 | # !! TODO !! 45 | # 46 | # We are using GPU-aware MPI, which greatly simplifies the implementation and ensures 47 | # good performance. GPU-aware MPI exchanges GPU pointers and thus we shpuld initialize 48 | # the send and receive buffers on the GPU memory. Complete the missing `return` statement 49 | # by replicating what we did for CPU MPI but making sure to initialise buffers on the GPU 50 | # using the correct data type (Float64). 51 | # 52 | return (; # TODO ) 53 | end 54 | 55 | function run_diffusion(; ns=64, nt=100, do_save=false) 56 | MPI.Init() 57 | comm = MPI.COMM_WORLD 58 | nprocs = MPI.Comm_size(comm) 59 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple 60 | comm_cart = MPI.Cart_create(comm, dims) 61 | me = MPI.Comm_rank(comm_cart) 62 | coords = MPI.Cart_coords(comm_cart) |> Tuple 63 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1)) 64 | # select GPU on multi-GPU system based on shared memory topology 65 | # 66 | # !! TODO !! 67 | # 68 | # We need to define a local MPI communicator based on `MPI.COMM_TYPE_SHARED` in order to 69 | # retireve the node-local rank of the MPI processes given we want to map each GPU from one 70 | # node to a MPI rank. Then we want to get the rank from the new communicator and use 71 | # it to set the GPU device. 
72 | # 73 | println("$(gpu_id), out of: $(ndevices())") 74 | (me == 0) && println("nprocs = $(nprocs), dims = $dims") 75 | 76 | params = init_params_gpu_mpi(; dims, coords, ns, nt, do_save) 77 | C, C2 = init_arrays_gpu_mpi(params) 78 | bufs = init_bufs(C) 79 | t_tic = 0.0 80 | # Time loop 81 | for it in 1:nt 82 | # time after warmup (ignore first 10 iterations) 83 | (it == 11) && (t_tic = Base.time()) 84 | # diffusion 85 | diffusion_step!(params, C2, C) 86 | update_halo!(C2, bufs, neighbors, comm_cart) 87 | C, C2 = C2, C # pointer swap 88 | end 89 | # synchronize the gpu before querying the final time 90 | CUDA.synchronize() 91 | t_toc = (Base.time() - t_tic) 92 | # "master" prints performance 93 | (me == 0) && print_perf(params, t_toc) 94 | # save to (maybe) visualize later 95 | if do_save 96 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L)) 97 | end 98 | MPI.Finalize() 99 | return 100 | end 101 | 102 | # Running things... 103 | 104 | # enable save to disk by default 105 | (!@isdefined do_save) && (do_save = true) 106 | # enable execution by default 107 | (!@isdefined do_run) && (do_run = true) 108 | 109 | if do_run 110 | run_diffusion(; ns=256, do_save) 111 | end 112 | -------------------------------------------------------------------------------- /parts/gpu/get_gpu_compute_node_interactive.sh: -------------------------------------------------------------------------------- 1 | salloc --nodes 1 --qos interactive --time 00:45:00 --constraint gpu --account=ntrain1 2 | -------------------------------------------------------------------------------- /parts/gpu/gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GPU acceleration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Overview" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "* **Why to bother with GPU computing in 2024**\n", 22 | " * HPC and Supercomputing is GPU-accelerated\n", 23 | " * When Julia overcomes the two-language barrier\n", 24 | "\n", 25 | "* **GPU computing Fast-Forward**\n", 26 | " * Array vs Kernel programming\n", 27 | " * Performance considerations\n", 28 | "\n", 29 | "* **Going multi-GPUs**\n", 30 | " * MPI + GPUs" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Why to still bother with GPU computing in 2024\n", 38 | "- It's around for more than a decade\n", 39 | "- It shows massive performance gain compared to serial CPU computing\n", 40 | "- First exascale supercomputer, Frontier, is full of GPUs\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "### Performance that matters\n", 45 | "\n", 46 | "\n", 47 | "\n", 48 | "Taking a look at a recent GPU and CPU:\n", 49 | "- Nvidia Tesla A100 GPU\n", 50 | "- AMD EPYC \"Rome\" 7282 (16 cores) CPU\n", 51 | "\n", 52 | "| Device | TFLOP/s (FP64) | Memory BW TB/s | Imbalance (FP64) |\n", 53 | "| :------------: | :------------: | :------------: | :------------------: |\n", 54 | "| Tesla A100 | 9.7 | 1.55 | 9.7 / 1.55 × 8 = 50 |\n", 55 | "| AMD EPYC 7282 | 0.7 | 0.085 | 0.7 / 0.085 × 8 = 66 |\n", 56 | "\n", 57 | "**Meaning:** we can do about 50 floating point operations per number accessed from main memory.\n", 58 | "Floating point operations are \"for free\" when we work in memory-bounded regimes.\n", 59 | "\n", 60 | "👉 Requires 
re-thinking the numerical implementation and solution strategies\n", 61 | "\n", 62 | "Unfortunately, the cost of evaluating a first derivative $∂A / ∂x$ in, e.g., diffusive flux calculations using finite-differences:\n", 63 | "\n", 64 | "`q[ix] = -D * (A[ix+1] - A[ix]) / dx`\n", 65 | "\n", 66 | "consists of:\n", 67 | "- 1 read (`A`) + 1 write (`q`) => $2 × 8$ = **16 Bytes transferred**\n", 68 | "- 1 addition + 1 multiplication + 1 division => **3 floating point operations**\n", 69 | "\n", 70 | "👉 assuming `D`, `dx` are scalars, `q` and `A` are arrays of `Float64` (read from main memory)\n", 71 | "\n", 72 | "### Performance that matters - an example\n", 73 | "Not yet convinced? Let's have a look at an example.\n", 74 | "\n", 75 | "Let's assess how close from memory copy (1400 GB/s) we can get solving a 2D diffusion problem on an Nvidia Tesla A100 GPU.\n", 76 | "\n", 77 | "$$ \\frac{\\partial C}{\\partial t} = \\frac{\\partial^2 C}{\\partial x^2} + \\frac{\\partial^2 C}{\\partial y^2} $$\n", 78 | "\n", 79 | "👉 Let's test the performance using a simple script." 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "### Measuring GPU performance\n", 87 | "\n", 88 | "Load modules:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "using CUDA\n", 98 | "using BenchmarkTools\n", 99 | "using Printf" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Memory copy function to measure the \"peak\" memory throughput:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "mycopy! (generic function with 1 method)" 118 | ] 119 | }, 120 | "metadata": {}, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "function mycopy!(A, B)\n", 126 | " ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x\n", 127 | " iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y\n", 128 | " if ix <= size(A, 1) && iy <= size(A, 2)\n", 129 | " @inbounds A[ix, iy] = B[ix, iy] + 1\n", 130 | " end\n", 131 | " return\n", 132 | "end" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "Laplacian kernel using the finite difference method (FDM):" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "laplacian! (generic function with 1 method)" 151 | ] 152 | }, 153 | "metadata": {}, 154 | "output_type": "display_data" 155 | } 156 | ], 157 | "source": [ 158 | "function laplacian!(A, B, dt, _dx2, _dy2)\n", 159 | " ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x\n", 160 | " iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y\n", 161 | " if ix <= size(A, 1) - 2 && iy <= size(A, 2) - 2\n", 162 | " @inbounds A[ix+1, iy+1] = B[ix+1, iy+1] + dt *\n", 163 | " ((B[ix+2, iy+1] - 2 * B[ix+1, iy+1] + B[ix, iy+1]) * _dx2 +\n", 164 | " (B[ix+1, iy+2] - 2 * B[ix+1, iy+1] + B[ix+1, iy]) * _dy2)\n", 165 | " end\n", 166 | " return\n", 167 | "end" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Let's test the performance!" 
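The quantity reported below is the *effective memory throughput* $T_\mathrm{eff}$: the minimum data each kernel has to move (one read and one write per grid cell, i.e. $2\, n_x n_y \cdot 8$ Bytes for `Float64`) divided by the measured runtime. Comparing $T_\mathrm{eff}$ against the copy kernel and the theoretical peak bandwidth shows how close the stencil gets to the memory-bound limit.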
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Effective memory throughput (copy) : 1335.85 GB/s\n", 187 | "Effective memory throughput (laplacian) : 1303.32 GB/s\n", 188 | "Theoretical peak memory throughput : 1555.20 GB/s\n", 189 | "\n", 190 | "Wow 🚀! Laplacian runs at:\n", 191 | " 97.56% of copy speed\n", 192 | " 83.80% of peak memory bandwidth\n", 193 | "on a NVIDIA A100-SXM4-40GB device\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# if the array size is too small, the GPU will not be fully utilized\n", 199 | "nx = ny = 512 * 32\n", 200 | "A = CUDA.rand(Float64, nx, ny)\n", 201 | "B = CUDA.rand(Float64, nx, ny)\n", 202 | "\n", 203 | "_dx2 = _dy2 = dt = rand()\n", 204 | "\n", 205 | "# launch configuration\n", 206 | "nthreads = (16, 16)\n", 207 | "nblocks = cld.((nx, ny), nthreads)\n", 208 | "\n", 209 | "# measure the execution times\n", 210 | "time_copy = @belapsed CUDA.@sync @cuda threads=nthreads blocks=nblocks mycopy!(A, B)\n", 211 | "time_lapl = @belapsed CUDA.@sync @cuda threads=nthreads blocks=nblocks laplacian!(A, B, dt, _dx2, _dy2)\n", 212 | "\n", 213 | "# effective memory throughput (1 read + 1 write per element)\n", 214 | "Teff_copy = 2 * nx * ny * sizeof(Float64) / time_copy / 1e9\n", 215 | "Teff_lapl = 2 * nx * ny * sizeof(Float64) / time_lapl / 1e9\n", 216 | "\n", 217 | "# compute theoretical peak memory bandwidth\n", 218 | "dev = CUDA.device()\n", 219 | "\n", 220 | "bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) |> Float64 # in bits\n", 221 | "clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) |> Float64 # in kHz\n", 222 | "rate_multiplier = 2 # 2 for HBM2/DDR, 4 for HBM3/GDDR5, 8 for GDDR6\n", 223 | "\n", 224 | "Teff_peak = bus_width * clock_rate * rate_multiplier / 1e6 / 8\n", 225 | "\n", 226 | "# report results\n", 227 | "@printf(\"Effective memory throughput (copy) : %.2f GB/s\\n\", Teff_copy)\n", 228 | "@printf(\"Effective memory throughput (laplacian) : %.2f GB/s\\n\", Teff_lapl)\n", 229 | "@printf(\"Theoretical peak memory throughput : %.2f GB/s\\n\", Teff_peak)\n", 230 | "\n", 231 | "@printf(\"\\nWow 🚀! Laplacian runs at:\\n\")\n", 232 | "@printf(\" %.2f%% of copy speed\\n\" , 100 * Teff_lapl / Teff_copy)\n", 233 | "@printf(\" %.2f%% of peak memory bandwidth\\n\", 100 * Teff_lapl / Teff_peak)\n", 234 | "@printf(\"on a %s device\\n\", CUDA.name(dev))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### GPU kernel programming\n", 242 | "\n", 243 | "We'll get started with a brief overview of the Nvidia GPU architecture and how to program it.\n", 244 | "\n", 245 | "The Nvidia general purpose GPUs can be programmed using the CUDA language extension. CUDA is accessible in Julia via [CUDA.jl](https://cuda.juliagpu.org/stable/), which exposes most of the native CUDA features to the Julia ecosystem.\n", 246 | "\n", 247 | "In the CUDA programming model, `blocks` of `threads` compose the `grid`. 
In our implementation, we want to map one thread to each finite-difference cell of the 2D Cartesian domain.\n", 248 | "\n", 249 | "The figure hereafter depicts the relation between the CUDA domain and the finite-difference domain:\n", 250 | "\n", 251 | "\n", 252 | "\n", 253 | "**Playing with GPUs: the rules**\n", 254 | "\n", 255 | "- Current GPUs allow typically a maximum of 1024 threads per block.\n", 256 | "\n", 257 | "- The maximum number of blocks allowed is huge; computing the largest possible array on the GPU will make you run out of device memory (currently 16-80 GB) before hitting the maximal number of blocks when selecting sensible kernel launch parameters (usually threads per block >= 128).\n", 258 | "\n", 259 | "- Threads, blocks and grid have 3D \"Cartesian\" topology, which is very useful for 1D, 2D and 3D Cartesian finite-difference domains." 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Multi-GPU\n", 267 | "\n", 268 | "#### GPU - MPI ranks mapping\n", 269 | "The challenging part is to run on multiple GPUs using MPI. To achieve this, we need to map node-local MPI ranks to GPU IDs.\n", 270 | "\n", 271 | "This can be achieved in Julia using MPI.jl and CUDA.jl by\n", 272 | "```julia\n", 273 | "comm = MPI.COMM_WORLD\n", 274 | "rank = MPI.Comm_rank(comm)\n", 275 | "comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, rank)\n", 276 | "rank_l = MPI.Comm_rank(comm_l)\n", 277 | "gpu_id = CUDA.device!(rank_l)\n", 278 | "```\n", 279 | "\n", 280 | "#### GPU-aware MPI\n", 281 | "\n", 282 | "On modern supercomputers, one has access to GPU-aware MPI. GPU aware-MPI allows to directly exchange GPU memory by-passing an explicit host copy.\n", 283 | "\n", 284 | "The file [`multigpu.jl`](./multigpu.jl) implements this and would check that GPU-aware MPI works:" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 2, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stderr", 294 | "output_type": "stream", 295 | "text": [ 296 | "srun: Job 27855189 step creation temporarily disabled, retrying (Requested nodes are busy)\n", 297 | "srun: Step created for StepId=27855189.1\n" 298 | ] 299 | }, 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "rank=3 rank_loc=3 (gpu_id=CuDevice(3)), size=4, dst=0, src=2\n", 305 | "rank=0 rank_loc=0 (gpu_id=CuDevice(0)), size=4, dst=1, src=3\n", 306 | "rank=1 rank_loc=1 (gpu_id=CuDevice(1)), size=4, dst=2, src=0\n", 307 | "rank=2 rank_loc=2 (gpu_id=CuDevice(2)), size=4, dst=3, src=1\n", 308 | "start sending...\n", 309 | "recv_mesg on proc 3: [2.0, 2.0, 2.0, 2.0]\n", 310 | "recv_mesg on proc 0: [3.0, 3.0, 3.0, 3.0]\n", 311 | "done.\n", 312 | "recv_mesg on proc 2: [1.0, 1.0, 1.0, 1.0]\n", 313 | "recv_mesg on proc 1: [0.0, 0.0, 0.0, 0.0]\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "run_cmd = `mpiexecjl -n 4 -G 4 --nodes 1 --qos regular --constraint gpu --gpus 4 --account=ntrain1 --project julia multigpu.jl`\n", 319 | "run(run_cmd);" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Julia 1.10.4", 326 | "language": "julia", 327 | "name": "julia-1.10" 328 | }, 329 | "language_info": { 330 | "file_extension": ".jl", 331 | "mimetype": "application/julia", 332 | "name": "julia", 333 | "version": "1.10.4" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 2 338 | } 339 | -------------------------------------------------------------------------------- /parts/gpu/imgs/cpu_gpu_evo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/cpu_gpu_evo.png -------------------------------------------------------------------------------- /parts/gpu/imgs/cuda_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/cuda_grid.png -------------------------------------------------------------------------------- /parts/gpu/imgs/frontier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/gpu/imgs/frontier.png -------------------------------------------------------------------------------- /parts/gpu/job_bench_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=128 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --constraint=gpu 8 | #SBATCH --account=ntrain1 9 | #SBATCH --output=slurm_bench_gpu.out 10 | #SBATCH --qos=regular 11 | 12 | # Load julia 13 | ml use /global/common/software/nersc/n9/julia/modules 14 | ml julia 15 | 16 | for i in 512 2048 4096 8192 16384 17 | do 18 | echo -e "\n\n#### GPU run $i" 19 | 20 | julia --project -e 'do_save=false; include("diffusion_2d_cuda.jl")' $i 21 | done 22 | -------------------------------------------------------------------------------- /parts/gpu/job_gpu_mpi_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C gpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_gpu_mpi_multinode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=4 9 | #SBATCH --ntasks=16 10 | #SBATCH --gpus-per-node=4 11 | #SBATCH --exclusive 12 | #SBATCH --gpu-bind=none 13 | 14 | # pin to closest NIC to GPU 15 | export MPICH_OFI_NIC_POLICY=GPU 16 | 17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool 18 | export JULIA_CUDA_MEMORY_POOL=none 19 | 20 | ml use /global/common/software/nersc/n9/julia/modules 21 | ml julia 22 | 23 | mpiexecjl --project=../.. julia diffusion_2d_cuda_mpi.jl 24 | -------------------------------------------------------------------------------- /parts/gpu/job_gpu_mpi_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C gpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_gpu_mpi_singlenode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=1 9 | #SBATCH --ntasks=4 10 | #SBATCH --gpus-per-node=4 11 | #SBATCH --exclusive 12 | #SBATCH --gpu-bind=none 13 | 14 | # pin to closest NIC to GPU 15 | export MPICH_OFI_NIC_POLICY=GPU 16 | 17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool 18 | export JULIA_CUDA_MEMORY_POOL=none 19 | 20 | ml use /global/common/software/nersc/n9/julia/modules 21 | ml julia 22 | 23 | mpiexecjl --project=../.. 
julia diffusion_2d_cuda_mpi.jl 24 | -------------------------------------------------------------------------------- /parts/gpu/multigpu.jl: -------------------------------------------------------------------------------- 1 | using MPI 2 | using CUDA 3 | MPI.Init() 4 | comm = MPI.COMM_WORLD 5 | rank = MPI.Comm_rank(comm) 6 | # select device 7 | comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, rank) 8 | rank_l = MPI.Comm_rank(comm_l) 9 | gpu_id = CUDA.device!(rank_l) 10 | # select device 11 | size = MPI.Comm_size(comm) 12 | dst = mod(rank+1, size) 13 | src = mod(rank-1, size) 14 | println("rank=$rank rank_loc=$rank_l (gpu_id=$gpu_id), size=$size, dst=$dst, src=$src") 15 | N = 4 16 | send_mesg = CuArray{Float64}(undef, N) 17 | recv_mesg = CuArray{Float64}(undef, N) 18 | fill!(send_mesg, Float64(rank)) 19 | CUDA.synchronize() 20 | rank==0 && println("start sending...") 21 | MPI.Sendrecv!(send_mesg, dst, 0, recv_mesg, src, 0, comm) 22 | println("recv_mesg on proc $rank_l: $recv_mesg") 23 | rank==0 && println("done.") 24 | -------------------------------------------------------------------------------- /parts/gpu/slurm/hello.jl: -------------------------------------------------------------------------------- 1 | using MPI, CUDA, Libdl 2 | 3 | #_______________________________________________________________________________ 4 | # Get MPI version string from libmpi.so 5 | # 6 | 7 | function get_mpi_version_string() 8 | buf_size = 8192 # HACK: this should be enough space 9 | buf = Array{UInt8}(undef, buf_size) 10 | buflen = Ref{Cint}() 11 | 12 | hndl = Libdl.dlopen(MPI.libmpi, Libdl.RTLD_LAZY | Libdl.RTLD_GLOBAL) 13 | 14 | try 15 | ptr = Libdl.dlsym(hndl, :MPI_Get_library_version) 16 | ccall(ptr, Cint, (Ptr{UInt8}, Ref{Cint}), buf, buflen) 17 | finally 18 | Libdl.dlclose(hndl) 19 | end 20 | 21 | @assert buflen[] < buf_size 22 | resize!(buf, buflen[]) 23 | return String(buf) 24 | end 25 | 26 | #------------------------------------------------------------------------------- 27 | 28 | 29 | #_______________________________________________________________________________ 30 | # Get information on which Device and Bus a GPU is connected to: 31 | # 32 | 33 | function get_device_attributes() 34 | attr = Dict{Tuple{Int32, Int32}, Int32}() 35 | for i in 0:(ndevices()-1) 36 | d = CuDevice(i) 37 | attr[( 38 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID), 39 | attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) 40 | )] = d 41 | end 42 | attr 43 | end 44 | 45 | #------------------------------------------------------------------------------- 46 | 47 | 48 | MPI.Init() 49 | MPI.ThreadLevel(2) 50 | 51 | comm = MPI.COMM_WORLD 52 | rank = MPI.Comm_rank(comm) 53 | size = MPI.Comm_size(comm) 54 | name = gethostname() 55 | 56 | devices = get_device_attributes() 57 | 58 | # get the MPI version string and print it. This will be the same for every 59 | # rank, so do this only on Rank 0 60 | if rank == 0 61 | version_string = get_mpi_version_string() 62 | println("MPI Version: $(version_string)") 63 | end 64 | 65 | println( 66 | "Hello world, I am rank $(rank) of $(size) on $(name). 
" * 67 | "I have $(ndevices()) GPUs with properties: $(devices)" 68 | ) 69 | 70 | MPI.Barrier(comm) 71 | -------------------------------------------------------------------------------- /parts/gpu/slurm/job_hello_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=4 4 | #SBATCH --ntasks-per-node=4 5 | #SBATCH --gpus-per-task=1 6 | #SBATCH --constraint=gpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_hello_multinode.out 9 | #SBATCH --qos=regular 10 | 11 | # pin to closest NIC to GPU 12 | export MPICH_OFI_NIC_POLICY=GPU 13 | 14 | # Load julia 15 | ml use /global/common/software/nersc/n9/julia/modules 16 | ml julia 17 | 18 | mpiexecjl --project julia hello.jl 19 | -------------------------------------------------------------------------------- /parts/gpu/slurm/job_hello_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=4 5 | #SBATCH --gpus-per-task=1 6 | #SBATCH --constraint=gpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_hello_singlenode.out 9 | #SBATCH --qos=regular 10 | 11 | # pin to closest NIC to GPU 12 | export MPICH_OFI_NIC_POLICY=GPU 13 | 14 | # Load julia 15 | ml use /global/common/software/nersc/n9/julia/modules 16 | ml julia 17 | 18 | mpiexecjl --project julia hello.jl 19 | -------------------------------------------------------------------------------- /parts/gpu/solution/diffusion_2d_cuda.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - GPU cuda version 2 | using Printf 3 | using JLD2 4 | using CUDA 5 | include(joinpath(@__DIR__, "../../shared.jl")) 6 | 7 | # convenience macros simply to avoid writing nested finite-difference expression 8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(ds))) end 9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(ds))) end 10 | 11 | function diffusion_step_kernel!(params, C2, C) 12 | (; ds, dt, D) = params 13 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index 14 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index 15 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2 16 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(ds) + 17 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(ds)) 18 | end 19 | return nothing 20 | end 21 | 22 | function diffusion_step!(params, C2, C) 23 | (; nthreads, nblocks) = params 24 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C) 25 | return nothing 26 | end 27 | 28 | function run_diffusion(; ns=64, nt=100, do_save=false) 29 | params = init_params_gpu(; ns, nt, do_save) 30 | C, C2 = init_arrays_gpu(params) 31 | t_tic = 0.0 32 | # Time loop 33 | for it in 1:nt 34 | # time after warmup (ignore first 10 iterations) 35 | (it == 11) && (t_tic = Base.time()) 36 | # diffusion 37 | diffusion_step!(params, C2, C) 38 | C, C2 = C2, C # pointer swap 39 | end 40 | # synchronize the gpu before querying the final time 41 | CUDA.synchronize() 42 | t_toc = (Base.time() - t_tic) 43 | print_perf(params, t_toc) 44 | do_save && jldsave(joinpath(@__DIR__, "out_gpu.jld2"); C = Array(C), l = params.L) 45 | return nothing 46 | end 47 | 48 | # Running things... 
49 | 50 | # enable saving by default 51 | (!@isdefined do_save) && (do_save = true) 52 | # enable execution by default 53 | (!@isdefined do_run) && (do_run = true) 54 | 55 | if do_run 56 | if !isempty(ARGS) 57 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save) 58 | else 59 | run_diffusion(; ns=256, do_save) 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /parts/gpu/solution/diffusion_2d_cuda_mpi.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - GPU MPI 2 | using Printf 3 | using JLD2 4 | using CUDA 5 | using MPI 6 | include(joinpath(@__DIR__, "../../shared.jl")) 7 | 8 | # convenience macros simply to avoid writing nested finite-difference expression 9 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(dx))) end 10 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(dy))) end 11 | 12 | function diffusion_step_kernel!(params, C2, C) 13 | (; dx, dy, dt, D) = params 14 | ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # CUDA vectorised unique index 15 | iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # CUDA vectorised unique index 16 | if ix <= size(C, 1)-2 && iy <= size(C, 2)-2 17 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(dx) + 18 | (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(dy)) 19 | end 20 | return nothing 21 | end 22 | 23 | function diffusion_step!(params, C2, C) 24 | (; nthreads, nblocks) = params 25 | @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C) 26 | return nothing 27 | end 28 | 29 | # MPI functions 30 | @views function update_halo!(A, bufs, neighbors, comm) 31 | # dim-1 (x) 32 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :]) 33 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :]) 34 | 35 | reqs = MPI.MultiRequest(4) 36 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_1, comm, reqs[1]; source=neighbors.x[1]) 37 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_2, comm, reqs[2]; source=neighbors.x[2]) 38 | 39 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_1, comm, reqs[3]; dest=neighbors.x[1]) 40 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_2, comm, reqs[4]; dest=neighbors.x[2]) 41 | MPI.Waitall(reqs) # blocking 42 | 43 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1) 44 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2) 45 | 46 | # dim-2 (y) 47 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ]) 48 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1]) 49 | 50 | reqs = MPI.MultiRequest(4) 51 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_1, comm, reqs[1]; source=neighbors.y[1]) 52 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_2, comm, reqs[2]; source=neighbors.y[2]) 53 | 54 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_1, comm, reqs[3]; dest=neighbors.y[1]) 55 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_2, comm, reqs[4]; dest=neighbors.y[2]) 56 | MPI.Waitall(reqs) # blocking 57 | 58 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1) 59 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2) 60 | return 61 | end 62 | 63 | function init_bufs(A) 64 | return (; send_1_1=CUDA.zeros(Float64, size(A, 2)), send_1_2=CUDA.zeros(Float64, size(A, 2)), 65 | 
send_2_1=CUDA.zeros(Float64, size(A, 1)), send_2_2=CUDA.zeros(Float64, size(A, 1)), 66 | recv_1_1=CUDA.zeros(Float64, size(A, 2)), recv_1_2=CUDA.zeros(Float64, size(A, 2)), 67 | recv_2_1=CUDA.zeros(Float64, size(A, 1)), recv_2_2=CUDA.zeros(Float64, size(A, 1))) 68 | end 69 | 70 | function run_diffusion(; ns=64, nt=100, do_save=false) 71 | MPI.Init() 72 | comm = MPI.COMM_WORLD 73 | nprocs = MPI.Comm_size(comm) 74 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple 75 | comm_cart = MPI.Cart_create(comm, dims) 76 | me = MPI.Comm_rank(comm_cart) 77 | coords = MPI.Cart_coords(comm_cart) |> Tuple 78 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1)) 79 | # select GPU on multi-GPU system based on shared memory topology 80 | comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me) 81 | me_l = MPI.Comm_rank(comm_l) 82 | # set GPU, but only if more than one device present 83 | gpu_id = CUDA.device!(me_l % ndevices()) 84 | println("$(gpu_id), out of: $(ndevices())") 85 | (me == 0) && println("nprocs = $(nprocs), dims = $dims") 86 | 87 | params = init_params_gpu_mpi(; dims, coords, ns, nt, do_save) 88 | C, C2 = init_arrays_gpu_mpi(params) 89 | bufs = init_bufs(C) 90 | t_tic = 0.0 91 | # Time loop 92 | for it in 1:nt 93 | # time after warmup (ignore first 10 iterations) 94 | (it == 11) && (t_tic = Base.time()) 95 | # diffusion 96 | diffusion_step!(params, C2, C) 97 | update_halo!(C2, bufs, neighbors, comm_cart) 98 | C, C2 = C2, C # pointer swap 99 | end 100 | # synchronize the gpu before querying the final time 101 | CUDA.synchronize() 102 | t_toc = (Base.time() - t_tic) 103 | # "master" prints performance 104 | (me == 0) && print_perf(params, t_toc) 105 | # save to (maybe) visualize later 106 | if do_save 107 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L)) 108 | end 109 | MPI.Finalize() 110 | return 111 | end 112 | 113 | # Running things... 
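# Note: the `@isdefined` guards below let a caller pre-set `do_save`/`do_run` before
# `include`-ing this script; the benchmark job script uses the same pattern for the
# non-MPI solver, e.g. `julia --project -e 'do_save=false; include("diffusion_2d_cuda.jl")' 8192`.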
114 | 115 | # enable save to disk by default 116 | (!@isdefined do_save) && (do_save = true) 117 | # enable execution by default 118 | (!@isdefined do_run) && (do_run = true) 119 | 120 | if do_run 121 | # run_diffusion(; ns=256, do_save) 122 | run_diffusion(; ns=16384, do_save=false) 123 | end 124 | -------------------------------------------------------------------------------- /parts/gpu/solution/job_bench_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=128 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --constraint=gpu 8 | #SBATCH --account=ntrain1 9 | #SBATCH --output=slurm_bench_gpu.out 10 | #SBATCH --qos=regular 11 | 12 | # Load julia 13 | ml use /global/common/software/nersc/n9/julia/modules 14 | ml julia 15 | 16 | for i in 512 2048 4096 8192 16384 17 | do 18 | echo -e "\n\n#### GPU run $i" 19 | 20 | julia --project -e 'do_save=false; include("diffusion_2d_cuda.jl")' $i 21 | done 22 | -------------------------------------------------------------------------------- /parts/gpu/solution/job_gpu_mpi_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C gpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_gpu_mpi_multinode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=4 9 | #SBATCH --ntasks=16 10 | #SBATCH --gpus-per-node=4 11 | #SBATCH --exclusive 12 | #SBATCH --gpu-bind=none 13 | 14 | # pin to closest NIC to GPU 15 | export MPICH_OFI_NIC_POLICY=GPU 16 | 17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool 18 | export JULIA_CUDA_MEMORY_POOL=none 19 | 20 | ml use /global/common/software/nersc/n9/julia/modules 21 | ml julia 22 | 23 | mpiexecjl --project=../../.. julia diffusion_2d_cuda_mpi.jl 24 | -------------------------------------------------------------------------------- /parts/gpu/solution/job_gpu_mpi_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C gpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_gpu_mpi_singlenode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=1 9 | #SBATCH --ntasks=4 10 | #SBATCH --gpus-per-node=4 11 | #SBATCH --exclusive 12 | #SBATCH --gpu-bind=none 13 | 14 | # pin to closest NIC to GPU 15 | export MPICH_OFI_NIC_POLICY=GPU 16 | 17 | # default to std memory pool, see: https://juliaparallel.org/MPI.jl/stable/knownissues/#Memory-pool 18 | export JULIA_CUDA_MEMORY_POOL=none 19 | 20 | ml use /global/common/software/nersc/n9/julia/modules 21 | ml julia 22 | 23 | mpiexecjl --project=../../.. 
julia diffusion_2d_cuda_mpi.jl 24 | -------------------------------------------------------------------------------- /parts/gpu/solution/visualize_mpi.jl: -------------------------------------------------------------------------------- 1 | # Visualisation script for the 2D MPI solver 2 | using CairoMakie 3 | using JLD2 4 | 5 | function vizme2D_mpi(nprocs) 6 | C = [] 7 | lx = ly = 0.0 8 | ip = 1 9 | for ipx in 1:nprocs[1] 10 | for ipy in 1:nprocs[2] 11 | C_loc, lxy = load(joinpath(@__DIR__, "out_$(ip-1).jld2"), "C", "lxy") 12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2) 13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i 14 | if ip == 1 15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i) 16 | lx, ly = lxy 17 | end 18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc 19 | ip += 1 20 | end 21 | end 22 | xc, yc = LinRange.(0, (lx, ly), size(C)) 23 | fig = Figure(; size=(500, 400), fontsize=14) 24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C") 25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1)) 26 | cb = Colorbar(fig[1, 1][1, 2], hm) 27 | display(fig) 28 | return 29 | end 30 | 31 | nprocs = (2, 2) # nprocs (x, y) dim 32 | vizme2D_mpi(nprocs) 33 | -------------------------------------------------------------------------------- /parts/gpu/visualize.jl: -------------------------------------------------------------------------------- 1 | # Visualisation script for the 2D MPI solver 2 | using CairoMakie 3 | using JLD2 4 | 5 | function vizme2D() 6 | C, l = load(joinpath(@__DIR__, "out_gpu.jld2"), "C", "l") 7 | xc, yc = LinRange.(0, (l, l), size(C)) 8 | fig = Figure(; size=(500, 400), fontsize=14) 9 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C") 10 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1)) 11 | cb = Colorbar(fig[1, 1][1, 2], hm) 12 | display(fig) 13 | return 14 | end 15 | 16 | vizme2D() 17 | -------------------------------------------------------------------------------- /parts/gpu/visualize_mpi.jl: -------------------------------------------------------------------------------- 1 | # Visualisation script for the 2D MPI solver 2 | using CairoMakie 3 | using JLD2 4 | 5 | function vizme2D_mpi(nprocs) 6 | C = [] 7 | lx = ly = 0.0 8 | ip = 1 9 | for ipx in 1:nprocs[1] 10 | for ipy in 1:nprocs[2] 11 | C_loc, lxy = load(joinpath(@__DIR__, "out_$(ip-1).jld2"), "C", "lxy") 12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2) 13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i 14 | if ip == 1 15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i) 16 | lx, ly = lxy 17 | end 18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc 19 | ip += 1 20 | end 21 | end 22 | xc, yc = LinRange.(0, (lx, ly), size(C)) 23 | fig = Figure(; size=(500, 400), fontsize=14) 24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C") 25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1)) 26 | cb = Colorbar(fig[1, 1][1, 2], hm) 27 | display(fig) 28 | return 29 | end 30 | 31 | nprocs = (2, 2) # nprocs (x, y) dim 32 | vizme2D_mpi(nprocs) 33 | -------------------------------------------------------------------------------- /parts/mpi/README.md: -------------------------------------------------------------------------------- 1 | # Diffusion 2D - MPI 2 | 3 | In this part, we want to use MPI (distributed parallelism) to parallelize our Diffusion 2D example. 4 | 5 | The starting point is (once again) the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). 
The file [`diffusion_2d_mpi.jl`](./diffusion_2d_mpi.jl) in this folder is a modified copy of this variant. While the computational kernel `diffusion_step!` is essentially untouched, we included MPI bits at the beginning of the `run_diffusion` function and introduced the key function `update_halo!`, which is supposed to take care of data exchange between MPI ranks. However, as of now, the function isn't communicating anything and it will be (one of) your tasks to fix that 😉. 6 | 7 | 8 | ## Task 1 - Running the MPI code 9 | 10 | Although incomplete from a semantic point of view, the code in `diffusion_2d_mpi.jl` is perfectly runnable as is. It won't compute the right thing, but it runs 😉. So **let's run it**. But how? 11 | 12 | The first thing to realize is that, on Perlmutter, **you can't run MPI on a login node**. You have two options to work on a compute node: 13 | 14 | 1) **Interactive session**: You can try to get an interactive session on a compute node by running `sh get_compute_node_interactive.sh`. But unfortunately, we don't have a node for everyone, so you might not get one (Sorry!). **If you can get one**, you can use `mpiexecjl --project -n 4 julia diffusion_2d_mpi.jl` to run the code. Alternatively, you can run `sh job_mpi_singlenode.sh`. 15 | 16 | 2) **Compute job**: You can always submit a job that runs the code: `sbatch job_mpi_singlenode.sh`. The output will land in `slurm_mpi_singlenode.out`. Check out the [Perlmutter cheatsheet](../../help/perlmutter_cheatsheet.md) to learn more about jobs. 17 | 18 | Irrespective of which option you choose, **go ahead and run the code** (with 4 MPI ranks). 19 | 20 | To see that the code is currently not working properly (in the sense of computing the right thing), run `julia --project visualize_mpi.jl` to combine the results of different MPI ranks (`*.jld2` files) into a visualization (`visualization.png`). Inspect the visualization and notice the undesired dark lines. 21 | 22 | ## Task 2 - Halo exchange 23 | 24 | Take a look at the general MPI setup (the beginning of `run_diffusion`) and the `update_halo!` function (the bits that are already there) and try to understand them. 25 | 26 | Afterwards, implement the necessary MPI communication. To that end, find the "TODO" block in `update_halo!` and follow the instructions. Note that we want to use **non-blocking** communication, i.e. you should use the functions `MPI.Irecv!` and `MPI.Isend`. 27 | 28 | Check that your code is working by comparing the `visualization.png` that you get to this (basic "eye test"): 29 | 30 | 31 | 32 | ## Task 3 - Benchmark 33 | 34 | ### Part A 35 | 36 | Our goal is to perform a rough and basic scaling analysis with 4, 8, 12, and 16 MPI ranks distributed across multiple nodes. Specifically, we want to run 4 MPI ranks per node and increase the number of nodes to get up to 16 ranks in total. 37 | 38 | The file `job_mpi_multinode.sh` is a job script that currently requests a single node (see the line `#SBATCH --nodes=1`) that runs 4 MPI ranks (see the line `#SBATCH --ntasks-per-node=4`), and then runs our Julia MPI code with `do_save=false` for simplicity and `ns=6144`. 39 | 40 | Submit this file to SLURM via `sbatch job_mpi_multinode.sh`. Once the job has run, the output will land in `slurm_mpi_multinode.out`. Write the output down somewhere (copy & paste), change the number of nodes to 2 (= 8 MPI ranks in total) and rerun the experiment. Repeat the same thing, this time requesting 3 nodes (= 12 MPI ranks in total) and then requesting 4 nodes (= 16 MPI ranks in total).
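If you prefer to automate these four runs, a small shell loop along the following lines will do (just a sketch: it edits the `--nodes` line in place with `sed`, waits for each job to finish via `sbatch --wait`, and assumes the job output lands in `slurm_mpi_multinode.out` as described above; the copied result file names are only illustrative):

```bash
for n in 1 2 3 4; do
    # request n nodes (= 4n MPI ranks in total)
    sed -i "s/^#SBATCH --nodes=.*/#SBATCH --nodes=$n/" job_mpi_multinode.sh
    sbatch --wait job_mpi_multinode.sh
    # keep a copy of the output before the next run overwrites it
    cp slurm_mpi_multinode.out multinode_results_${n}nodes.txt
done
```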
41 | 42 | ### Part B 43 | 44 | Inspect the results that you've obtained and compare them. 45 | 46 | **Questions** 47 | * What do you observe? 48 | * Is this what you'd expected? 49 | 50 | Note that in setting up our MPI ranks, we split our global grid into local grids. In the process, the meaning of the input parameter `ns` changed compared to previous codes (serial & multithreading). It now determines the resolution of the **local grid** - that each MPI rank is holding - rather than the resolution of the global grid. Since we keep `ns` fixed (6144 in `job_mpi_multinode.sh`), we thus increase the problem size (the total grid resolution) when we increase the number of MPI ranks. This is known as a "weak scaling" analysis. 51 | 52 | **Question** 53 | 54 | * Given the comment above, what does "ideal parallel scaling" mean in the context of a "weak scaling" analysis? 55 | * What do the observed results tell you? 56 | -------------------------------------------------------------------------------- /parts/mpi/diffusion_2d_mpi.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - MPI 2 | using Printf 3 | using JLD2 4 | using MPI 5 | include(joinpath(@__DIR__, "../shared.jl")) 6 | 7 | # convenience macros simply to avoid writing nested finite-difference expression 8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / dx)) end 9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / dy)) end 10 | 11 | function diffusion_step!(params, C2, C) 12 | (; dx, dy, dt, D) = params 13 | for iy in 1:size(C, 2)-2 14 | for ix in 1:size(C, 1)-2 15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / dx + 16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / dy) 17 | end 18 | end 19 | return nothing 20 | end 21 | 22 | # MPI functions 23 | @views function update_halo!(A, bufs, neighbors, comm) 24 | # 25 | # !!! TODO 26 | # 27 | # Complete the halo exchange implementation. Specifically, use non-blocking 28 | # MPI communication (Irecv and Isend) at the positions marked by "TODO..." below. 29 | # 30 | # Help: 31 | # left neighbor: neighbors.x[1] 32 | # right neighbor: neighbors.x[2] 33 | # up neighbor: neighbors.y[1] 34 | # down neighbor: neighbors.y[2] 35 | # 36 | 37 | # dim-1 (x) 38 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :]) 39 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :]) 40 | 41 | reqs = MPI.MultiRequest(4) 42 | (neighbors.x[1] != MPI.PROC_NULL) && # TODO... receive from left neighbor into bufs.recv_1_1 43 | (neighbors.x[2] != MPI.PROC_NULL) && # TODO... receive from right neighbor into bufs.recv_1_2 44 | 45 | (neighbors.x[1] != MPI.PROC_NULL) && # TODO... send bufs.send_1_1 to left neighbor 46 | (neighbors.x[2] != MPI.PROC_NULL) && # TODO... send bufs.send_1_2 to right neighbor 47 | MPI.Waitall(reqs) # blocking 48 | 49 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1) 50 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2) 51 | 52 | # dim-2 (y) 53 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ]) 54 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1]) 55 | 56 | reqs = MPI.MultiRequest(4) 57 | (neighbors.y[1] != MPI.PROC_NULL) && # TODO... receive from up neighbor into bufs.recv_2_1 58 | (neighbors.y[2] != MPI.PROC_NULL) && # TODO... receive from down neighbor into bufs.recv_2_2 59 | 60 | (neighbors.y[1] != MPI.PROC_NULL) && # TODO... 
send bufs.send_2_1 to up neighbor 61 | (neighbors.y[2] != MPI.PROC_NULL) && # TODO... send bufs.send_2_2 to down neighbor 62 | MPI.Waitall(reqs) # blocking 63 | 64 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1) 65 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2) 66 | return nothing 67 | end 68 | 69 | function init_bufs(A) 70 | return (; send_1_1=zeros(size(A, 2)), send_1_2=zeros(size(A, 2)), 71 | send_2_1=zeros(size(A, 1)), send_2_2=zeros(size(A, 1)), 72 | recv_1_1=zeros(size(A, 2)), recv_1_2=zeros(size(A, 2)), 73 | recv_2_1=zeros(size(A, 1)), recv_2_2=zeros(size(A, 1))) 74 | end 75 | 76 | function run_diffusion(; ns=64, nt=100, do_save=false) 77 | MPI.Init() 78 | comm = MPI.COMM_WORLD 79 | nprocs = MPI.Comm_size(comm) 80 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple 81 | comm_cart = MPI.Cart_create(comm, dims) 82 | me = MPI.Comm_rank(comm_cart) 83 | coords = MPI.Cart_coords(comm_cart) |> Tuple 84 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1)) 85 | (me == 0) && println("nprocs = $(nprocs), dims = $dims") 86 | 87 | params = init_params_mpi(; dims, coords, ns, nt, do_save) 88 | C, C2 = init_arrays_mpi(params) 89 | bufs = init_bufs(C) 90 | t_tic = 0.0 91 | # time loop 92 | for it in 1:nt 93 | # time after warmup (ignore first 10 iterations) 94 | (it == 11) && (t_tic = Base.time()) 95 | # diffusion 96 | diffusion_step!(params, C2, C) 97 | update_halo!(C2, bufs, neighbors, comm_cart) 98 | C, C2 = C2, C # pointer swap 99 | end 100 | t_toc = (Base.time() - t_tic) 101 | # "master" prints performance 102 | (me == 0) && print_perf(params, t_toc) 103 | # save to (maybe) visualize later 104 | if do_save 105 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L)) 106 | end 107 | MPI.Finalize() 108 | return nothing 109 | end 110 | 111 | # Running things... 112 | 113 | # enable save to disk by default 114 | (!@isdefined do_save) && (do_save = true) 115 | # enable execution by default 116 | (!@isdefined do_run) && (do_run = true) 117 | 118 | if do_run 119 | if !isempty(ARGS) 120 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save) 121 | else 122 | run_diffusion(; ns=256, do_save) 123 | end 124 | end 125 | -------------------------------------------------------------------------------- /parts/mpi/explanation/01_mpi+jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "650f758f-84da-4dd3-9479-8dbc49ebc3d4", 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "skip" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# Setup\n", 14 | "\n", 15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once." 
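A minimal way to do that from the notebook (the first code cell below already activates this folder's project):

```julia
import Pkg
Pkg.activate(@__DIR__)   # use the Project.toml / Manifest.toml next to this notebook
Pkg.instantiate()        # one-time: install the exact versions recorded in the Manifest
```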
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "89ab4e89-10ca-4ba8-a7bc-d33fcf3f2e60", 22 | "metadata": { 23 | "slideshow": { 24 | "slide_type": "skip" 25 | }, 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n", 41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n", 42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n", 43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n", 44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n", 45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n", 46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n", 47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "import Pkg;\n", 53 | "Pkg.activate(@__DIR__)\n", 54 | "Pkg.status()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "53799c57-9c82-4cb2-9a73-f858a8725071", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | }, 64 | "tags": [] 65 | }, 66 | "source": [ 67 | "# Julia + Jupyter + MPI\n", 68 | "\n", 69 | "`MPI.jl` provides wrappers for the system MPI libraries. And the `MPIClusterManagers.jl` package lets you control MPI workflows within Julia" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "89cfa159-4234-4961-b18e-6f7a4472bb04", 75 | "metadata": { 76 | "slideshow": { 77 | "slide_type": "subslide" 78 | }, 79 | "tags": [] 80 | }, 81 | "source": [ 82 | "## MPI.jl" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 2, 88 | "id": "6bcb1ba8-c4da-4311-a873-3354126c952d", 89 | "metadata": { 90 | "slideshow": { 91 | "slide_type": "fragment" 92 | }, 93 | "tags": [] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "using MPI" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "1f4228e3-d910-451b-8523-7b60f342788d", 103 | "metadata": { 104 | "slideshow": { 105 | "slide_type": "fragment" 106 | }, 107 | "tags": [] 108 | }, 109 | "source": [ 110 | "`MPI.versioninfo()` tells you which MPI backend is being used by `MPI.jl`. On HPC systems, which rely on vendor-provided MPI implementations (e.g. 
on HPE Cray systems like Perlmutter), make sure that `MPI.jl` loads the \"right\" `libmpi.so`:" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "id": "eb4f99e3-63a2-43af-903d-36cfbe011415", 117 | "metadata": { 118 | "slideshow": { 119 | "slide_type": "subslide" 120 | }, 121 | "tags": [] 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "MPIPreferences:\n", 129 | " binary: system\n", 130 | " abi: MPICH\n", 131 | " libmpi: libmpi_gnu_123.so\n", 132 | " mpiexec: srun\n", 133 | "\n", 134 | "Package versions\n", 135 | " MPI.jl: 0.20.20\n", 136 | " MPIPreferences.jl: 0.1.11\n", 137 | "\n", 138 | "Library information:\n", 139 | " libmpi: libmpi_gnu_123.so\n", 140 | " libmpi dlpath: /opt/cray/pe/lib64/libmpi_gnu_123.so\n", 141 | " MPI version: 3.1.0\n", 142 | " Library version: \n", 143 | " MPI VERSION : CRAY MPICH version 8.1.28.29 (ANL base 3.4a2)\n", 144 | " MPI BUILD INFO : Wed Nov 15 20:57 2023 (git hash 1cde46f)\n", 145 | " \n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "MPI.versioninfo()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "0ebcbfaa-839b-4d40-a9ef-fc99cee61b04", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "subslide" 159 | }, 160 | "tags": [] 161 | }, 162 | "source": [ 163 | "## MPIClusterManagers.jl" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "338abb9b-48de-4c85-9e82-bc08927ad43a", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "fragment" 172 | }, 173 | "tags": [] 174 | }, 175 | "source": [ 176 | "`MPIClusterManagers.jl` provide a way for Jupyter to connect to MPI processes." 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "8725708a-b5b5-4cac-8983-c95a0c4b7ab9", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "fragment" 185 | }, 186 | "tags": [] 187 | }, 188 | "source": [ 189 | "On Perlmutter, we have a choice among network interfaces:" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 3, 195 | "id": "d2e41152-6380-4b21-8bbe-71257eb8aba7", 196 | "metadata": { 197 | "slideshow": { 198 | "slide_type": "fragment" 199 | }, 200 | "tags": [] 201 | }, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "6-element Vector{NetworkInterfaceControllers.Interface}:\n", 207 | " NetworkInterfaceControllers.Interface(\"nmn0\", :v4, ip\"10.100.108.57\")\n", 208 | " NetworkInterfaceControllers.Interface(\"hsn0\", :v4, ip\"10.249.42.35\")\n", 209 | " NetworkInterfaceControllers.Interface(\"hsn0:chn\", :v4, ip\"128.55.84.171\")\n", 210 | " NetworkInterfaceControllers.Interface(\"hsn1\", :v4, ip\"10.249.42.19\")\n", 211 | " NetworkInterfaceControllers.Interface(\"hsn2\", :v4, ip\"10.249.42.20\")\n", 212 | " NetworkInterfaceControllers.Interface(\"hsn3\", :v4, ip\"10.249.42.36\")" 213 | ] 214 | }, 215 | "execution_count": 3, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "using NetworkInterfaceControllers, Sockets\n", 222 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "78c91aa1-41ce-450a-b646-d8574e8740f4", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "subslide" 231 | }, 232 | "tags": [] 233 | }, 234 | "source": [ 235 | "Buf we have to be careful about which network we connect to:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 4, 241 | "id": 
"a31df2d1-6a35-4420-9385-b60af0831074", 242 | "metadata": { 243 | "slideshow": { 244 | "slide_type": "skip" 245 | }, 246 | "tags": [] 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "filter (generic function with 11 methods)" 253 | ] 254 | }, 255 | "execution_count": 4, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "import Base: filter, Fix1\n", 262 | "filter(f::Function)::Function = Fix1(filter, f)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "id": "26e0a840-7b61-4202-974c-1cda95820690", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "skip" 272 | }, 273 | "tags": [] 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "using Hwloc, AbstractTrees\n", 278 | "\n", 279 | "import AbstractTrees: PreOrderDFS\n", 280 | "import Hwloc: hwloc_pci_class_string\n", 281 | "\n", 282 | "sys_devs = children(gettopology())\n", 283 | "pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device)\n", 284 | "net_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == \"Ethernet\")\n", 285 | "\n", 286 | ";" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "848daddc-d8cb-4ad0-9a33-eed34197e3cb", 293 | "metadata": { 294 | "slideshow": { 295 | "slide_type": "fragment" 296 | }, 297 | "tags": [] 298 | }, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "Device hsn0 is a Slingshot device\n", 305 | "Device nmn0 is a Unknown device\n", 306 | "Device hsn1 is a Slingshot device\n", 307 | "Device hsn2 is a Slingshot device\n", 308 | "Device hsn3 is a Slingshot device\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "# net_devs are populated using Hwloc, please take a look at the source notebook\n", 314 | "# for further information\n", 315 | "\n", 316 | "for dev in net_devs\n", 317 | " io = dev.io_children |> only\n", 318 | " name = io.object.name\n", 319 | " kind = io.object.subtype\n", 320 | " kind = kind == \"\" ? \"Unknown\" : kind\n", 321 | " println(\"Device $(name) is a $(kind) device\")\n", 322 | "end" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "36cb812b-3779-48ae-a982-d3aa8599b39f", 328 | "metadata": { 329 | "slideshow": { 330 | "slide_type": "fragment" 331 | }, 332 | "tags": [] 333 | }, 334 | "source": [ 335 | "Therefore only the `hsn*` defivices are Slingshot devices." 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "id": "f6d965b3-1002-41ec-a964-6e4f71faf95e", 341 | "metadata": { 342 | "slideshow": { 343 | "slide_type": "subslide" 344 | }, 345 | "tags": [] 346 | }, 347 | "source": [ 348 | "Let's now use this information to find a HSN device with which we manage our MPI cluster. 
Note: we'll take the one with `:chn` in the name (as it's the only one with a public IP):" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 7, 354 | "id": "af6bdb63-1f0e-4bf6-ad6a-144d365a7e97", 355 | "metadata": { 356 | "slideshow": { 357 | "slide_type": "fragment" 358 | }, 359 | "tags": [] 360 | }, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "NetworkInterfaceControllers.Interface(\"hsn0:chn\", :v4, ip\"128.55.84.171\")" 366 | ] 367 | }, 368 | "execution_count": 7, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "hsn0_public = filter(\n", 375 | " x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces\n", 376 | ") |> only " 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 8, 382 | "id": "1a502b97-b4e1-44f9-a5e9-3bc09c0e8491", 383 | "metadata": { 384 | "slideshow": { 385 | "slide_type": "fragment" 386 | }, 387 | "tags": [] 388 | }, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "\"nid200344-hsn0\"" 394 | ] 395 | }, 396 | "execution_count": 8, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "public_slingshot_name = getnameinfo(hsn0_public.ip)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "id": "70db6ae1-a001-4606-9933-55f2ac158be2", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "slide" 411 | }, 412 | "tags": [] 413 | }, 414 | "source": [ 415 | "## MPI Worker Cluster\n", 416 | "\n", 417 | "We use `MPIClusterManagers.jl` to start a cluster of workers. Each worker uses MPI to communicate (`MPIWorkerManager` stars an `srun` session), and is controlled via the device at `public_slingshot_name` (previous section):" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 9, 423 | "id": "1c81c337-5e88-4688-bcf2-f48b6eeb98e8", 424 | "metadata": { 425 | "slideshow": { 426 | "slide_type": "fragment" 427 | }, 428 | "tags": [] 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "4-element Vector{Int64}:\n", 435 | " 2\n", 436 | " 3\n", 437 | " 4\n", 438 | " 5" 439 | ] 440 | }, 441 | "execution_count": 9, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "# to import MPIManager\n", 448 | "using MPIClusterManagers\n", 449 | "\n", 450 | "# need to also import Distributed to use addprocs()\n", 451 | "using Distributed\n", 452 | "\n", 453 | "# specify, number of mpi workers, launch cmd, etc.\n", 454 | "manager=MPIWorkerManager(4)\n", 455 | "\n", 456 | "# start mpi workers and add them as julia workers too.\n", 457 | "addprocs(\n", 458 | " manager,\n", 459 | " exeflags=`--project=$(Base.active_project())`,\n", 460 | " master_tcp_interface=public_slingshot_name\n", 461 | ")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "343ca90a-f66e-43d6-a887-2b6956fae59e", 467 | "metadata": { 468 | "slideshow": { 469 | "slide_type": "subslide" 470 | }, 471 | "tags": [] 472 | }, 473 | "source": [ 474 | "Now we can use `@mpi_do` to issue instructions to all of our MPI workers:" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 10, 480 | "id": "0f6bc5b9-2973-4dc5-8fdd-bfd483f01460", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "fragment" 484 | }, 485 | "tags": [] 486 | }, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | " From worker 5:\tHello world, I am 3 of 4 on 
nid200349\n", 493 | " From worker 4:\tHello world, I am 2 of 4 on nid200348\n", 494 | " From worker 2:\tHello world, I am 0 of 4 on nid200344\n", 495 | " From worker 3:\tHello world, I am 1 of 4 on nid200345\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "@mpi_do manager begin\n", 501 | " using MPI: MPI, Comm, Win, free\n", 502 | " comm = MPI.COMM_WORLD\n", 503 | " rank = MPI.Comm_rank(comm)\n", 504 | " size = MPI.Comm_size(comm)\n", 505 | " name = gethostname()\n", 506 | " println(\"Hello world, I am $(rank) of $(size) on $(name)\")\n", 507 | "end" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "id": "98174d30-5828-43f9-b63d-11d85a46185c", 513 | "metadata": { 514 | "slideshow": { 515 | "slide_type": "fragment" 516 | }, 517 | "tags": [] 518 | }, 519 | "source": [ 520 | "We started this in a 4-node job. Therefore each worker is on a different node." 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "id": "88e46e3b-f8d4-48d5-b8fc-2ae660f5a4a8", 527 | "metadata": { 528 | "slideshow": { 529 | "slide_type": "skip" 530 | }, 531 | "tags": [] 532 | }, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Julia 1.9.4", 540 | "language": "julia", 541 | "name": "julia-1.9.4" 542 | }, 543 | "language_info": { 544 | "file_extension": ".jl", 545 | "mimetype": "application/julia", 546 | "name": "julia", 547 | "version": "1.9.4" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /parts/mpi/explanation/02_comms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4d3cf46f-8189-4609-b217-29948b377255", 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "skip" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# Setup\n", 14 | "\n", 15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "89ab4e89-10ca-4ba8-a7bc-d33fcf3f2e60", 22 | "metadata": { 23 | "slideshow": { 24 | "slide_type": "skip" 25 | }, 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n", 41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n", 42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n", 43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n", 44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n", 45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n", 46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n", 47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "import Pkg;\n", 53 | "Pkg.activate(@__DIR__)\n", 54 | "Pkg.status()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "id": "1c81c337-5e88-4688-bcf2-f48b6eeb98e8", 61 | "metadata": { 62 | "slideshow": { 63 | "slide_type": "skip" 64 | }, 65 | "tags": [] 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "4-element Vector{Int64}:\n", 72 | " 2\n", 73 | " 3\n", 74 | " 4\n", 75 | " 5" 76 | ] 77 | }, 78 | "execution_count": 2, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "using MPI\n", 85 | "\n", 86 | "using NetworkInterfaceControllers, Sockets\n", 87 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)\n", 88 | "\n", 89 | "hsn0_public = filter(x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces) |> only \n", 90 | "public_slingshot_name = getnameinfo(hsn0_public.ip)\n", 91 | "\n", 92 | "# to import MPIManager\n", 93 | "using MPIClusterManagers\n", 94 | "\n", 95 | "# need to also import Distributed to use addprocs()\n", 96 | "using Distributed\n", 97 | "\n", 98 | "# specify, number of mpi workers, launch cmd, etc.\n", 99 | "manager=MPIWorkerManager(4)\n", 100 | "\n", 101 | "# start mpi workers and add them as julia workers too.\n", 102 | "addprocs(\n", 103 | " manager,\n", 104 | " exeflags=`--project=$(Base.active_project())`,\n", 105 | " master_tcp_interface=public_slingshot_name\n", 106 | ")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "53799c57-9c82-4cb2-9a73-f858a8725071", 112 | "metadata": { 113 | "slideshow": { 114 | "slide_type": "slide" 115 | }, 116 | "tags": [] 117 | }, 118 | "source": [ 119 | "# Communication with MPI.jl" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "332001ad-3b08-4ceb-b4e4-54e619451191", 125 | "metadata": { 126 | "slideshow": { 127 | "slide_type": "fragment" 128 | }, 129 | "tags": [] 130 | }, 131 | "source": [ 132 | "Picking up from the previous demo, we have a job with 4 ranks: " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "id": "0f6bc5b9-2973-4dc5-8fdd-bfd483f01460", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "fragment" 142 | }, 143 | "tags": [] 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | " From worker 5:\tHello world, I am 3 of 4 on 
nid200349\n", 151 | " From worker 2:\tHello world, I am 0 of 4 on nid200344\n", 152 | " From worker 4:\tHello world, I am 2 of 4 on nid200348\n", 153 | " From worker 3:\tHello world, I am 1 of 4 on nid200345\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "@mpi_do manager begin\n", 159 | " using MPI: MPI, Comm, Win, free\n", 160 | " comm = MPI.COMM_WORLD\n", 161 | " rank = MPI.Comm_rank(comm)\n", 162 | " size = MPI.Comm_size(comm)\n", 163 | " name = gethostname()\n", 164 | " println(\"Hello world, I am $(rank) of $(size) on $(name)\")\n", 165 | "end" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "7982d349-c25e-4bc9-9624-bbf6f2b6c8cc", 171 | "metadata": { 172 | "slideshow": { 173 | "slide_type": "slide" 174 | }, 175 | "tags": [] 176 | }, 177 | "source": [ 178 | "## Domain Decomposition" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "63c5872e-ab53-4871-8bf0-be59956fd42e", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": "fragment" 187 | }, 188 | "tags": [] 189 | }, 190 | "source": [ 191 | "PDE solvers often break up work over a \"grid\" of ranks (domain decomposition). This will find the dimension of this grid:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 4, 197 | "id": "1122c61b-aa2b-47e5-871f-ea7f2f1d501b", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "fragment" 201 | }, 202 | "tags": [] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "@mpi_do manager begin\n", 207 | " dims = [0]\n", 208 | " MPI.Dims_create!(size, dims)\n", 209 | "end" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "id": "612ccdb8-8e29-41cc-8c1f-af533e355715", 216 | "metadata": { 217 | "slideshow": { 218 | "slide_type": "fragment" 219 | }, 220 | "tags": [] 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | " From worker 3:\t[4]\n", 228 | " From worker 2:\t[4]\n", 229 | " From worker 4:\t[4]\n", 230 | " From worker 5:\t[4]\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "@mpi_do manager begin\n", 236 | " println(dims)\n", 237 | "end" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "id": "4ec74bff-4668-4c33-b93a-ee19f67551ac", 243 | "metadata": { 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | }, 247 | "tags": [] 248 | }, 249 | "source": [ 250 | "Each rank has the same value for `dims`. In $N$-dimensions, `length(dims) == N`." 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "id": "3b3679ec-dfac-46d7-97ef-e0ad10ffe295", 256 | "metadata": { 257 | "slideshow": { 258 | "slide_type": "slide" 259 | }, 260 | "tags": [] 261 | }, 262 | "source": [ 263 | "## Cartesian Grids" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "871f8fd5-7504-4b03-9a62-e63d3278d098", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "fragment" 272 | }, 273 | "tags": [] 274 | }, 275 | "source": [ 276 | "We will now lay out each rank in a \"grid\" (in this example, $N=1$ so it's actually a line). In the excercise, $N=2$, so this will be an actual \"grid\". The steps here are pretty much the same though." 
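To get a feel for the exercise's 2D case, the same call simply takes a two-element `dims` vector and lets MPI factor the rank count into a grid. A hedged sketch (not a notebook cell; `dims2` is a made-up name so the notebook's `dims` stays untouched):

```julia
# Sketch: 2D decomposition of the same 4 ranks; zeros mean "let MPI choose".
@mpi_do manager begin
    dims2 = [0, 0]
    MPI.Dims_create!(size, dims2)   # with 4 ranks this yields dims2 == [2, 2]
    println(dims2)
end
```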
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 6, 282 | "id": "c33bfb02-e341-40e4-8315-83734796a18b", 283 | "metadata": { 284 | "slideshow": { 285 | "slide_type": "fragment" 286 | }, 287 | "tags": [] 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "@mpi_do manager begin\n", 292 | " comm_cart = MPI.Cart_create(\n", 293 | " comm, # MPI Communicator\n", 294 | " dims, # Dimensions of grid\n", 295 | " [0], # 0 == not periodic, 1 == periodic\n", 296 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n", 297 | " )\n", 298 | " me = MPI.Comm_rank(comm_cart)\n", 299 | " coords = MPI.Cart_coords(comm_cart)\n", 300 | " neighbors = MPI.Cart_shift(\n", 301 | " comm_cart,\n", 302 | " 0, # Which dimension to shift (zero-indexed)\n", 303 | " 1, # Shift magnitude\n", 304 | " )\n", 305 | "end" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 7, 311 | "id": "e8cf1293-b416-415f-a14e-d529a9e3e7bc", 312 | "metadata": { 313 | "slideshow": { 314 | "slide_type": "subslide" 315 | }, 316 | "tags": [] 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "@mpi_do manager begin\n", 321 | " comm_cart = MPI.Cart_create(\n", 322 | " comm, # MPI Communicator\n", 323 | " dims, # Dimensions of grid\n", 324 | " [0], # 0 == not periodic, 1 == periodic\n", 325 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n", 326 | " )\n", 327 | " me = MPI.Comm_rank(comm_cart)\n", 328 | " coords = MPI.Cart_coords(comm_cart)\n", 329 | " neighbors = MPI.Cart_shift(\n", 330 | " comm_cart,\n", 331 | " 0, # Which dimension to shift (zero-indexed)\n", 332 | " 1, # Shift magnitude\n", 333 | " )\n", 334 | "end" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 8, 340 | "id": "d3ab1a58-0aea-4ec5-a79b-48bcd810c631", 341 | "metadata": { 342 | "slideshow": { 343 | "slide_type": "fragment" 344 | }, 345 | "tags": [] 346 | }, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | " From worker 2:\trank=0; coord=[0], neighbors=(-1, 1)\n", 353 | " From worker 3:\trank=1; coord=[1], neighbors=(0, 2)\n", 354 | " From worker 5:\trank=3; coord=[3], neighbors=(2, -1)\n", 355 | " From worker 4:\trank=2; coord=[2], neighbors=(1, 3)\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "@mpi_do manager begin\n", 361 | " println(\"rank=$(me); coord=$(coords), neighbors=$(neighbors)\")\n", 362 | "end" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 9, 368 | "id": "63bda425-3a47-4a1c-ba8b-ae3c891d3021", 369 | "metadata": { 370 | "slideshow": { 371 | "slide_type": "subslide" 372 | }, 373 | "tags": [] 374 | }, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | " From worker 5:\trank=3; coord=[3], neighbors=(2, -1)\n", 381 | " From worker 2:\trank=0; coord=[0], neighbors=(-1, 1)\n", 382 | " From worker 4:\trank=2; coord=[2], neighbors=(1, 3)\n", 383 | " From worker 3:\trank=1; coord=[1], neighbors=(0, 2)\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "@mpi_do manager begin\n", 389 | " println(\"rank=$(me); coord=$(coords), neighbors=$(neighbors)\")\n", 390 | "end" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "id": "b80b410a-c68c-4e38-ab1c-e355c4d20d8c", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "fragment" 399 | }, 400 | "tags": [] 401 | }, 402 | "source": [ 403 | "MPI contains several constants, for example what `-1` means in the context above. 
This means that there is \"no neighbor\" there:" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 10, 409 | "id": "94bc63d1-24cc-47f6-a6ab-4624d95523fd", 410 | "metadata": { 411 | "slideshow": { 412 | "slide_type": "fragment" 413 | }, 414 | "tags": [] 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "-1" 421 | ] 422 | }, 423 | "execution_count": 10, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "MPI.PROC_NULL" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "id": "b165e80a-91ce-4233-a8e4-4bd3f09786c1", 435 | "metadata": { 436 | "slideshow": { 437 | "slide_type": "slide" 438 | }, 439 | "tags": [] 440 | }, 441 | "source": [ 442 | "## Point-to-point Communication" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "id": "07f6f278-6dc3-4042-aa06-4abc9a7fa7f4", 448 | "metadata": { 449 | "slideshow": { 450 | "slide_type": "fragment" 451 | }, 452 | "tags": [] 453 | }, 454 | "source": [ 455 | "Let's do something harder:\n", 456 | "1. Each rank draws a random number between 1 and 100\n", 457 | "2. Each rank's random number is shared with its neighbors" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "id": "b286f218-4851-4f11-b3e2-550635a2c688", 463 | "metadata": { 464 | "slideshow": { 465 | "slide_type": "fragment" 466 | }, 467 | "tags": [] 468 | }, 469 | "source": [ 470 | "This is an example of point-to-point communication on a grid. We'll be using the same communication pattern in the excercise." 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "45478166-3101-4380-9149-e9ee101b3b06", 476 | "metadata": { 477 | "slideshow": { 478 | "slide_type": "subslide" 479 | }, 480 | "tags": [] 481 | }, 482 | "source": [ 483 | "First we generate a andom number on each rank" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 11, 489 | "id": "e5187bd3-8699-4a3b-a43c-28d4a647cdc0", 490 | "metadata": { 491 | "slideshow": { 492 | "slide_type": "fragment" 493 | }, 494 | "tags": [] 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "@mpi_do manager begin\n", 499 | " using Random\n", 500 | " my_int = rand(1:100)\n", 501 | "end" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 12, 507 | "id": "a926edfd-9b22-4e33-851d-6d9e26429065", 508 | "metadata": { 509 | "slideshow": { 510 | "slide_type": "fragment" 511 | }, 512 | "tags": [] 513 | }, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | " From worker 2:\trank=0; my_int=38\n", 520 | " From worker 4:\trank=2; my_int=29\n", 521 | " From worker 5:\trank=3; my_int=70\n", 522 | " From worker 3:\trank=1; my_int=71\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "@mpi_do manager begin\n", 528 | " println(\"rank=$(me); my_int=$(my_int)\")\n", 529 | "end" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "id": "064d74de-4b8a-4962-a521-f620f8164cae", 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "subslide" 538 | }, 539 | "tags": [] 540 | }, 541 | "source": [ 542 | "MPI uses zero-copy memory access => we need to set up buffers (arrays) to send and receive data." 
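In other words, `MPI.Isend`/`MPI.Irecv!` work on memory you hand them rather than on returned values, so even a single integer travels inside a 1-element array. A tiny sketch of the idea (the following cells do exactly this, once per neighbor):

```julia
# Sketch: scalar payloads are staged in, and read back from, 1-element buffers.
send_buf = zeros(Int64, 1)    # outgoing storage
recv_buf = zeros(Int64, 1)    # Irecv! fills this in place
send_buf[1] = 42              # 42 stands in for my_int; copyto!(send_buf, 42) is equivalent
# ...after the exchange completes, the received value sits in recv_buf[1]
```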
543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 13, 548 | "id": "343bd286-e07b-49b6-8342-ebd85b1a2af7", 549 | "metadata": { 550 | "slideshow": { 551 | "slide_type": "fragment" 552 | }, 553 | "tags": [] 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "@mpi_do manager begin\n", 558 | " send_1 = zeros(Int64, 1)\n", 559 | " send_2 = zeros(Int64, 1)\n", 560 | " recv_1 = zeros(Int64, 1)\n", 561 | " recv_2 = zeros(Int64, 1)\n", 562 | "end" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "id": "5669bf32-cc11-42b3-b353-31b3231999b4", 568 | "metadata": { 569 | "slideshow": { 570 | "slide_type": "fragment" 571 | }, 572 | "tags": [] 573 | }, 574 | "source": [ 575 | "Now we fill the buffers by copying out data into it -- wherever a buffer is needed." 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 14, 581 | "id": "48a0fa62-2cd2-4071-9046-958e0b335916", 582 | "metadata": { 583 | "slideshow": { 584 | "slide_type": "fragment" 585 | }, 586 | "tags": [] 587 | }, 588 | "outputs": [], 589 | "source": [ 590 | "@mpi_do manager begin\n", 591 | " if neighbors[1] != MPI.PROC_NULL\n", 592 | " copyto!(send_1, my_int)\n", 593 | " end\n", 594 | " if neighbors[2] != MPI.PROC_NULL\n", 595 | " copyto!(send_2, my_int)\n", 596 | " end \n", 597 | "end" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "id": "b79dfa66-e9c8-455f-b658-004e49ea4df2", 603 | "metadata": { 604 | "slideshow": { 605 | "slide_type": "subslide" 606 | }, 607 | "tags": [] 608 | }, 609 | "source": [ 610 | "Now we're ready to perform a data transfer with MPI. MPI is (largely) transaction based. There is a receiving end, and a sending end. In order for a send to be successful, the receiver must be ready to receive." 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "id": "2d89f9e2-2527-4700-9eeb-600c1844eb06", 616 | "metadata": { 617 | "slideshow": { 618 | "slide_type": "fragment" 619 | }, 620 | "tags": [] 621 | }, 622 | "source": [ 623 | "To help coordinate all of this, we set up a request store:" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 15, 629 | "id": "c05abe1a-8d67-4aff-9191-d135272ca4be", 630 | "metadata": { 631 | "slideshow": { 632 | "slide_type": "fragment" 633 | }, 634 | "tags": [] 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "@mpi_do manager begin\n", 639 | " reqs = MPI.MultiRequest(4)\n", 640 | "end" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "id": "2256d83d-f6fe-4bed-88d0-e405f53dd664", 646 | "metadata": { 647 | "slideshow": { 648 | "slide_type": "subslide" 649 | }, 650 | "tags": [] 651 | }, 652 | "source": [ 653 | "And we transfer the data using non-blocking MPI communivation (`Isend` and `Irecv`). 
Pro tip: initiate receive before send" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 16, 659 | "id": "d847d757-71b4-4d62-8faa-228962bb4794", 660 | "metadata": { 661 | "slideshow": { 662 | "slide_type": "fragment" 663 | }, 664 | "tags": [] 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "@mpi_do manager begin\n", 669 | " # Initiate data reciever\n", 670 | " if neighbors[1] != MPI.PROC_NULL\n", 671 | " MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])\n", 672 | " end\n", 673 | " if neighbors[2] != MPI.PROC_NULL\n", 674 | " MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])\n", 675 | " end\n", 676 | " # Send data\n", 677 | " if neighbors[1] != MPI.PROC_NULL\n", 678 | " MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])\n", 679 | " end\n", 680 | " if neighbors[2] != MPI.PROC_NULL\n", 681 | " MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])\n", 682 | " end\n", 683 | "end" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "id": "4b2ef6ff-dead-4aff-981c-21407f01c9ef", 689 | "metadata": { 690 | "slideshow": { 691 | "slide_type": "fragment" 692 | }, 693 | "tags": [] 694 | }, 695 | "source": [ 696 | "Notice how we tagged data with `source` and `dest`. This makes sure that data is received in the correct order (the middle ranks receive data from _both_ sides), and -- in the case of `Isend` -- that the data is sent to the correct rank." 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "id": "54ed66db-1274-4efe-a2a0-a8b2b7986527", 702 | "metadata": { 703 | "slideshow": { 704 | "slide_type": "subslide" 705 | }, 706 | "tags": [] 707 | }, 708 | "source": [ 709 | "When using non-blocking communication, it's good to wait for all transactions to be completed before using the buffers:" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 17, 715 | "id": "113d8a31-1834-4d6d-931f-3991592e7ab5", 716 | "metadata": { 717 | "slideshow": { 718 | "slide_type": "fragment" 719 | }, 720 | "tags": [] 721 | }, 722 | "outputs": [], 723 | "source": [ 724 | "@mpi_do manager begin\n", 725 | " # Wait for all requests to finish\n", 726 | " MPI.Waitall(reqs)\n", 727 | "end" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "id": "26e5ed91-9afd-4764-b875-ebbf924dc077", 733 | "metadata": { 734 | "slideshow": { 735 | "slide_type": "subslide" 736 | }, 737 | "tags": [] 738 | }, 739 | "source": [ 740 | "Let's take a look at what we've transferred:" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 19, 746 | "id": "c7f159d3-e651-4795-b63b-2b49a03af961", 747 | "metadata": { 748 | "slideshow": { 749 | "slide_type": "fragment" 750 | }, 751 | "tags": [] 752 | }, 753 | "outputs": [ 754 | { 755 | "name": "stdout", 756 | "output_type": "stream", 757 | "text": [ 758 | " From worker 4:\trank=2; my_int=29; prev=[71]; next=[70]\n", 759 | " From worker 2:\trank=0; my_int=38; prev=[0]; next=[71]\n", 760 | " From worker 5:\trank=3; my_int=70; prev=[29]; next=[0]\n", 761 | " From worker 3:\trank=1; my_int=71; prev=[38]; next=[29]\n" 762 | ] 763 | }, 764 | { 765 | "ename": "KeyError", 766 | "evalue": "KeyError: key \"usage_request\" not found", 767 | "output_type": "error", 768 | "traceback": [ 769 | "KERNEL EXCEPTION", 770 | "KeyError: key \"usage_request\" not found", 771 | "", 772 | "Stacktrace:", 773 | " [1] getindex(h::Dict{String, Function}, key::String)", 774 | " @ Base ./dict.jl:484", 775 | " [2] eventloop(socket::ZMQ.Socket)", 776 | " @ IJulia 
~/.julia/packages/IJulia/Vo51o/src/eventloop.jl:8", 777 | " [3] (::IJulia.var\"#14#17\")()", 778 | " @ IJulia ./task.jl:514" 779 | ] 780 | }, 781 | { 782 | "ename": "KeyError", 783 | "evalue": "KeyError: key \"usage_request\" not found", 784 | "output_type": "error", 785 | "traceback": [ 786 | "KERNEL EXCEPTION", 787 | "KeyError: key \"usage_request\" not found", 788 | "", 789 | "Stacktrace:", 790 | " [1] getindex(h::Dict{String, Function}, key::String)", 791 | " @ Base ./dict.jl:484", 792 | " [2] eventloop(socket::ZMQ.Socket)", 793 | " @ IJulia ~/.julia/packages/IJulia/Vo51o/src/eventloop.jl:8", 794 | " [3] (::IJulia.var\"#14#17\")()", 795 | " @ IJulia ./task.jl:514" 796 | ] 797 | }, 798 | { 799 | "ename": "KeyError", 800 | "evalue": "KeyError: key \"usage_request\" not found", 801 | "output_type": "error", 802 | "traceback": [ 803 | "KERNEL EXCEPTION", 804 | "KeyError: key \"usage_request\" not found", 805 | "", 806 | "Stacktrace:", 807 | " [1] getindex(h::Dict{String, Function}, key::String)", 808 | " @ Base ./dict.jl:484", 809 | " [2] eventloop(socket::ZMQ.Socket)", 810 | " @ IJulia ~/.julia/packages/IJulia/Vo51o/src/eventloop.jl:8", 811 | " [3] (::IJulia.var\"#14#17\")()", 812 | " @ IJulia ./task.jl:514" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "@mpi_do manager begin\n", 818 | " println(\n", 819 | " \"rank=$(me); \" *\n", 820 | " \"my_int=$(my_int); prev=$(recv_1); next=$(recv_2)\"\n", 821 | " )\n", 822 | "end" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "id": "88e46e3b-f8d4-48d5-b8fc-2ae660f5a4a8", 829 | "metadata": { 830 | "slideshow": { 831 | "slide_type": "skip" 832 | }, 833 | "tags": [] 834 | }, 835 | "outputs": [], 836 | "source": [] 837 | } 838 | ], 839 | "metadata": { 840 | "kernelspec": { 841 | "display_name": "Julia 1.9.4", 842 | "language": "julia", 843 | "name": "julia-1.9.4" 844 | }, 845 | "language_info": { 846 | "file_extension": ".jl", 847 | "mimetype": "application/julia", 848 | "name": "julia", 849 | "version": "1.9.4" 850 | } 851 | }, 852 | "nbformat": 4, 853 | "nbformat_minor": 5 854 | } 855 | -------------------------------------------------------------------------------- /parts/mpi/explanation/03_halo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7d81f9b4-89d8-4597-a458-4bfff3c27b81", 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "skip" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# Setup\n", 14 | "\n", 15 | "Note: you might need to run `Pkg.instantiate()` to ensure that the `Manifest.toml` is up to date. This only needs to be done once." 
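Aside: the point-to-point pattern that the previous notebook built up step by step can also be condensed into a single standalone script and launched with `mpiexecjl` instead of a cluster manager. A hedged sketch (the file name and the exact launch line are assumptions):

```julia
# neighbor_exchange.jl (name assumed) -- launched e.g. with:
#   mpiexecjl --project=. -n 4 julia neighbor_exchange.jl
using MPI
using Random
MPI.Init()

comm  = MPI.COMM_WORLD
nproc = MPI.Comm_size(comm)

# 1D, non-periodic process grid, mirroring the notebook's Cart_create call
dims = [0]
MPI.Dims_create!(nproc, dims)
comm_cart = MPI.Cart_create(comm, dims, [0], 1)
me        = MPI.Comm_rank(comm_cart)
neighbors = MPI.Cart_shift(comm_cart, 0, 1)   # (previous, next)

# Each rank draws a random number and swaps it with its neighbors
my_int = rand(1:100)
send_1, send_2 = [my_int], [my_int]
recv_1, recv_2 = [0], [0]

reqs = MPI.MultiRequest(4)
(neighbors[1] != MPI.PROC_NULL) && MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])
(neighbors[2] != MPI.PROC_NULL) && MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])
(neighbors[1] != MPI.PROC_NULL) && MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])
(neighbors[2] != MPI.PROC_NULL) && MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])
MPI.Waitall(reqs)

println("rank=$(me); my_int=$(my_int); prev=$(recv_1); next=$(recv_2)")
MPI.Finalize()
```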
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "df64b70e-4682-4885-b055-056bc4e88a59", 22 | "metadata": { 23 | "slideshow": { 24 | "slide_type": "skip" 25 | }, 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n", 41 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n", 42 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n", 43 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n", 44 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n", 45 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n", 46 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n", 47 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "import Pkg;\n", 53 | "Pkg.activate(@__DIR__)\n", 54 | "Pkg.status()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "id": "cd1e1253-87a0-47b9-a225-33dffac6d33f", 61 | "metadata": { 62 | "slideshow": { 63 | "slide_type": "skip" 64 | }, 65 | "tags": [] 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "\"nid200360-hsn0\"" 72 | ] 73 | }, 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "using MPI\n", 81 | "\n", 82 | "using NetworkInterfaceControllers, Sockets\n", 83 | "interfaces = NetworkInterfaceControllers.get_interface_data(IPv4)\n", 84 | "\n", 85 | "hsn0_public = filter(x->(x.name==\"hsn0:chn\" && x.version==:v4), interfaces) |> only \n", 86 | "public_slingshot_name = getnameinfo(hsn0_public.ip)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "id": "68377016-c3df-4a1c-9c42-150d6af80de8", 93 | "metadata": { 94 | "slideshow": { 95 | "slide_type": "skip" 96 | }, 97 | "tags": [] 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "4-element Vector{Int64}:\n", 104 | " 2\n", 105 | " 3\n", 106 | " 4\n", 107 | " 5" 108 | ] 109 | }, 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# to import MPIManager\n", 117 | "using MPIClusterManagers\n", 118 | "\n", 119 | "# need to also import Distributed to use addprocs()\n", 120 | "using Distributed\n", 121 | "\n", 122 | "# specify, number of mpi workers, launch cmd, etc.\n", 123 | "manager=MPIWorkerManager(4)\n", 124 | "\n", 125 | "# start mpi workers and add them as julia workers too.\n", 126 | "addprocs(\n", 127 | " manager,\n", 128 | " exeflags=`--project=$(Base.active_project())`,\n", 129 | " master_tcp_interface=public_slingshot_name\n", 130 | ")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "id": "b243745d-ad52-4d52-873a-b9bc6575054a", 137 | "metadata": { 138 | "slideshow": { 139 | "slide_type": "skip" 140 | }, 141 | "tags": [] 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | " From worker 5:\tHello world, I am 3 of 4 on nid200365\n", 149 | " From worker 2:\tHello world, I am 0 of 4 on nid200360\n", 150 | " From worker 4:\tHello world, I am 2 of 4 
on nid200364\n", 151 | " From worker 3:\tHello world, I am 1 of 4 on nid200361\n" 152 | ] 153 | }, 154 | { 155 | "ename": "KeyError", 156 | "evalue": "KeyError: key \"usage_request\" not found", 157 | "output_type": "error", 158 | "traceback": [ 159 | "KERNEL EXCEPTION", 160 | "KeyError: key \"usage_request\" not found", 161 | "", 162 | "Stacktrace:", 163 | " [1] getindex(h::Dict{String, Function}, key::String)", 164 | " @ Base ./dict.jl:484", 165 | " [2] eventloop(socket::ZMQ.Socket)", 166 | " @ IJulia ~/.julia/packages/IJulia/Vo51o/src/eventloop.jl:8", 167 | " [3] (::IJulia.var\"#14#17\")()", 168 | " @ IJulia ./task.jl:514" 169 | ] 170 | }, 171 | { 172 | "ename": "KeyError", 173 | "evalue": "KeyError: key \"usage_request\" not found", 174 | "output_type": "error", 175 | "traceback": [ 176 | "KERNEL EXCEPTION", 177 | "KeyError: key \"usage_request\" not found", 178 | "", 179 | "Stacktrace:", 180 | " [1] getindex(h::Dict{String, Function}, key::String)", 181 | " @ Base ./dict.jl:484", 182 | " [2] eventloop(socket::ZMQ.Socket)", 183 | " @ IJulia ~/.julia/packages/IJulia/Vo51o/src/eventloop.jl:8", 184 | " [3] (::IJulia.var\"#14#17\")()", 185 | " @ IJulia ./task.jl:514" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "@mpi_do manager begin\n", 191 | " using MPI: MPI, Comm, Win, free\n", 192 | " comm = MPI.COMM_WORLD\n", 193 | " rank = MPI.Comm_rank(comm)\n", 194 | " mpi_size = MPI.Comm_size(comm) # don't use \"size\" as this overwrites the `size` function\n", 195 | " name = gethostname()\n", 196 | " println(\"Hello world, I am $(rank) of $(mpi_size) on $(name)\")\n", 197 | "end" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 6, 203 | "id": "f4197ff5-6ba6-4964-aca4-178147857b74", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "skip" 207 | }, 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "@mpi_do manager begin\n", 213 | " dims = [0]\n", 214 | " MPI.Dims_create!(mpi_size, dims)\n", 215 | "end" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 7, 221 | "id": "e56347d7-018b-4daa-8b0f-7934a3097718", 222 | "metadata": { 223 | "slideshow": { 224 | "slide_type": "skip" 225 | }, 226 | "tags": [] 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "@mpi_do manager begin\n", 231 | " comm_cart = MPI.Cart_create(\n", 232 | " comm, # MPI Communicator\n", 233 | " dims, # Dimensions of grid\n", 234 | " [0], # 0 == not periodic, 1 == periodic\n", 235 | " 1, # 0 == not allowed to reorder, 1 == allowed to reoder\n", 236 | " )\n", 237 | " me = MPI.Comm_rank(comm_cart)\n", 238 | " coords = MPI.Cart_coords(comm_cart)\n", 239 | " neighbors = MPI.Cart_shift(\n", 240 | " comm_cart,\n", 241 | " 0, # Which dimension to shift (zero-indexed)\n", 242 | " 1, # Shift magnitude\n", 243 | " )\n", 244 | "end" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "e591dff1-e930-405a-aced-7ba54ef75164", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "slide" 253 | }, 254 | "tags": [] 255 | }, 256 | "source": [ 257 | "# Halo Exchange" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "5eac60ff-cd2d-4561-bf87-a732e93cdbc5", 263 | "metadata": { 264 | "slideshow": { 265 | "slide_type": "fragment" 266 | }, 267 | "tags": [] 268 | }, 269 | "source": [ 270 | "When cast into the discrete form:\n", 271 | "\n", 272 | "$$\n", 273 | "\\partial_t x = -D \\mathrm{div}(\\mathrm{grad}(x)) \\\\\n", 274 | "\\Delta_t x = -D \\frac{q_i - q_{i-1}}{\\Delta s} = \\frac{(x_{i+1} - x_i) - (x_{i} - 
x_{i-1})}{(\\Delta s)^2} = \\frac{x_{i+1} + 2 x_i - x_{i-1}}{(\\Delta s)^2}\n", 275 | "$$\n", 276 | "\n", 277 | "The diffusion equation has a stencil width of 2, but the necessary halo only needs 1 cell to be transferred:" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "c67e1b1e-7bec-4b02-bcd8-4fecefd8170b", 283 | "metadata": { 284 | "slideshow": { 285 | "slide_type": "subslide" 286 | }, 287 | "tags": [] 288 | }, 289 | "source": [ 290 | "![1D_halo](l8_1D_global_grid.png)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "7ebe2ba1-2ea3-498d-be6e-bd34d6a50ad9", 296 | "metadata": { 297 | "slideshow": { 298 | "slide_type": "subslide" 299 | }, 300 | "tags": [] 301 | }, 302 | "source": [ 303 | "In 2D this will look as follows:\n", 304 | "\n", 305 | "![2D_halo](diffusion_2d_halo_exchange.png)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "d22a1ac9-cc48-4bed-87fc-a2113ebb8067", 311 | "metadata": { 312 | "slideshow": { 313 | "slide_type": "slide" 314 | }, 315 | "tags": [] 316 | }, 317 | "source": [ 318 | "## 1D Solver Example" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "id": "bc43046b-6490-429c-bd4e-a442e0c2cafd", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "fragment" 327 | }, 328 | "tags": [] 329 | }, 330 | "source": [ 331 | "Let's set up a basic example: 1D diffusion! First we need some parameters:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 245, 337 | "id": "49fc3f16-27d8-4589-8f89-76d868c4f3c1", 338 | "metadata": { 339 | "slideshow": { 340 | "slide_type": "fragment" 341 | }, 342 | "tags": [] 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "@mpi_do manager begin\n", 347 | " D = 1e-4\n", 348 | " ds = 1e-4\n", 349 | " dt = ds^2 / D / 8.2 \n", 350 | " qx(ix, D, C, ds) = -D * (C[ix+1, 1] - C[ix, 1]) / ds\n", 351 | "end" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "id": "2062fc6f-d631-46a8-8908-08db59eb3c43", 357 | "metadata": { 358 | "slideshow": { 359 | "slide_type": "subslide" 360 | }, 361 | "tags": [] 362 | }, 363 | "source": [ 364 | "We can now iterate over the local array (which has a halo of 2 cells):" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 248, 370 | "id": "b1f885ad-b823-45da-a33c-8ab615425362", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "fragment" 374 | }, 375 | "tags": [] 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "@mpi_do manager begin\n", 380 | " function step_diffusion!(C2, C)\n", 381 | " for i in 1:size(C, 1) - 2\n", 382 | " C2[i+1] = C[i+1] - dt * (qx(i+1, D, C, ds) - qx(i, D, C, ds)) / ds\n", 383 | " end\n", 384 | " end\n", 385 | "end" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "3f237d99-4457-4e0f-abbf-2dbb676ef837", 391 | "metadata": { 392 | "slideshow": { 393 | "slide_type": "subslide" 394 | }, 395 | "tags": [] 396 | }, 397 | "source": [ 398 | "We set up an initial condition where a single cell at the edge of domain 2 (rank 1) is non-zero. Recall that the halo is 2-cells wide => `C[8]` is at the very end of domain 2." 
399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 246, 404 | "id": "e0085955-4b07-4942-b471-7f9a130ab908", 405 | "metadata": { 406 | "slideshow": { 407 | "slide_type": "fragment" 408 | }, 409 | "tags": [] 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "@mpi_do manager begin\n", 414 | " C = zeros(10, 1)\n", 415 | " if rank == 1\n", 416 | " C[8] = 1/ds\n", 417 | " end\n", 418 | "end" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 247, 424 | "id": "9c67b1a0-3dfa-44a6-b0a7-32e2cf550db8", 425 | "metadata": { 426 | "slideshow": { 427 | "slide_type": "fragment" 428 | }, 429 | "tags": [] 430 | }, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | " From worker 2:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 437 | " From worker 4:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 438 | " From worker 5:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 439 | " From worker 3:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 10000.0; 0.0; 0.0;;]\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "@mpi_do manager begin\n", 445 | " println(C)\n", 446 | "end" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 249, 452 | "id": "b09f3b69-a733-467a-84ed-a91b577c89ba", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "fragment" 456 | }, 457 | "tags": [] 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "@mpi_do manager begin\n", 462 | " C2 = similar(C)\n", 463 | " fill!(C2, 0.)\n", 464 | "end" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "id": "f8307d70-8e35-4285-bca5-63ed716a417a", 470 | "metadata": { 471 | "slideshow": { 472 | "slide_type": "slide" 473 | }, 474 | "tags": [] 475 | }, 476 | "source": [ 477 | "## Halo Exchanges in 1D" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "id": "1e075e1e-0575-4a32-b5ad-da0fa724279b", 483 | "metadata": { 484 | "slideshow": { 485 | "slide_type": "subslide" 486 | }, 487 | "tags": [] 488 | }, 489 | "source": [ 490 | "In the previous example we exchanged `Int64`, now we're going to tranfer `Float64`" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 8, 496 | "id": "d36f644b-65fb-44d0-998c-09c74e805235", 497 | "metadata": { 498 | "slideshow": { 499 | "slide_type": "fragment" 500 | }, 501 | "tags": [] 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "@mpi_do manager begin\n", 506 | " send_1 = zeros(Float64, 1)\n", 507 | " send_2 = zeros(Float64, 1)\n", 508 | " recv_1 = zeros(Float64, 1)\n", 509 | " recv_2 = zeros(Float64, 1)\n", 510 | "end" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "8f3b2de0-563f-4bb0-acb7-99d7be2a2c66", 516 | "metadata": { 517 | "slideshow": { 518 | "slide_type": "subslide" 519 | }, 520 | "tags": [] 521 | }, 522 | "source": [ 523 | "We set up a halo-exchange function using the previous section's point-to-point communication pattern" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 257, 529 | "id": "a6891fa9-6d92-4c65-b791-2aa4246e1e2e", 530 | "metadata": { 531 | "slideshow": { 532 | "slide_type": "fragment" 533 | }, 534 | "tags": [] 535 | }, 536 | "outputs": [], 537 | "source": [ 538 | "@mpi_do manager begin\n", 539 | " function halo_exchange!(A)\n", 540 | " # Copy to buffers\n", 541 | " (neighbors[1] != MPI.PROC_NULL) && copyto!(send_1, A[2:2, 1])\n", 542 | " (neighbors[2] != MPI.PROC_NULL) && copyto!(send_2, A[(end-1):(end-1), 1]) \n", 543 | " # Request handler\n", 
544 | " reqs = MPI.MultiRequest(4)\n", 545 | " # Initiate data reciever\n", 546 | " (neighbors[1] != MPI.PROC_NULL) && MPI.Irecv!(recv_1, comm_cart, reqs[1]; source=neighbors[1])\n", 547 | " (neighbors[2] != MPI.PROC_NULL) && MPI.Irecv!(recv_2, comm_cart, reqs[2]; source=neighbors[2])\n", 548 | " # Send data\n", 549 | " (neighbors[1] != MPI.PROC_NULL) && MPI.Isend(send_1, comm_cart, reqs[3]; dest=neighbors[1])\n", 550 | " (neighbors[2] != MPI.PROC_NULL) && MPI.Isend(send_2, comm_cart, reqs[4]; dest=neighbors[2])\n", 551 | " # Block until all transactions are done before touching buffers\n", 552 | " MPI.Waitall(reqs) \n", 553 | " # Copy from buffers (copyto! needs a pointer to the cell)\n", 554 | " r1 = @view A[1:1, 1] \n", 555 | " r2 = @view A[end:end, 1]\n", 556 | " (neighbors[1] != MPI.PROC_NULL) && copyto!(r1, recv_1)\n", 557 | " (neighbors[2] != MPI.PROC_NULL) && copyto!(r2, recv_2)\n", 558 | " end\n", 559 | "end" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "id": "1944ce97-2586-42b5-b954-b5b4a587766c", 565 | "metadata": { 566 | "slideshow": { 567 | "slide_type": "subslide" 568 | }, 569 | "tags": [] 570 | }, 571 | "source": [ 572 | "Let's run 1 step of the diffusion algorithm to see how the halo exchane works:" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 250, 578 | "id": "871692b2-3589-4e13-9889-bad943325e23", 579 | "metadata": { 580 | "slideshow": { 581 | "slide_type": "fragment" 582 | }, 583 | "tags": [] 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "@mpi_do manager begin\n", 588 | " step_diffusion!(C2, C)\n", 589 | " halo_exchange!(C2)\n", 590 | " C, C2 = C2, C\n", 591 | "end" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 252, 597 | "id": "7ccd49d0-3eee-46ea-a888-8094047e3bd8", 598 | "metadata": { 599 | "slideshow": { 600 | "slide_type": "fragment" 601 | }, 602 | "tags": [] 603 | }, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | " From worker 5:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 610 | " From worker 4:\t[1219.5121951219512; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 611 | " From worker 2:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]\n", 612 | " From worker 3:\t[0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 1219.5121951219512; 7560.975609756098; 1219.5121951219512; 0.0;;]\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "@mpi_do manager begin\n", 618 | " println(C)\n", 619 | "end" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "2dc8e0fd-66e7-43aa-9035-95e84915971b", 626 | "metadata": { 627 | "slideshow": { 628 | "slide_type": "skip" 629 | }, 630 | "tags": [] 631 | }, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "Julia 1.9.4", 639 | "language": "julia", 640 | "name": "julia-1.9.4" 641 | }, 642 | "language_info": { 643 | "file_extension": ".jl", 644 | "mimetype": "application/julia", 645 | "name": "julia", 646 | "version": "1.9.4" 647 | } 648 | }, 649 | "nbformat": 4, 650 | "nbformat_minor": 5 651 | } 652 | -------------------------------------------------------------------------------- /parts/mpi/explanation/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" 3 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 4 | CpuId = "adafc99b-e345-5852-983c-f28acb93d879" 5 | Hwloc = 
"0e44f5e4-bd66-52a0-8798-143a42290a1d" 6 | MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" 7 | MPIClusterManagers = "e7922434-ae4b-11e9-05c5-9780451d2c66" 8 | NetworkInterfaceControllers = "6f74fd91-2978-43ad-8164-3af8c0ec0142" 9 | -------------------------------------------------------------------------------- /parts/mpi/explanation/advanced/00_gpu_select.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7fc2f000-ba64-483f-99d7-37b7f24969d1", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stderr", 13 | "output_type": "stream", 14 | "text": [ 15 | "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m project at `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation`\n" 16 | ] 17 | }, 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "\u001b[32m\u001b[1mStatus\u001b[22m\u001b[39m `/global/u1/b/blaschke/juliacon24-hpcworkshop/parts/mpi/explanation/Project.toml`\n", 23 | " \u001b[90m[1520ce14] \u001b[39mAbstractTrees v0.4.5\n", 24 | " \u001b[90m[052768ef] \u001b[39mCUDA v5.4.2\n", 25 | " \u001b[90m[adafc99b] \u001b[39mCpuId v0.3.1\n", 26 | " \u001b[90m[0e44f5e4] \u001b[39mHwloc v3.0.1\n", 27 | " \u001b[90m[da04e1cc] \u001b[39mMPI v0.20.20\n", 28 | " \u001b[90m[e7922434] \u001b[39mMPIClusterManagers v0.2.4\n", 29 | " \u001b[90m[6f74fd91] \u001b[39mNetworkInterfaceControllers v0.1.0\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import Pkg;\n", 35 | "Pkg.activate(@__DIR__)\n", 36 | "Pkg.status()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "2fffd2fb-ff8c-45a3-963a-06e40f4511f7", 43 | "metadata": { 44 | "tags": [] 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "cpucycle_coreid (generic function with 1 method)" 51 | ] 52 | }, 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "using CpuId\n", 60 | "\n", 61 | "const cpucycle_mask = (\n", 62 | " (1 << (64 - leading_zeros(CpuId.cputhreads()))) - 1\n", 63 | ") % UInt32\n", 64 | "\n", 65 | "cpucycle_coreid() = Int(cpucycle_id()[2] & cpucycle_mask)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "id": "a38f335a-0c2c-43c3-bd9a-45656331d464", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "13" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "cpucycle_coreid()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "id": "0ab0999d-bcac-4a10-885a-c689eda97924", 95 | "metadata": { 96 | "tags": [] 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "using MPI, CUDA" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "id": "7bc9f6ae-0ba6-4206-84d6-ce4dc6576f24", 107 | "metadata": { 108 | "tags": [] 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "MPIPreferences:\n", 116 | " binary: system\n", 117 | " abi: MPICH\n", 118 | " libmpi: libmpi_gnu_123.so\n", 119 | " mpiexec: srun\n", 120 | "\n", 121 | "Package versions\n", 122 | " MPI.jl: 0.20.20\n", 123 | " MPIPreferences.jl: 0.1.11\n", 124 | "\n", 125 | "Library information:\n", 126 | " libmpi: libmpi_gnu_123.so\n", 127 | " libmpi dlpath: /opt/cray/pe/lib64/libmpi_gnu_123.so\n", 
128 | " MPI version: 3.1.0\n", 129 | " Library version: \n", 130 | " MPI VERSION : CRAY MPICH version 8.1.28.29 (ANL base 3.4a2)\n", 131 | " MPI BUILD INFO : Wed Nov 15 20:57 2023 (git hash 1cde46f)\n", 132 | " \n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "MPI.versioninfo()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "id": "27fb385f-7c83-421f-b77d-a59289004f8e", 144 | "metadata": { 145 | "tags": [] 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "CUDA runtime 12.2, local installation\n", 153 | "CUDA driver 12.2\n", 154 | "NVIDIA driver 525.105.17\n", 155 | "\n", 156 | "CUDA libraries: \n", 157 | "- CUBLAS: 12.2.1\n", 158 | "- CURAND: 10.3.3\n", 159 | "- CUFFT: 11.0.8\n", 160 | "- CUSOLVER: 11.5.0\n", 161 | "- CUSPARSE: 12.1.1\n", 162 | "- CUPTI: 20.0.0\n", 163 | "- NVML: 12.0.0+525.105.17\n", 164 | "\n", 165 | "Julia packages: \n", 166 | "- CUDA: 5.4.2\n", 167 | "- CUDA_Driver_jll: 0.9.1+1\n", 168 | "- CUDA_Runtime_jll: 0.14.1+0\n", 169 | "- CUDA_Runtime_Discovery: 0.3.4\n", 170 | "\n", 171 | "Toolchain:\n", 172 | "- Julia: 1.9.4\n", 173 | "- LLVM: 14.0.6\n", 174 | "\n", 175 | "Preferences:\n", 176 | "- CUDA_Runtime_jll.version: 12.2\n", 177 | "- CUDA_Runtime_jll.local: true\n", 178 | "\n", 179 | "4 devices:\n", 180 | " 0: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n", 181 | " 1: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n", 182 | " 2: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n", 183 | " 3: NVIDIA A100-SXM4-80GB (sm_80, 79.150 GiB / 80.000 GiB available)\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "CUDA.versioninfo()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 7, 194 | "id": "57b8e1ad-c17a-4af5-abda-a6bddb59c15f", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "skip" 198 | }, 199 | "tags": [] 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "filter (generic function with 26 methods)" 206 | ] 207 | }, 208 | "execution_count": 7, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "import Base: filter, Fix1\n", 215 | "filter(f::Function)::Function = Fix1(filter, f)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 8, 221 | "id": "f51cb426-9357-4ed3-9ca9-319e81bc4f69", 222 | "metadata": { 223 | "tags": [] 224 | }, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "get_device_attributes (generic function with 1 method)" 230 | ] 231 | }, 232 | "execution_count": 8, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "function get_device_attributes()\n", 239 | " attr = Dict{Tuple{Int32, Int32}, Int32}()\n", 240 | " for i in 0:(ndevices()-1)\n", 241 | " d = CuDevice(i)\n", 242 | " attr[(\n", 243 | " attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID),\n", 244 | " attribute(d, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)\n", 245 | " )] = d\n", 246 | " end\n", 247 | " attr\n", 248 | "end" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 9, 254 | "id": "bc7e78ce-3ed7-4663-981e-99da96e9f5c7", 255 | "metadata": { 256 | "slideshow": { 257 | "slide_type": "skip" 258 | }, 259 | "tags": [] 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "using Hwloc, AbstractTrees\n", 264 | "\n", 265 | "\n", 266 | "import AbstractTrees: PreOrderDFS\n", 267 | "import Hwloc: 
hwloc_pci_class_string" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 34, 273 | "id": "cc2c286d-9822-4ad6-8d55-efd4dcc442b0", 274 | "metadata": { 275 | "tags": [] 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "distance_to_core (generic function with 1 method)" 282 | ] 283 | }, 284 | "execution_count": 34, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "function tag_subtree!(tree_node, val)\n", 291 | " for n in collect(AbstractTrees.PreOrderDFS(tree_node))\n", 292 | " n.tag = val\n", 293 | " end\n", 294 | "end\n", 295 | "\n", 296 | "function distance_to_core!(node, target_index)\n", 297 | " # shield re-entrance when iterating\n", 298 | " node.tag = 1\n", 299 | "\n", 300 | " if node.type == :PU\n", 301 | " # println(\"Checking: $(nodevalue(node).os_index)\")\n", 302 | " if nodevalue(node).os_index == target_index\n", 303 | " return true, 0\n", 304 | " end\n", 305 | " end\n", 306 | "\n", 307 | " for child in node.children\n", 308 | " if child.tag == 1\n", 309 | " continue\n", 310 | " end\n", 311 | "\n", 312 | " found, dist = distance_to_core!(child, target_index)\n", 313 | " if found\n", 314 | " return true, dist + 1\n", 315 | " end\n", 316 | " end\n", 317 | "\n", 318 | " if node.parent != nothing\n", 319 | " found, dist = distance_to_core!(node.parent, target_index)\n", 320 | " if found\n", 321 | " return true, dist + 1\n", 322 | " end\n", 323 | " end\n", 324 | "\n", 325 | " return false, typemax(Int)\n", 326 | "end\n", 327 | "\n", 328 | "function distance_to_core(root, node, target_index)\n", 329 | " tag_subtree!(root, 0) \n", 330 | " found, dist = distance_to_core!(node, target_index)\n", 331 | " tag_subtree!(root, 0) \n", 332 | " return found, dist\n", 333 | "end" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 35, 339 | "id": "ddbadb39-1998-4472-b940-09648284ad8c", 340 | "metadata": { 341 | "tags": [] 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "Dict{Tuple{Int32, Int32}, Int32} with 4 entries:\n", 348 | " (65, 0) => 1\n", 349 | " (193, 0) => 3\n", 350 | " (130, 0) => 2\n", 351 | " (3, 0) => 0" 352 | ] 353 | }, 354 | "execution_count": 35, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "get_device_attributes()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 36, 366 | "id": "00f54295-a079-474c-8028-57fcea1fa288", 367 | "metadata": { 368 | "slideshow": { 369 | "slide_type": "skip" 370 | }, 371 | "tags": [] 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "get_device_distances (generic function with 1 method)" 378 | ] 379 | }, 380 | "execution_count": 36, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "sys_devs = children(gettopology())\n", 387 | "pci_devs = PreOrderDFS(sys_devs) |> collect |> filter(x->x.type==:PCI_Device)\n", 388 | "gpu_devs = pci_devs |> filter(x->hwloc_pci_class_string(nodevalue(x).attr.class_id) == \"3D\")\n", 389 | "\n", 390 | "function get_device_distances(core)\n", 391 | " attr = get_device_attributes()\n", 392 | " dist = Dict{Int32, Int32}()\n", 393 | " dev = Dict{Int32, Int32}()\n", 394 | " for d in gpu_devs\n", 395 | " idx = attr[(nodevalue(d).attr.bus, nodevalue(d).attr.dev)]\n", 396 | " found, dev_d = distance_to_core(sys_devs, d, core)\n", 397 | " if found\n", 398 | " dist[idx] = dev_d\n", 399 | " dev[dev_d] = 
idx\n", 400 | " end\n", 401 | " end\n", 402 | " dist, dev\n", 403 | "end" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 37, 409 | "id": "d35ac271-b857-46f3-9744-a2512642c009", 410 | "metadata": { 411 | "tags": [] 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "49" 418 | ] 419 | }, 420 | "execution_count": 37, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "cpucycle_coreid()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 48, 432 | "id": "c867b0ed-85df-47c1-b85a-c99312c66430", 433 | "metadata": { 434 | "tags": [] 435 | }, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "0" 441 | ] 442 | }, 443 | "execution_count": 48, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "dist, dev = get_device_distances(cpucycle_coreid())\n", 450 | "closest_dev = dev[dev |> keys |> minimum]" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 45, 456 | "id": "e22f3df1-afef-45da-baef-8554c5f69189", 457 | "metadata": { 458 | "tags": [] 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "Dict{Int32, Int32} with 4 entries:\n", 465 | " 0 => 18\n", 466 | " 2 => 516\n", 467 | " 3 => 516\n", 468 | " 1 => 516" 469 | ] 470 | }, 471 | "execution_count": 45, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "dist" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 46, 483 | "id": "24afd17a-2308-4089-933b-5d138f555482", 484 | "metadata": { 485 | "tags": [] 486 | }, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "Dict{Int32, Int32} with 2 entries:\n", 492 | " 18 => 0\n", 493 | " 516 => 1" 494 | ] 495 | }, 496 | "execution_count": 46, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "dev" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 47, 508 | "id": "8c844ec9-bb7b-4142-899c-acac3035d841", 509 | "metadata": { 510 | "tags": [] 511 | }, 512 | "outputs": [ 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "0" 517 | ] 518 | }, 519 | "execution_count": 47, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "id": "a17f0a7a-5ced-4fa7-8c65-ee4da39d54af", 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [] 533 | } 534 | ], 535 | "metadata": { 536 | "kernelspec": { 537 | "display_name": "Julia 1.9.4", 538 | "language": "julia", 539 | "name": "julia-1.9.4" 540 | }, 541 | "language_info": { 542 | "file_extension": ".jl", 543 | "mimetype": "application/julia", 544 | "name": "julia", 545 | "version": "1.9.4" 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 5 550 | } 551 | -------------------------------------------------------------------------------- /parts/mpi/explanation/diffusion_2d_halo_exchange.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/diffusion_2d_halo_exchange.pdf -------------------------------------------------------------------------------- /parts/mpi/explanation/diffusion_2d_halo_exchange.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/diffusion_2d_halo_exchange.png -------------------------------------------------------------------------------- /parts/mpi/explanation/l8_1D_global_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/explanation/l8_1D_global_grid.png -------------------------------------------------------------------------------- /parts/mpi/get_compute_node_interactive.sh: -------------------------------------------------------------------------------- 1 | salloc --nodes 1 --cpus-per-task=1 --qos interactive --time 00:45:00 --constraint cpu --ntasks-per-node=4 --account=ntrain1 2 | -------------------------------------------------------------------------------- /parts/mpi/job_mpi_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C cpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_mpi_multinode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=4 9 | #SBATCH --ntasks=16 10 | #SBATCH --exclusive 11 | 12 | ml use /global/common/software/nersc/n9/julia/modules 13 | ml julia 14 | 15 | mpiexecjl --project=../.. julia -e 'do_save=false; include("diffusion_2d_mpi.jl");' 16 | -------------------------------------------------------------------------------- /parts/mpi/job_mpi_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C cpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_mpi_singlenode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=1 9 | #SBATCH --ntasks=4 10 | #SBATCH --exclusive 11 | 12 | ml use /global/common/software/nersc/n9/julia/modules 13 | ml julia 14 | 15 | mpiexecjl --project=../.. 
julia diffusion_2d_mpi.jl 16 | -------------------------------------------------------------------------------- /parts/mpi/solution/diffusion_2d_mpi.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - MPI 2 | using Printf 3 | using JLD2 4 | using MPI 5 | include(joinpath(@__DIR__, "../../shared.jl")) 6 | 7 | # convenience macros simply to avoid writing nested finite-difference expression 8 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / dx)) end 9 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / dy)) end 10 | 11 | function diffusion_step!(params, C2, C) 12 | (; dx, dy, dt, D) = params 13 | for iy in 1:size(C, 2)-2 14 | for ix in 1:size(C, 1)-2 15 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / dx + 16 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / dy) 17 | end 18 | end 19 | return nothing 20 | end 21 | 22 | # MPI functions 23 | @views function update_halo!(A, bufs, neighbors, comm) 24 | # dim-1 (x) 25 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2 , :]) 26 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :]) 27 | 28 | reqs = MPI.MultiRequest(4) 29 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_1, comm, reqs[1]; source=neighbors.x[1]) 30 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_2, comm, reqs[2]; source=neighbors.x[2]) 31 | 32 | (neighbors.x[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_1, comm, reqs[3]; dest=neighbors.x[1]) 33 | (neighbors.x[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_2, comm, reqs[4]; dest=neighbors.x[2]) 34 | MPI.Waitall(reqs) # blocking 35 | 36 | (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1 , :], bufs.recv_1_1) 37 | (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2) 38 | 39 | # dim-2 (y) 40 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2 ]) 41 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1]) 42 | 43 | reqs = MPI.MultiRequest(4) 44 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_1, comm, reqs[1]; source=neighbors.y[1]) 45 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_2, comm, reqs[2]; source=neighbors.y[2]) 46 | 47 | (neighbors.y[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_1, comm, reqs[3]; dest=neighbors.y[1]) 48 | (neighbors.y[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_2, comm, reqs[4]; dest=neighbors.y[2]) 49 | MPI.Waitall(reqs) # blocking 50 | 51 | (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1 ], bufs.recv_2_1) 52 | (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2) 53 | return nothing 54 | end 55 | 56 | function init_bufs(A) 57 | return (; send_1_1=zeros(size(A, 2)), send_1_2=zeros(size(A, 2)), 58 | send_2_1=zeros(size(A, 1)), send_2_2=zeros(size(A, 1)), 59 | recv_1_1=zeros(size(A, 2)), recv_1_2=zeros(size(A, 2)), 60 | recv_2_1=zeros(size(A, 1)), recv_2_2=zeros(size(A, 1))) 61 | end 62 | 63 | function run_diffusion(; ns=64, nt=100, do_save=false) 64 | MPI.Init() 65 | comm = MPI.COMM_WORLD 66 | nprocs = MPI.Comm_size(comm) 67 | dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple 68 | comm_cart = MPI.Cart_create(comm, dims) 69 | me = MPI.Comm_rank(comm_cart) 70 | coords = MPI.Cart_coords(comm_cart) |> Tuple 71 | neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1)) 72 | (me == 0) && println("nprocs = $(nprocs), dims = $dims") 73 | 74 | params = init_params_mpi(; dims, coords, ns, nt, do_save) 75 | C, C2 = 
init_arrays_mpi(params) 76 | bufs = init_bufs(C) 77 | t_tic = 0.0 78 | # time loop 79 | for it in 1:nt 80 | # time after warmup (ignore first 10 iterations) 81 | (it == 11) && (t_tic = Base.time()) 82 | # diffusion 83 | diffusion_step!(params, C2, C) 84 | update_halo!(C2, bufs, neighbors, comm_cart) 85 | C, C2 = C2, C # pointer swap 86 | end 87 | t_toc = (Base.time() - t_tic) 88 | # "master" prints performance 89 | (me == 0) && print_perf(params, t_toc) 90 | # save to (maybe) visualize later 91 | if do_save 92 | jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C = Array(C[2:end-1, 2:end-1]), lxy = (; lx=params.L, ly=params.L)) 93 | end 94 | MPI.Finalize() 95 | return nothing 96 | end 97 | 98 | # Running things... 99 | 100 | # enable save to disk by default 101 | (!@isdefined do_save) && (do_save = true) 102 | # enable execution by default 103 | (!@isdefined do_run) && (do_run = true) 104 | 105 | if do_run 106 | if !isempty(ARGS) 107 | run_diffusion(; ns=parse(Int, ARGS[1]), do_save) 108 | else 109 | run_diffusion(; ns=256, do_save) 110 | end 111 | end 112 | -------------------------------------------------------------------------------- /parts/mpi/solution/job_mpi_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C cpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_mpi_multinode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=4 9 | #SBATCH --ntasks=16 10 | #SBATCH --exclusive 11 | 12 | ml use /global/common/software/nersc/n9/julia/modules 13 | ml julia 14 | 15 | mpiexecjl --project=../../.. julia -e 'do_save=false; include("diffusion_2d_mpi.jl");' 16 | -------------------------------------------------------------------------------- /parts/mpi/solution/job_mpi_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -A ntrain1 4 | #SBATCH -C cpu 5 | #SBATCH -q regular 6 | #SBATCH --output=slurm_mpi_singlenode.out 7 | #SBATCH --time=00:05:00 8 | #SBATCH --nodes=1 9 | #SBATCH --ntasks=4 10 | #SBATCH --exclusive 11 | 12 | ml use /global/common/software/nersc/n9/julia/modules 13 | ml julia 14 | 15 | mpiexecjl --project=../../.. 
julia diffusion_2d_mpi.jl 16 | -------------------------------------------------------------------------------- /parts/mpi/solution/multinode_results.txt: -------------------------------------------------------------------------------- 1 | # 1 node, 4 MPI ranks 2 | nprocs = 4, dims = [2, 2] 3 | Time = 6.5865e+00 s, T_eff = 8.25 GB/s 4 | 5 | # 2 nodes, 8 MPI ranks 6 | nprocs = 8, dims = [4, 2] 7 | Time = 6.5964e+00 s, T_eff = 8.24 GB/s 8 | 9 | # 3 nodes, 12 MPI ranks 10 | nprocs = 12, dims = [4, 3] 11 | Time = 6.5889e+00 s, T_eff = 8.25 GB/s 12 | 13 | # 4 nodes, 16 MPI ranks 14 | nprocs = 16, dims = [4, 4] 15 | Time = 6.6004e+00 s, T_eff = 8.24 GB/s -------------------------------------------------------------------------------- /parts/mpi/solution/slurm_mpi_singlenode.out: -------------------------------------------------------------------------------- 1 | nprocs = 4, dims = [2, 2] 2 | Time = 1.2309e-02 s, T_eff = 7.67 GB/s 3 | -------------------------------------------------------------------------------- /parts/mpi/solution/visualization_before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/solution/visualization_before.png -------------------------------------------------------------------------------- /parts/mpi/solution/visualization_desired.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/mpi/solution/visualization_desired.png -------------------------------------------------------------------------------- /parts/mpi/visualize_mpi.jl: -------------------------------------------------------------------------------- 1 | # Visualisation script for the 2D MPI solver 2 | using CairoMakie 3 | using JLD2 4 | 5 | function vizme2D_mpi(nprocs) 6 | C = [] 7 | lx = ly = 0.0 8 | ip = 1 9 | for ipx in 1:nprocs[1] 10 | for ipy in 1:nprocs[2] 11 | C_loc, lxy = load("out_$(ip-1).jld2", "C", "lxy") 12 | nx_i, ny_i = size(C_loc, 1), size(C_loc, 2) 13 | ix1, iy1 = 1 + (ipx - 1) * nx_i, 1 + (ipy - 1) * ny_i 14 | if ip == 1 15 | C = zeros(nprocs[1] * nx_i, nprocs[2] * ny_i) 16 | lx, ly = lxy 17 | end 18 | C[ix1:ix1+nx_i-1, iy1:iy1+ny_i-1] .= C_loc 19 | ip += 1 20 | end 21 | end 22 | xc, yc = LinRange.(0, (lx, ly), size(C)) 23 | fig = Figure(; size=(500, 400), fontsize=14) 24 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C") 25 | hm = heatmap!(ax, xc, yc, C; colormap=:turbo, colorrange=(0, 1)) 26 | cb = Colorbar(fig[1, 1][1, 2], hm) 27 | if isinteractive() 28 | display(fig) 29 | else 30 | save("visualization.png", fig) 31 | end 32 | return 33 | end 34 | 35 | nprocs = (2, 2) # nprocs (x, y) dim 36 | vizme2D_mpi(nprocs) 37 | -------------------------------------------------------------------------------- /parts/multithreading/README.md: -------------------------------------------------------------------------------- 1 | # Diffusion 2D - Multithreading 2 | 3 | In this part, we want to use multithreading (shared-memory parallelism) to parallelize our Diffusion 2D example. 4 | 5 | The starting point is the serial loop version [`diffusion_2d_loop.jl`](./../diffusion_2d/diffusion_2d_loop.jl). The file [`diffusion_2d_threads.jl`](./diffusion_2d_threads.jl) in this folder is a slightly modified copy of this version. 
Specifically, we included the serial initialization of the arrays `C` and `C2` in the form of the function `init_arrays_threads` and left the computational kernel (`diffusion_step!`) mostly unimplemented. Note that there are a few code stubs (indicated by `TODO` comments) that you will implement in the tasks below. 6 | 7 | ## Task 1 - Multithreading `diffusion_step!` 8 | 9 | ### Part A 10 | 11 | Your first task is to take the diffusion kernel from `diffusion_2d_loop.jl` - reproduced below for your convenience - and use `@threads` to parallelize it. See the `TODO` comments inside the `diffusion_step!` function. 12 | 13 | You should implement two variants, one that uses static scheduling and another that uses dynamic scheduling. A variable `static` will be used to switch between the two cases. 14 | 15 | (To test the correctness of your implementation, you can do an "eye test" and just look at the resulting plots.) 16 | 17 | **Question:** 18 | * Should you parallelize the inner or the outer loop? 19 | * (You can try both and compare the two in terms of performance if you are unsure.) 20 | 21 | **Serial kernel from diffusion_2d_loop.jl:** 22 | ```julia 23 | for iy in 1:size(C, 2)-2 24 | for ix in 1:size(C, 1)-2 25 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds + 26 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds) 27 | end 28 | end 29 | ``` 30 | 31 | ### Part B 32 | 33 | Let's make a first rough performance comparison. Run your implementation using 8 Julia threads and using 1 Julia thread and compare the timings/`T_eff` ("strong scaling"). Perform this comparison for three values of `ns`, for example 512, 2048, and 6144. 34 | 35 | Note that you don't have to implement the other `TODO`s in the file. The code should run just fine if you've implemented `diffusion_step!`. 36 | 37 | **How to run the code?** 38 | 39 | You can either perform the rough benchmark in an interactive Julia session or use the script `job_compare_threads_serial.sh`. 40 | 41 | * Interactive: 42 | * Set `do_visualize=false`. 43 | * Use `include("diffusion_2d_threads.jl")` to run the code. 44 | 45 | * Script: 46 | * Either just run the script on the current node (`sh job_compare_threads_serial.sh`) or submit it as a job to SLURM (`sbatch job_compare_threads_serial.sh`). In the latter case, the output will end up in a file called `slurm_compare_threads_serial.out`. 47 | 48 | **Questions:** 49 | * What do you observe? 50 | * Are you happy with the performance improvement? 51 | * Consider taking the ratio of the timings (i.e. $t_{serial}$ / $t_{parallel}$) and comparing it to 8 (naively anticipating perfect scaling). 52 | 53 | ## Task 2 - Parallel initialization and thread pinning 54 | 55 | As has been stated before the hands-on, how we pin the Julia threads and how/whether we initialize the data (`C`, `C2`) in serial or parallel can heavily influence the performance of our code. Let's put this to the test! 56 | 57 | ### Part A 58 | 59 | Go ahead and parallelize the initialization of `C` and `C2` in the function `init_arrays_threads` (see the `TODO`s therein) in the same way as you've parallelized the kernel in `diffusion_step!` above. 60 | 61 | The variable `parallel_init` (`true` or `false`) is used to switch between parallel and serial initialization. Similarly, the variable `static` (`true` or `false`) is used to switch between static and dynamic scheduling. 62 | 63 | (To test the correctness of your implementation, you can do an "eye test" and just look at the resulting plots.)
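For this eye test, here is one minimal interactive sketch (assuming a Julia session started from this folder with `julia --project --threads 8`; `do_run`, `do_visualize`, `parallel_init`, and `static` are the switches defined in `diffusion_2d_threads.jl`):

```julia
# define run_diffusion without triggering the default run at the bottom of the file
do_run = false
include("diffusion_2d_threads.jl")

# parallel initialization + static scheduling, with plotting enabled for the eye test
run_diffusion(; ns=512, do_visualize=true, parallel_init=true, static=true)
```

(Setting `do_run = false` before the `include` is the same trick that the solution's `bench_threads.jl` uses.)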
64 | 65 | ### Part B 66 | 67 | Now, we want to systematically compare the performance of our code for 68 | * different combinations of `parallel_init` and `static`, 69 | * different values of `ns` (512, 2048, and 6144), and 70 | * different pinning schemes (`:cores`, `:sockets`, `:numa`) 71 | 72 | While you are more than invited to play around with these degrees of freedom in an interactive Julia session running on a **compute node**, this will likely become rather cumbersome very quickly. 73 | (We still invite you to play around a little bit with ThreadPinning's `threadinfo` and `pinthreads`!) 74 | 75 | To simplify things, we've prepared the script `job_bench_threads.sh` for you, which you can simply submit to SLURM (`sbatch job_bench_threads.sh`). The output will end up in the file `slurm_bench_threads.out`. 76 | 77 | **Questions:** 78 | * First, compare the `static=true` results with increasing `ns` (that is, ignore the dynamic scheduling runs for now). Can you qualitatively explain the performance difference/similarity between the three pinning strategies? And maybe also why it changes with increasing `ns`? 79 | * Why does dynamic scheduling (most of the time) give worse performance than static scheduling? 80 | * The output also shows single-threaded timings. Consider the timing ratio ($t_{serial}$ / $t_{parallel}$) for the best performing cases. Is it an improvement over what you found above (i.e. closer to a factor of 8)? 81 | -------------------------------------------------------------------------------- /parts/multithreading/diffusion_2d_threads.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - multithreading 2 | using Printf 3 | using CairoMakie 4 | include(joinpath(@__DIR__, "../shared.jl")) 5 | 6 | function init_arrays_threads(params) 7 | (; ns, cs, parallel_init, static) = params 8 | C = Matrix{Float64}(undef, ns, ns) 9 | C2 = Matrix{Float64}(undef, ns, ns) 10 | # 11 | # !! TODO !! 12 | # 13 | # Below, you see how the arrays C and C2 are initialized without multithreading. 14 | # Based off of this serial implementation implement two multithreaded variants 15 | # that use static or dynamic scheduling, respectively (see "TODO..." below). 16 | # 17 | if parallel_init 18 | # parallel initialization 19 | if static 20 | # static scheduling 21 | # TODO... 22 | else 23 | # dynamic scheduling 24 | # TODO... 25 | end 26 | else 27 | # serial initialization 28 | for iy in axes(C, 2) 29 | for ix in axes(C, 1) 30 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2) 31 | C2[ix, iy] = C[ix, iy] # element-wise copy 32 | end 33 | end 34 | end 35 | return C, C2 36 | end 37 | 38 | # to avoid writing nested finite-difference expression 39 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end 40 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end 41 | 42 | function diffusion_step!(params, C2, C) 43 | (; ds, dt, D, static) = params 44 | # 45 | # !! TODO !! 46 | # 47 | # We want to multithread the diffusion step (our computational kernel). 48 | # Based off of the serial kernel (see README.md or diffusion_2d_loop.jl) implement 49 | # two multithreaded variants that use static or dynamic scheduling, respectively 50 | # (see "TODO..." below). 51 | # 52 | if static 53 | # static scheduling 54 | # TODO... 55 | else 56 | # dynamic scheduling 57 | # TODO... 
58 | end 59 | return nothing 60 | end 61 | 62 | function run_diffusion(; ns=64, nt=100, do_visualize=false, parallel_init=false, static=false) 63 | params = init_params(; ns, nt, do_visualize, parallel_init, static) 64 | C, C2 = init_arrays_threads(params) 65 | fig, plt = maybe_init_visualization(params, C) 66 | t_tic = 0.0 67 | # time loop 68 | for it in 1:nt 69 | # time after warmup (ignore first 10 iterations) 70 | (it == 11) && (t_tic = Base.time()) 71 | # diffusion 72 | diffusion_step!(params, C2, C) 73 | C, C2 = C2, C # pointer swap 74 | # visualization 75 | maybe_update_visualization(params, fig, plt, C, it) 76 | end 77 | t_toc = (Base.time() - t_tic) 78 | print_perf(params, t_toc) 79 | return nothing 80 | end 81 | 82 | # Running things... 83 | 84 | # enable visualization by default 85 | (!@isdefined do_visualize) && (do_visualize = true) 86 | # enable execution by default 87 | (!@isdefined do_run) && (do_run = true) 88 | 89 | if do_run 90 | if !isempty(ARGS) 91 | run_diffusion(; ns=parse(Int, ARGS[1]), do_visualize) 92 | else 93 | run_diffusion(; ns=256, do_visualize) 94 | end 95 | end 96 | -------------------------------------------------------------------------------- /parts/multithreading/imgs/stack_heap_threads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JuliaHPC/juliacon24-hpcworkshop/2c6e764bcf94eebd549785b69264215b1012eaf2/parts/multithreading/imgs/stack_heap_threads.png -------------------------------------------------------------------------------- /parts/multithreading/job_bench_threads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=256 6 | #SBATCH --constraint=cpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_bench_threads.out 9 | 10 | # Load julia 11 | ml use /global/common/software/nersc/n9/julia/modules 12 | ml julia 13 | 14 | for i in 512 2048 6144 15 | do 16 | echo -e "\n\n#### Run $i" 17 | 18 | echo -e "-- single threaded" 19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 20 | echo -e "" 21 | 22 | julia --project --threads 8 bench_threads.jl $i # benchmark multithreaded variants 23 | done -------------------------------------------------------------------------------- /parts/multithreading/job_compare_threads_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=256 6 | #SBATCH --constraint=cpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_compare_threads_serial.out 9 | 10 | # Load julia 11 | ml use /global/common/software/nersc/n9/julia/modules 12 | ml julia 13 | 14 | for i in 512 2048 6144 15 | do 16 | echo -e "\n\n#### Run $i" 17 | 18 | echo -e "-- single threaded" 19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 20 | echo -e "" 21 | 22 | echo -e "-- multithreaded (8 threads)" 23 | julia --project --threads 8 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 24 | echo -e "" 25 | done -------------------------------------------------------------------------------- /parts/multithreading/multithreading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 
5 | "metadata": {}, 6 | "source": [ 7 | "# Multithreading (shared-memory parallelism)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Overview" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "* **Running Julia with multiple threads**\n", 22 | "\n", 23 | "* Where are the threads running?\n", 24 | " * ThreadPinning.jl\n", 25 | "\n", 26 | "* **Task-based multithreading**\n", 27 | " * dynamic and static scheduling\n", 28 | "\n", 29 | "* **\"Data pinning\"**\n", 30 | " * NUMA \"first-touch\" policy" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Running Julia with multiple threads" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "By default, Julia starts with a single *user thread*. We must tell it explicitly to start multiple user threads.\n", 45 | "\n", 46 | "* Environment variable: `export JULIA_NUM_THREADS=8`\n", 47 | "\n", 48 | "* Command line argument: `julia -t 8` or `julia --threads 8`\n", 49 | "\n", 50 | "* **VS Code:** Add `\"julia.NumThreads\": 8` to workspace settings (`Preferences: Open Workspace Settings (JSON)`)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "**It is currently not really possible to change the number of threads at runtime!**" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "Threads.nthreads()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Where are the threads running?" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "[ThreadPinning.jl](https://github.com/carstenbauer/ThreadPinning.jl) is the best tool for visualizing and controlling thread placement in Julia. (Disclaimer: I'm the main author 😉)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "using ThreadPinning\n", 90 | "\n", 91 | "threadinfo()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### Pinning threads (i.e. controling where they are running)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "#### Why?\n", 106 | "\n", 107 | "* To avoid double occupancy of CPU cores.\n", 108 | "\n", 109 | "* To reduce noise in benchmarks.\n", 110 | "\n", 111 | "* To address the complexity of the system topology, e.g. to use specific/all memory domains (NUMA).\n", 112 | "\n", 113 | "* ..." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "#### How?\n", 121 | "\n", 122 | "`pinthreads(strategy)`\n", 123 | "* `:cputhreads` pin to CPU threads (incl. \"hypterthreads\") one after another\n", 124 | "* `:cores:` pin to CPU cores one after another\n", 125 | "* `:numa:` alternate between NUMA domains (round-robin)\n", 126 | "* `:sockets:` alternate between sockets (round-robin)\n", 127 | "* `:affinitymask`: pin according to an external affinity mask (e.g. set by SLURM)\n", 128 | "\n", 129 | "(More? 
See my talk at JuliaCon2023 @ MIT: https://youtu.be/6Whc9XtlCC0)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "pinthreads(:cores) # try :cores or :sockets or :random\n", 139 | "threadinfo()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "pinthreads(:numa)\n", 149 | "threadinfo(; groupby=:numa)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Memory domains (NUMA)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "NUMA = **n**on-**u**niform **m**emory **a**ccess" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "One (of two) AMD Milan CPUs in a Perlmutter node:\n", 171 | "\n", 172 | "\n", 173 | "\n", 174 | "**Image source:** [AMD, High Performance Computing (HPC) Tuning Guide for AMD EPYCTM 7003 Series Processors](https://www.amd.com/system/files/documents/high-performance-computing-tuning-guide-amd-epyc7003-series-processors.pdf)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# Other useful options for querying system information\n", 184 | "\n", 185 | "# using CpuId\n", 186 | "# cpuinfo()\n", 187 | "\n", 188 | "# using Hwloc\n", 189 | "# topology_graphical()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Task-based multithreading\n", 197 | "\n", 198 | "
\n", 199 | "\n", 200 | "
\n", 201 | "\n", 202 | "The user doesn't control threads but tasks that get scheduled on threads.\n", 203 | "\n", 204 | "**Advantages:** 👍\n", 205 | "* high-level abstraction\n", 206 | "* nestability / composability\n", 207 | "\n", 208 | "**Disadvantages:** 👎\n", 209 | "* scheduling overhead\n", 210 | "* uncertain and potentially suboptimal task → thread assignment\n", 211 | " * scheduler has limited information (e.g. about the system topology)\n", 212 | " * task migration" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Dynamic scheduling: `@threads :dynamic for ... in ...`" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "* **Splits up the iteration space into `nthreads()` contiguous chunks**\n", 227 | "\n", 228 | "* Creates a task for each of them and hands them off to the dynamic scheduler (essentially `@spawn`s each chunk)." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "using Base.Threads: @threads, threadid, nthreads" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# implicitly creates nthreads() many tasks, each of which handles 2 iterations\n", 247 | "@threads :dynamic for i in 1:2*nthreads()\n", 248 | " println(\"Running iteration \", i, \" on thread \", threadid())\n", 249 | "end" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "#### Static scheduling: `@threads :static for ... in ...`" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "* `:static` option to opt-out of dynamic scheduling\n", 264 | "\n", 265 | "* Statically **\"pins\" tasks to threads**\n", 266 | " * task 1 → thread 1, task 2 → thread 2, and so on.\n", 267 | "\n", 268 | "Pro 👍\n", 269 | " * **fixed task-thread mapping** (no task migration)\n", 270 | " * very little overhead\n", 271 | " \n", 272 | "Con 👎\n", 273 | " * not composable / nestable" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "@threads :static for i in 1:2*nthreads()\n", 283 | " println(\"Running iteration \", i, \" on thread \", threadid());\n", 284 | "end" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "(For `@threads :static`, every thread handles precisely two iterations!)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## \"Data pinning\" (NUMA revisited)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Implicitly → **NUMA \"first-touch\" policy**\n", 306 | "\n", 307 | "Explicitly → [NUMA.jl](https://github.com/JuliaPerf/NUMA.jl)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "### NUMA \"first-touch\" policy\n", 315 | "\n", 316 | "Data is (typically) placed in the **NUMA domain that is closest to the thread/CPU core** that is \"touching\" the data.\n", 317 | "\n", 318 | "```julia\n", 319 | "x = Vector{Float64}(undef, 10) # allocation, no \"touch\" yet\n", 320 | "rand!(x) # first touch == first write\n", 321 | "```" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | 
"metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "pinthreads(:numa)\n", 331 | "threadinfo(; groupby=:numa)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "### Array initialization: serial vs parallel" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "**Different parts of an array can be placed in different NUMA domains!**\n", 346 | "\n", 347 | "Data is managed in terms of memory pages (\"unit of data\")." 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "#### Serial\n", 355 | "\n", 356 | "```julia\n", 357 | "x = Vector{Float64}(undef, 100) # allocation, no \"touch\" yet\n", 358 | "rand!(x) # first touch == first write\n", 359 | "```\n", 360 | "\n", 361 | "The location of the \"main\" thread determines the NUMA domain of the entire array!\n", 362 | "\n", 363 | "If we later access the data in parallel, all threads must read from the same NUMA domain → competition for the memory bus → potential bottleneck." 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "#### Parallel\n", 371 | "\n", 372 | "```julia\n", 373 | "pinthreads(:numa) # pin threads to different NUMA domains\n", 374 | "x = Vector{Float64}(undef, 100) # allocation, no \"touch\" yet\n", 375 | "@threads :static for i in eachindex(x) # parallel iteration\n", 376 | " x[i] = rand() # first touch == first write\n", 377 | "end\n", 378 | "```\n", 379 | "\n", 380 | "Different threads - running in different NUMA regions - touch different parts of the array → the latter will (likely) be placed in different NUMA domains.\n", 381 | "\n", 382 | "If we later access the data in parallel, all threads can read their part of the array from their local NUMA domain → no bottleneck." 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "Crucial point: **How you initialize your data influences the performance of your computational kernel!** (non-local effect)\n", 390 | "\n", 391 | "**→ Hands-on** (see [README.md](README.md))" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Julia 1.10.4", 398 | "language": "julia", 399 | "name": "julia-1.10" 400 | }, 401 | "language_info": { 402 | "file_extension": ".jl", 403 | "mimetype": "application/julia", 404 | "name": "julia", 405 | "version": "1.10.4" 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 2 410 | } 411 | -------------------------------------------------------------------------------- /parts/multithreading/solution/bench_threads.jl: -------------------------------------------------------------------------------- 1 | # Script for benchmarking and comparing the multithreaded variants. 2 | # - Supposed to be run on an entire (exclusive) compute node. 3 | # - Takes `ns` as (the only) input argument. 
4 | # 5 | using ThreadPinning 6 | 7 | do_visualize = false 8 | do_run = false 9 | codefile = joinpath(@__DIR__, "diffusion_2d_threads.jl") 10 | include(codefile) 11 | 12 | ns = parse(Int, ARGS[1]) 13 | nt = 100 14 | 15 | println("-- ns=$ns, nt=$nt, SERIAL initialization, STATIC scheduling") 16 | for pin in (:cores, :sockets, :numa) 17 | println("pinthreads($pin)") 18 | pinthreads(pin) 19 | run_diffusion(; ns, nt, do_visualize, parallel_init=false, static=true) 20 | end 21 | 22 | println("\n-- ns=$ns, nt=$nt, PARALLEL initialization, STATIC scheduling") 23 | for pin in (:cores, :sockets, :numa) 24 | println("pinthreads($pin)") 25 | pinthreads(pin) 26 | run_diffusion(; ns, nt, do_visualize, parallel_init=true, static=true) 27 | end 28 | 29 | println("\n-- ns=$ns, nt=$nt, PARALLEL initialization, DYNAMIC scheduling") 30 | for pin in (:cores, :sockets, :numa) 31 | println("pinthreads($pin)") 32 | pinthreads(pin) 33 | run_diffusion(; ns, nt, do_visualize, parallel_init=true, static=false) 34 | end 35 | -------------------------------------------------------------------------------- /parts/multithreading/solution/diffusion_2d_threads.jl: -------------------------------------------------------------------------------- 1 | # 2D linear diffusion solver - multithreading 2 | using Printf 3 | using CairoMakie 4 | include(joinpath(@__DIR__, "../../shared.jl")) 5 | 6 | function init_arrays_threads(params) 7 | (; ns, cs, parallel_init, static) = params 8 | C = Matrix{Float64}(undef, ns, ns) 9 | C2 = Matrix{Float64}(undef, ns, ns) 10 | if parallel_init 11 | # parallel initialization 12 | if static 13 | # static scheduling 14 | Threads.@threads :static for iy in axes(C, 2) 15 | for ix in axes(C, 1) 16 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2) 17 | C2[ix, iy] = C[ix, iy] # element-wise copy 18 | end 19 | end 20 | else 21 | # dynamic scheduling 22 | Threads.@threads :dynamic for iy in axes(C, 2) 23 | for ix in axes(C, 1) 24 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2) 25 | C2[ix, iy] = C[ix, iy] # element-wise copy 26 | end 27 | end 28 | end 29 | else 30 | # serial initialization 31 | for iy in axes(C, 2) 32 | for ix in axes(C, 1) 33 | C[ix, iy] = exp(- cs[ix]^2 - cs[iy]^2) 34 | C2[ix, iy] = C[ix, iy] # element-wise copy 35 | end 36 | end 37 | end 38 | return C, C2 39 | end 40 | 41 | # to avoid writing nested finite-difference expression 42 | macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) / ds)) end 43 | macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) / ds)) end 44 | 45 | function diffusion_step!(params, C2, C) 46 | (; ds, dt, D, static) = params 47 | if static 48 | # static scheduling 49 | Threads.@threads :static for iy in 1:size(C, 2)-2 50 | for ix in 1:size(C, 1)-2 51 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds + 52 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds) 53 | end 54 | end 55 | else 56 | # dynamic scheduling 57 | Threads.@threads :dynamic for iy in 1:size(C, 2)-2 58 | for ix in 1:size(C, 1)-2 59 | @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix+1, iy+1) - @qx(ix, iy+1)) / ds + 60 | (@qy(ix+1, iy+1) - @qy(ix+1, iy)) / ds) 61 | end 62 | end 63 | end 64 | return nothing 65 | end 66 | 67 | function run_diffusion(; ns=64, nt=100, do_visualize=false, parallel_init=false, static=false) 68 | params = init_params(; ns, nt, do_visualize, parallel_init, static) 69 | C, C2 = init_arrays_threads(params) 70 | fig, plt = maybe_init_visualization(params, C) 71 | t_tic = 0.0 72 | # time loop 73 | for it in 1:nt 74 | # time after warmup (ignore first 
10 iterations) 75 | (it == 11) && (t_tic = Base.time()) 76 | # diffusion 77 | diffusion_step!(params, C2, C) 78 | C, C2 = C2, C # pointer swap 79 | # visualization 80 | maybe_update_visualization(params, fig, plt, C, it) 81 | end 82 | t_toc = (Base.time() - t_tic) 83 | print_perf(params, t_toc) 84 | return nothing 85 | end 86 | 87 | # Running things... 88 | 89 | # enable visualization by default 90 | (!@isdefined do_visualize) && (do_visualize = true) 91 | # enable execution by default 92 | (!@isdefined do_run) && (do_run = true) 93 | 94 | if do_run 95 | if !isempty(ARGS) 96 | run_diffusion(; ns=parse(Int, ARGS[1]), do_visualize) 97 | else 98 | run_diffusion(; ns=256, do_visualize) 99 | end 100 | end 101 | -------------------------------------------------------------------------------- /parts/multithreading/solution/job_bench_threads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=256 6 | #SBATCH --constraint=cpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_bench_threads.out 9 | 10 | # Load julia 11 | ml use /global/common/software/nersc/n9/julia/modules 12 | ml julia 13 | 14 | for i in 512 2048 6144 15 | do 16 | echo -e "\n\n#### Run $i" 17 | 18 | echo -e "-- single threaded" 19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 20 | echo -e "" 21 | 22 | julia --project --threads 8 bench_threads.jl $i # benchmark multithreaded variants 23 | done -------------------------------------------------------------------------------- /parts/multithreading/solution/job_compare_threads_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=00:05:00 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=256 6 | #SBATCH --constraint=cpu 7 | #SBATCH --account=ntrain1 8 | #SBATCH --output=slurm_compare_threads_serial.out 9 | 10 | # Load julia 11 | ml use /global/common/software/nersc/n9/julia/modules 12 | ml julia 13 | 14 | for i in 512 2048 6144 15 | do 16 | echo -e "\n\n#### Run $i" 17 | 18 | echo -e "-- single threaded" 19 | julia --project --threads 1 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 20 | echo -e "" 21 | 22 | echo -e "-- multithreaded (8 threads)" 23 | julia --project --threads 8 -e 'do_visualize=false; include("diffusion_2d_threads.jl")' $i 24 | echo -e "" 25 | done -------------------------------------------------------------------------------- /parts/multithreading/solution/slurm_bench_threads.out: -------------------------------------------------------------------------------- 1 | 2 | 3 | #### Run 512 4 | -- single threaded 5 | Time = 4.6068e-02 s, T_eff = 8.19 GB/s 6 | 7 | -- ns=512, nt=100, SERIAL initialization, STATIC scheduling 8 | pinthreads(cores) 9 | Time = 6.8419e-03 s, T_eff = 55.17 GB/s 10 | pinthreads(sockets) 11 | Time = 6.9280e-03 s, T_eff = 54.49 GB/s 12 | pinthreads(numa) 13 | Time = 7.0949e-03 s, T_eff = 53.21 GB/s 14 | 15 | -- ns=512, nt=100, PARALLEL initialization, STATIC scheduling 16 | pinthreads(cores) 17 | Time = 6.8409e-03 s, T_eff = 55.18 GB/s 18 | pinthreads(sockets) 19 | Time = 6.7821e-03 s, T_eff = 55.66 GB/s 20 | pinthreads(numa) 21 | Time = 7.6101e-03 s, T_eff = 49.60 GB/s 22 | 23 | -- ns=512, nt=100, PARALLEL initialization, DYNAMIC scheduling 24 | pinthreads(cores) 25 | Time = 6.6450e-03 s, T_eff = 56.81 GB/s 26 | 
pinthreads(sockets) 27 | Time = 1.1217e-02 s, T_eff = 33.65 GB/s 28 | pinthreads(numa) 29 | Time = 9.5861e-03 s, T_eff = 39.38 GB/s 30 | 31 | 32 | #### Run 2048 33 | -- single threaded 34 | Time = 7.3495e-01 s, T_eff = 8.22 GB/s 35 | 36 | -- ns=2048, nt=100, SERIAL initialization, STATIC scheduling 37 | pinthreads(cores) 38 | Time = 1.4753e-01 s, T_eff = 40.94 GB/s 39 | pinthreads(sockets) 40 | Time = 9.4935e-02 s, T_eff = 63.62 GB/s 41 | pinthreads(numa) 42 | Time = 9.5550e-02 s, T_eff = 63.21 GB/s 43 | 44 | -- ns=2048, nt=100, PARALLEL initialization, STATIC scheduling 45 | pinthreads(cores) 46 | Time = 1.7262e-01 s, T_eff = 34.99 GB/s 47 | pinthreads(sockets) 48 | Time = 1.0453e-01 s, T_eff = 57.78 GB/s 49 | pinthreads(numa) 50 | Time = 1.0394e-01 s, T_eff = 58.11 GB/s 51 | 52 | -- ns=2048, nt=100, PARALLEL initialization, DYNAMIC scheduling 53 | pinthreads(cores) 54 | Time = 1.4637e-01 s, T_eff = 41.26 GB/s 55 | pinthreads(sockets) 56 | Time = 2.4384e-01 s, T_eff = 24.77 GB/s 57 | pinthreads(numa) 58 | Time = 1.2115e-01 s, T_eff = 49.85 GB/s 59 | 60 | 61 | #### Run 6144 62 | -- single threaded 63 | Time = 6.5916e+00 s, T_eff = 8.25 GB/s 64 | 65 | -- ns=6144, nt=100, SERIAL initialization, STATIC scheduling 66 | pinthreads(cores) 67 | Time = 2.1146e+00 s, T_eff = 25.71 GB/s 68 | pinthreads(sockets) 69 | Time = 1.9464e+00 s, T_eff = 27.93 GB/s 70 | pinthreads(numa) 71 | Time = 1.5678e+00 s, T_eff = 34.67 GB/s 72 | 73 | -- ns=6144, nt=100, PARALLEL initialization, STATIC scheduling 74 | pinthreads(cores) 75 | Time = 2.1160e+00 s, T_eff = 25.69 GB/s 76 | pinthreads(sockets) 77 | Time = 9.7413e-01 s, T_eff = 55.80 GB/s 78 | pinthreads(numa) 79 | Time = 8.2886e-01 s, T_eff = 65.58 GB/s 80 | 81 | -- ns=6144, nt=100, PARALLEL initialization, DYNAMIC scheduling 82 | pinthreads(cores) 83 | Time = 2.1081e+00 s, T_eff = 25.79 GB/s 84 | pinthreads(sockets) 85 | Time = 1.5051e+00 s, T_eff = 36.11 GB/s 86 | pinthreads(numa) 87 | Time = 1.6027e+00 s, T_eff = 33.92 GB/s 88 | -------------------------------------------------------------------------------- /parts/multithreading/solution/slurm_compare_threads_serial.out: -------------------------------------------------------------------------------- 1 | 2 | 3 | #### Run 512 4 | -- single threaded 5 | Time = 4.6421e-02 s, T_eff = 8.13 GB/s 6 | 7 | -- multithreaded (8 threads) 8 | Time = 1.6735e-02 s, T_eff = 22.56 GB/s 9 | 10 | 11 | 12 | #### Run 2048 13 | -- single threaded 14 | Time = 7.3168e-01 s, T_eff = 8.25 GB/s 15 | 16 | -- multithreaded (8 threads) 17 | Time = 5.6353e-01 s, T_eff = 10.72 GB/s 18 | 19 | 20 | 21 | #### Run 6144 22 | -- single threaded 23 | Time = 6.5959e+00 s, T_eff = 8.24 GB/s 24 | 25 | -- multithreaded (8 threads) 26 | Time = 3.2809e+00 s, T_eff = 16.57 GB/s 27 | 28 | -------------------------------------------------------------------------------- /parts/shared.jl: -------------------------------------------------------------------------------- 1 | ## PARAMETER INITIALIZATION 2 | function init_params(; ns=64, nt=100, kwargs...) 3 | L = 10.0 # physical domain length 4 | D = 1.0 # diffusion coefficient 5 | ds = L / ns # grid spacing 6 | dt = ds^2 / D / 8.2 # time step 7 | cs = range(start=ds / 2, stop=L - ds / 2, length=ns) .- 0.5 * L # vector of coord points 8 | nout = floor(Int, nt / 5) # plotting frequency 9 | return (; L, D, ns, nt, ds, dt, cs, nout, kwargs...) 10 | end 11 | 12 | function init_params_mpi(; dims, coords, ns=64, nt=100, kwargs...) 
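    # Note: ns is the *local* (per-rank) grid size, including the boundary/halo cells; each rank contributes ns-2 interior points, hence the global sizes below.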
13 | L = 10.0 # physical domain length 14 | D = 1.0 # diffusion coefficient 15 | nx_g = dims[1] * (ns - 2) + 2 # global number of grid points along dim 1 16 | ny_g = dims[2] * (ns - 2) + 2 # global number of grid points along dim 2 17 | dx = L / nx_g # grid spacing 18 | dy = L / ny_g # grid spacing 19 | dt = min(dx, dy)^2 / D / 8.2 # time step 20 | x0 = coords[1] * (ns - 2) * dx # coords shift to get global coords on local process 21 | y0 = coords[2] * (ns - 2) * dy # coords shift to get global coords on local process 22 | xcs = LinRange(x0 + dx / 2, x0 + ns * dx - dx / 2, ns) .- 0.5 .* L # local vector of global coord points 23 | ycs = LinRange(y0 + dy / 2, y0 + ns * dy - dy / 2, ns) .- 0.5 .* L # local vector of global coord points 24 | return (; L, D, ns, nt, dx, dy, dt, xcs, ycs, kwargs...) 25 | end 26 | 27 | function init_params_gpu(; ns=64, nt=100, kwargs...) 28 | L = 10.0 # physical domain length 29 | D = 1.0 # diffusion coefficient 30 | ds = L / ns # grid spacing 31 | dt = ds^2 / D / 8.2 # time step 32 | cs = range(start=ds / 2, stop=L - ds / 2, length=ns) .- 0.5 * L # vector of coord points 33 | nout = floor(Int, nt / 5) # plotting frequency 34 | nthreads = 32, 8 # number of threads per block 35 | nblocks = cld.(ns, nthreads) # number of blocks 36 | return (; L, D, ns, nt, ds, dt, cs, nout, nthreads, nblocks, kwargs...) 37 | end 38 | 39 | function init_params_gpu_mpi(; dims, coords, ns=64, nt=100, kwargs...) 40 | L = 10.0 # physical domain length 41 | D = 1.0 # diffusion coefficient 42 | nx_g = dims[1] * (ns - 2) + 2 # global number of grid points along dim 1 43 | ny_g = dims[2] * (ns - 2) + 2 # global number of grid points along dim 2 44 | dx = L / nx_g # grid spacing 45 | dy = L / ny_g # grid spacing 46 | dt = min(dx, dy)^2 / D / 8.2 # time step 47 | x0 = coords[1] * (ns - 2) * dx # coords shift to get global coords on local process 48 | y0 = coords[2] * (ns - 2) * dy # coords shift to get global coords on local process 49 | xcs = LinRange(x0 + dx / 2, x0 + ns * dx - dx / 2, ns) .- 0.5 * L # local vector of global coord points 50 | ycs = LinRange(y0 + dy / 2, y0 + ns * dy - dy / 2, ns) .- 0.5 * L # local vector of global coord points 51 | nthreads = 32, 8 # number of threads per block 52 | nblocks = cld.(ns, nthreads) # number of blocks 53 | return (; L, D, ns, nt, dx, dy, dt, xcs, ycs, nthreads, nblocks, kwargs...) 54 | end 55 | 56 | ## ARRAY INITIALIZATION 57 | function init_arrays_with_flux(params) 58 | (; cs, ns) = params 59 | C = @. exp(-cs^2 - (cs')^2) 60 | qx = zeros(ns - 1, ns - 2) 61 | qy = zeros(ns - 2, ns - 1) 62 | return C, qx, qy 63 | end 64 | 65 | function init_arrays(params) 66 | (; cs) = params 67 | C = @. exp(-cs^2 - (cs')^2) 68 | C2 = copy(C) 69 | return C, C2 70 | end 71 | 72 | function init_arrays_mpi(params) 73 | (; xcs, ycs) = params 74 | C = @. exp(-xcs^2 - (ycs')^2) 75 | C2 = copy(C) 76 | return C, C2 77 | end 78 | 79 | function init_arrays_gpu(params) 80 | (; cs) = params 81 | C = CuArray(@. exp(-cs^2 - (cs')^2)) 82 | C2 = copy(C) 83 | return C, C2 84 | end 85 | 86 | function init_arrays_gpu_mpi(params) 87 | (; xcs, ycs) = params 88 | C = CuArray(@. 
exp(-xcs^2 - (ycs')^2)) 89 | C2 = copy(C) 90 | return C, C2 91 | end 92 | 93 | ## VISUALIZATION & PRINTING 94 | function maybe_init_visualization(params, C) 95 | if params.do_visualize 96 | fig = Figure(; size=(500, 400), fontsize=14) 97 | ax = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), title="C") 98 | plt = heatmap!(ax, params.cs, params.cs, Array(C); colormap=:turbo, colorrange=(0, 1)) 99 | cb = Colorbar(fig[1, 1][1, 2], plt) 100 | display(fig) 101 | return fig, plt 102 | end 103 | return nothing, nothing 104 | end 105 | 106 | function maybe_update_visualization(params, fig, plt, C, it) 107 | if params.do_visualize && (it % params.nout == 0) 108 | plt[3] = Array(C) 109 | display(fig) 110 | end 111 | return nothing 112 | end 113 | 114 | function print_perf(params, t_toc) 115 | (; ns, nt) = params 116 | @printf("Time = %1.4e s, T_eff = %1.2f GB/s \n", t_toc, round((2 / 1e9 * ns^2 * sizeof(Float64)) / (t_toc / (nt - 10)), sigdigits=6)) 117 | return nothing 118 | end 119 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | #= 3 | $(dirname "$0")/julia_wrapper.sh $0 4 | exit 5 | # =# 6 | 7 | @info("Preparing .bashrc") 8 | bashrc = joinpath(ENV["HOME"], ".bashrc") 9 | str = """\n 10 | # --- JULIACON24-HPCWORKSHOP --- 11 | export JULIA_DEPOT_PATH=\$SCRATCH/.julia:/global/common/software/ntrain1/.julia 12 | export PATH=\$SCRATCH/.julia/bin:\$PATH 13 | # auto-load the Julia module 14 | ml use /global/common/software/nersc/n9/julia/modules 15 | ml julia\n 16 | """ 17 | open(bashrc; append=true) do f 18 | write(f, str) 19 | end 20 | @info("Done!") 21 | 22 | @info("Instantiating Julia environment") 23 | empty!(DEPOT_PATH) 24 | push!(DEPOT_PATH, joinpath(ENV["SCRATCH"], ".julia")) 25 | push!(DEPOT_PATH, "/global/common/software/ntrain1/.julia") 26 | using Pkg 27 | Pkg.activate(@__DIR__) 28 | Pkg.instantiate() 29 | @info("Done!") 30 | 31 | using MPI 32 | MPI.install_mpiexecjl(force=true) 33 | 34 | @info("Installing Jupyter kernel") 35 | Pkg.build("IJulia") # to be safe 36 | using IJulia 37 | IJulia.installkernel("JuliaCon24 HPC Workshop"; env=Dict("JULIA_NUM_THREADS" => "8", "JULIA_PROJECT" => @__DIR__)) 38 | @info("Done!") 39 | --------------------------------------------------------------------------------