├── .gitignore ├── Benchmarks ├── .DS_Store ├── brdw-pascal-ofed40 │ ├── cpuflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ ├── hpgmg-ASYNC-s7-p2.txt │ │ └── out_cpuflusher.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── nicflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ ├── hpgmg-ASYNC-s7-p2.txt │ │ └── out_nicflusher.txt │ ├── noflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ └── hpgmg-ASYNC-s7-p2.txt │ ├── out.txt │ └── out_GPU.txt ├── brdw-pascal-ofed42 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── out_brdw42.txt │ └── out_brdw42_ki.txt ├── ivys23_ofed34 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ └── out_bench.txt ├── ivys23_ofed42 │ ├── 
hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── ivy23_ofed42_ki │ │ ├── hpgmg-GPU-initiated-s4-p2.txt │ │ ├── hpgmg-GPU-initiated-s5-p2.txt │ │ ├── hpgmg-GPU-initiated-s6-p2.txt │ │ ├── hpgmg-GPU-initiated-s7-p2.txt │ │ └── out_ivy23_ki.txt │ └── out_ofed42.txt ├── p9_pwr05 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ └── out_pwr09.txt ├── psg-benchmarks_ofed40 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ ├── hpgmg-ASYNC-s7-p8.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s4-p4.txt │ ├── hpgmg-COMM-s4-p8.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s5-p4.txt │ ├── hpgmg-COMM-s5-p8.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s6-p4.txt │ ├── hpgmg-COMM-s6-p8.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-COMM-s7-p4.txt │ ├── hpgmg-COMM-s7-p8.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── 
hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ ├── hpgmg-MPI-s7-p8.txt │ └── out_ofed40.txt ├── psg-benchmarks_ofed42 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ ├── hpgmg-ASYNC-s7-p8.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ ├── hpgmg-MPI-s7-p8.txt │ └── out_psg_ofed42.txt └── wilkes1-dec2016 │ ├── Async1 │ ├── hpgmg-ASYNC-s4-p16.txt │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p16.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── 
hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p16.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p16.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ └── hpgmg-ASYNC-s7-p8.txt │ ├── Async2 │ ├── hpgmg-ASYNC-s4-p16.txt │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p24.txt │ ├── hpgmg-ASYNC-s4-p32.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p16.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p16.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p16.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ └── hpgmg-ASYNC-s7-p8.txt │ ├── GPU │ ├── hpgmg-GPU-initiated-s4-p16.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── hpgmg-GPU-initiated-s5-p16.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p16.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p16.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ └── out.txt │ ├── Sync-mine │ ├── hpgmg-MPI-s4-p16.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p24.txt │ ├── hpgmg-MPI-s4-p32.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p16.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p16.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p16.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ └── hpgmg-MPI-s7-p8.txt │ ├── Sync-original │ ├── hpgmg-MPI-s4-p16.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p24.txt │ ├── 
hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p16.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p24.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p16.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p24.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p16.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p24.txt │ ├── hpgmg-MPI-s7-p4.txt │ └── hpgmg-MPI-s7-p8.txt │ └── hpgmg_MPI_9sept.txt ├── HPGMG_Async_manuscript.pdf ├── LICENSE ├── Makefile ├── README.md ├── base.mk ├── build.sh ├── build_titan.sh ├── configure ├── docs ├── HPGMG-logo2.pdf ├── HPGMG-logo2.png ├── HPGMG-logo2.pptx ├── hpgmg-template.pptx ├── ppt │ └── ISC2014.pptx ├── static │ ├── appendix.sty │ ├── comment.sty │ ├── elsart.cls │ ├── hpgmg.tex │ └── thebib.bib └── whitepapers │ ├── hpgmg.bib │ └── hpgmg_intro.tex ├── finite-element ├── fefas-align.h ├── fefas-test.c ├── fefas.c ├── fefas.h ├── fmg.c ├── grid.c ├── hpgmg-analyze.py ├── local.mk ├── memusage.c ├── op │ ├── fefas-op.h │ ├── genregister.py │ ├── local.mk │ ├── op-poisson-qpx.c │ ├── op-poisson1.c │ └── op.c ├── pointwise.h ├── sampler.c ├── sharness │ ├── API.md │ ├── CHANGELOG.md │ ├── COPYING │ ├── README.md │ ├── aggregate-results.sh │ └── sharness.sh ├── tensor-fma.c ├── tensor-qpx.c ├── tensor.c ├── tensor.h ├── tensorimpl.h └── test │ ├── Makefile │ ├── aggregate-results.sh │ ├── hpgmg-sharness.sh │ ├── sharness.sh │ ├── t010-grid.sh │ ├── t020-fespace.sh │ ├── t030-feinject.sh │ ├── t040-feinterp.sh │ ├── t045-ferestrict.sh │ ├── t100-poisson.sh │ ├── t110-poissondiag.sh │ ├── t120-poissonksp.sh │ ├── t200-mgv.sh │ ├── t220-fmg.sh │ ├── t230-fmg-poisson2.sh │ └── t60-sample.sh ├── finite-volume ├── README ├── example_jobs │ ├── job.biou.00008 │ ├── job.carver.00064 │ ├── job.carver.00128 │ ├── job.carver.00512 │ ├── job.carver.01728 │ ├── job.edison.00064 │ ├── job.edison.00512 │ ├── job.edison.01024 │ ├── job.edison.04096 │ ├── job.edison.08000 │ ├── 
job.edison.10648 │ ├── job.edison.4096.strong │ ├── job.edison.pstate │ ├── job.edison.strong │ ├── job.hopper.01000 │ ├── job.hopper.04096 │ ├── job.hopper.09261 │ ├── job.hopper.13824 │ ├── job.hopper.21952 │ ├── job.hopper.special │ └── job.titan ├── local.mk └── source │ ├── README │ ├── TODO │ ├── compile │ ├── cuda │ ├── blockCopy.h │ ├── boundary_fd.h │ ├── boundary_fv.h │ ├── common.h │ ├── cub │ │ ├── agent │ │ │ ├── agent_histogram.cuh │ │ │ ├── agent_radix_sort_downsweep.cuh │ │ │ ├── agent_radix_sort_upsweep.cuh │ │ │ ├── agent_reduce.cuh │ │ │ ├── agent_reduce_by_key.cuh │ │ │ ├── agent_rle.cuh │ │ │ ├── agent_scan.cuh │ │ │ ├── agent_segment_fixup.cuh │ │ │ ├── agent_select_if.cuh │ │ │ ├── agent_spmv.cuh │ │ │ ├── agent_spmv_orig.cuh │ │ │ └── single_pass_scan_operators.cuh │ │ ├── block │ │ │ ├── block_adjacent_difference.cuh │ │ │ ├── block_discontinuity.cuh │ │ │ ├── block_exchange.cuh │ │ │ ├── block_histogram.cuh │ │ │ ├── block_load.cuh │ │ │ ├── block_radix_rank.cuh │ │ │ ├── block_radix_sort.cuh │ │ │ ├── block_raking_layout.cuh │ │ │ ├── block_reduce.cuh │ │ │ ├── block_scan.cuh │ │ │ ├── block_shuffle.cuh │ │ │ ├── block_store.cuh │ │ │ └── specializations │ │ │ │ ├── block_histogram_atomic.cuh │ │ │ │ ├── block_histogram_sort.cuh │ │ │ │ ├── block_reduce_raking.cuh │ │ │ │ ├── block_reduce_raking_commutative_only.cuh │ │ │ │ ├── block_reduce_warp_reductions.cuh │ │ │ │ ├── block_scan_raking.cuh │ │ │ │ ├── block_scan_warp_scans.cuh │ │ │ │ ├── block_scan_warp_scans2.cuh │ │ │ │ └── block_scan_warp_scans3.cuh │ │ ├── cub.cuh │ │ ├── device │ │ │ ├── device_histogram.cuh │ │ │ ├── device_partition.cuh │ │ │ ├── device_radix_sort.cuh │ │ │ ├── device_reduce.cuh │ │ │ ├── device_run_length_encode.cuh │ │ │ ├── device_scan.cuh │ │ │ ├── device_segmented_radix_sort.cuh │ │ │ ├── device_segmented_reduce.cuh │ │ │ ├── device_select.cuh │ │ │ ├── device_spmv.cuh │ │ │ └── dispatch │ │ │ │ ├── dispatch_histogram.cuh │ │ │ │ ├── 
dispatch_radix_sort.cuh │ │ │ │ ├── dispatch_reduce.cuh │ │ │ │ ├── dispatch_reduce_by_key.cuh │ │ │ │ ├── dispatch_rle.cuh │ │ │ │ ├── dispatch_scan.cuh │ │ │ │ ├── dispatch_select_if.cuh │ │ │ │ ├── dispatch_spmv.cuh │ │ │ │ └── dispatch_spmv_orig.cuh │ │ ├── grid │ │ │ ├── grid_barrier.cuh │ │ │ ├── grid_even_share.cuh │ │ │ ├── grid_mapping.cuh │ │ │ └── grid_queue.cuh │ │ ├── host │ │ │ ├── mutex.cuh │ │ │ └── spinlock.cuh │ │ ├── iterator │ │ │ ├── arg_index_input_iterator.cuh │ │ │ ├── cache_modified_input_iterator.cuh │ │ │ ├── cache_modified_output_iterator.cuh │ │ │ ├── constant_input_iterator.cuh │ │ │ ├── counting_input_iterator.cuh │ │ │ ├── discard_output_iterator.cuh │ │ │ ├── tex_obj_input_iterator.cuh │ │ │ ├── tex_ref_input_iterator.cuh │ │ │ └── transform_input_iterator.cuh │ │ ├── thread │ │ │ ├── thread_load.cuh │ │ │ ├── thread_operators.cuh │ │ │ ├── thread_reduce.cuh │ │ │ ├── thread_scan.cuh │ │ │ ├── thread_search.cuh │ │ │ └── thread_store.cuh │ │ ├── util_allocator.cuh │ │ ├── util_arch.cuh │ │ ├── util_debug.cuh │ │ ├── util_device.cuh │ │ ├── util_macro.cuh │ │ ├── util_namespace.cuh │ │ ├── util_ptx.cuh │ │ ├── util_type.cuh │ │ └── warp │ │ │ ├── specializations │ │ │ ├── warp_reduce_shfl.cuh │ │ │ ├── warp_reduce_smem.cuh │ │ │ ├── warp_scan_shfl.cuh │ │ │ └── warp_scan_smem.cuh │ │ │ ├── warp_reduce.cuh │ │ │ └── warp_scan.cuh │ ├── extra.h │ ├── interpolation.h │ ├── interpolation_v2.h │ ├── interpolation_v4.h │ ├── misc.h │ ├── operators.7pt.cu │ ├── operators.fv2.cu │ ├── operators.fv4.cu │ ├── restriction.h │ └── stencils │ │ ├── chebyshev.flux.fv4.h │ │ ├── gsrb.h │ │ ├── residual.base.h │ │ ├── residual.reg.fv2.h │ │ ├── residual.reg.fv4.h │ │ ├── smooth.base.h │ │ ├── smooth.reg.fv2.h │ │ ├── smooth.reg.fv4.h │ │ └── smooth.smem.fv4.h │ ├── debug.c │ ├── debug.h │ ├── defines.h │ ├── hpgmg-fv.c │ ├── level.c │ ├── level.h │ ├── local.mk │ ├── mg.c │ ├── mg.h │ ├── operators.27pt.c │ ├── operators.7pt.c │ ├── operators.fv2.c 
│ ├── operators.fv4.c │ ├── operators.h │ ├── operators.old.c │ ├── operators.old │ ├── aggregate.mpi │ │ ├── chebyshev.c │ │ ├── gsrb.c │ │ └── jacobi.c │ ├── apply_op.c │ ├── chebyshev.c │ ├── gsrb.c │ ├── iterators.c │ ├── jacobi.c │ ├── misc.c │ ├── residual.c │ └── symgs.c │ ├── operators │ ├── apply_op.c │ ├── blockCopy.c │ ├── boundary_fd.c │ ├── boundary_fv.c │ ├── chebyshev.c │ ├── exchange_boundary.c │ ├── gsrb.c │ ├── interpolation_p0.c │ ├── interpolation_p1.c │ ├── interpolation_p2.c │ ├── interpolation_v2.c │ ├── interpolation_v4.c │ ├── jacobi.c │ ├── misc.c │ ├── problem.fv.c │ ├── problem.p4.c │ ├── problem.p6.c │ ├── problem.sine.c │ ├── rebuild.c │ ├── residual.c │ ├── restriction.c │ └── symgs.c │ ├── solvers.c │ ├── solvers.h │ ├── solvers │ ├── bicgstab.c │ ├── cabicgstab.c │ ├── cacg.c │ ├── cg.c │ └── matmul.c │ ├── timers.c │ ├── timers.h │ └── timers │ ├── mpi.c │ ├── omp.c │ └── x86.c ├── hpgmgconf.py ├── local.mk ├── run.sh ├── run_all_hpgmg.sh └── wrapper.sh /.gitignore: -------------------------------------------------------------------------------- 1 | paper/HPGMG-Async.aux 2 | paper/HPGMG-Async.log 3 | paper/HPGMG-Async.pdf 4 | paper/HPGMG-Async.synctex.gz 5 | -------------------------------------------------------------------------------- /Benchmarks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-ago/hpgmg-cuda-async/7896114058909da072e4c3ddb48a188e9d915fb7/Benchmarks/.DS_Store -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/GPU/out.txt: -------------------------------------------------------------------------------- 1 | MODE: GPU-initiated, SIZE: 4, PROC: 2 2 | use cuda 1 0 0 0 0 3 | Total by level 0.004185 0.002306 0.000697 0.000266 0.000080 0.007535 4 | MODE: GPU-initiated, SIZE: 4, PROC: 4 5 | use cuda 1 0 0 0 0 6 | Total by level 0.005535 0.003701 0.001485 0.001090 0.000206 0.012017 7 | 
MODE: GPU-initiated, SIZE: 4, PROC: 8 8 | use cuda 1 0 0 0 0 0 9 | Total by level 0.005816 0.004161 0.001490 0.000921 0.000328 0.000099 0.012814 10 | MODE: GPU-initiated, SIZE: 4, PROC: 16 11 | use cuda 1 0 0 0 0 12 | Total by level 0.006549 0.004591 0.002221 0.002146 0.000773 0.016280 13 | MODE: GPU-initiated, SIZE: 5, PROC: 2 14 | use cuda 1 1 0 0 0 0 15 | Total by level 0.005164 0.008937 0.003429 0.000919 0.000331 0.000100 0.018880 16 | MODE: GPU-initiated, SIZE: 5, PROC: 4 17 | use cuda 1 1 0 0 0 0 18 | Total by level 0.007328 0.011813 0.005473 0.001897 0.001216 0.000246 0.027973 19 | MODE: GPU-initiated, SIZE: 5, PROC: 8 20 | use cuda 1 1 0 0 0 0 0 21 | Total by level 0.007448 0.011978 0.006194 0.001803 0.001159 0.000392 0.000118 0.029091 22 | MODE: GPU-initiated, SIZE: 5, PROC: 16 23 | use cuda 1 1 0 0 0 0 24 | Total by level 0.009244 0.014551 0.006774 0.002919 0.002649 0.000933 0.037070 25 | MODE: GPU-initiated, SIZE: 6, PROC: 2 26 | use cuda 1 1 1 0 0 0 0 27 | Total by level 0.015952 0.000940 0.029312 0.004217 0.001140 0.000388 0.000120 0.052069 28 | MODE: GPU-initiated, SIZE: 6, PROC: 4 29 | use cuda 1 1 1 0 0 0 0 30 | Total by level 0.026787 0.001212 0.044937 0.007178 0.002417 0.001444 0.000292 0.084268 31 | MODE: GPU-initiated, SIZE: 6, PROC: 8 32 | use cuda 1 1 1 0 0 0 0 0 33 | Total by level 0.027799 0.001686 0.045790 0.008204 0.002202 0.001366 0.000454 0.000138 0.087638 34 | MODE: GPU-initiated, SIZE: 6, PROC: 16 35 | use cuda 1 1 1 0 0 0 0 36 | Total by level 0.033302 0.002507 0.054828 0.008998 0.003602 0.003365 0.001057 0.107659 37 | MODE: GPU-initiated, SIZE: 7, PROC: 2 38 | use cuda 1 1 1 1 0 0 0 0 39 | Total by level 0.086922 0.000955 0.001372 0.133889 0.005225 0.001368 0.000451 0.000138 0.230319 40 | MODE: GPU-initiated, SIZE: 7, PROC: 4 41 | use cuda 1 1 1 1 0 0 0 0 42 | Total by level 0.155914 0.001211 0.001808 0.231071 0.008935 0.002912 0.001677 0.000339 0.403867 43 | MODE: GPU-initiated, SIZE: 7, PROC: 8 44 | use cuda 1 1 1 1 0 0 0 0 0 45 | 
Total by level 0.168862 0.001744 0.002545 0.245997 0.010298 0.002673 0.001588 0.000525 0.000160 0.434392 46 | MODE: GPU-initiated, SIZE: 7, PROC: 16 47 | use cuda 1 1 1 1 0 0 0 0 48 | Total by level 0.186298 0.002414 0.003675 0.276368 0.011224 0.004471 0.003649 0.001197 0.489296 49 | 50 | -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/Sync-original/hpgmg-MPI-s6-p24.txt: -------------------------------------------------------------------------------- 1 | [tesla40:19195] Warning: could not find environment variable "MP_EVENT_ASYNC" 2 | [tesla40:19195] Warning: could not find environment variable "MP_ENABLE_WARN" 3 | [tesla40:19195] Warning: could not find environment variable "MP_GUARD_PROGRESS" 4 | [tesla40:19195] Warning: could not find environment variable "CUDA_VISIBLE_DEVICES" 5 | [tesla40:19195] Warning: could not find environment variable "SIZE" 6 | [tesla40:19195] Warning: could not find environment variable "MAX_SIZE" 7 | [tesla40:19195] Warning: could not find environment variable "KERNEL_TIME" 8 | [tesla40:19195] Warning: could not find environment variable "CALC_SIZE" 9 | [tesla40:19195] Warning: could not find environment variable "COMM_COMP_RATIO" 10 | [tesla40:19195] Warning: could not find environment variable "USE_SINGLE_STREAM" 11 | [tesla40:19195] Warning: could not find environment variable "USE_GPU_ASYNC" 12 | [tesla40:19195] Warning: could not find environment variable "COMM_USE_GDRDMA" 13 | COMM_USE_COMM=0 14 | COMM_USE_ASYNC=0 15 | COMM_USE_GPU_COMM=0 16 | -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/Sync-original/hpgmg-MPI-s7-p24.txt: -------------------------------------------------------------------------------- 1 | [tesla40:19228] Warning: could not find environment variable "MP_EVENT_ASYNC" 2 | [tesla40:19228] Warning: could not find environment variable "MP_ENABLE_WARN" 3 | [tesla40:19228] Warning: could not find 
environment variable "MP_GUARD_PROGRESS" 4 | [tesla40:19228] Warning: could not find environment variable "CUDA_VISIBLE_DEVICES" 5 | [tesla40:19228] Warning: could not find environment variable "SIZE" 6 | [tesla40:19228] Warning: could not find environment variable "MAX_SIZE" 7 | [tesla40:19228] Warning: could not find environment variable "KERNEL_TIME" 8 | [tesla40:19228] Warning: could not find environment variable "CALC_SIZE" 9 | [tesla40:19228] Warning: could not find environment variable "COMM_COMP_RATIO" 10 | [tesla40:19228] Warning: could not find environment variable "USE_SINGLE_STREAM" 11 | [tesla40:19228] Warning: could not find environment variable "USE_GPU_ASYNC" 12 | [tesla40:19228] Warning: could not find environment variable "COMM_USE_GDRDMA" 13 | COMM_USE_COMM=0 14 | COMM_USE_ASYNC=0 15 | COMM_USE_GPU_COMM=0 16 | -------------------------------------------------------------------------------- /HPGMG_Async_manuscript.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-ago/hpgmg-cuda-async/7896114058909da072e4c3ddb48a188e9d915fb7/HPGMG_Async_manuscript.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 2 | 3 | The U.S. Department of Energy funded the development of this software 4 | under subcontract B609478 with Lawrence Livermore National Security (LLNS). 5 | 6 | Copyright (c) 2014, The Regents of the University of California, through 7 | Lawrence Berkeley National Laboratory and UChicago Argonne, LLC. 8 | All rights reserved. 
9 | 10 | Redistribution and use in source and binary forms, with or without modification, 11 | are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 15 | * Redistributions in binary form must reproduce the above copyright notice, this 16 | list of conditions and the following disclaimer in the documentation and/or 17 | other materials provided with the distribution. 18 | * Neither the name of NVIDIA CORPORATION, Lawrence Livermore National 19 | Security, the U.S. Department of Energy, nor the names of its 20 | contributors may be used to endorse or promote products derived 21 | from this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 24 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 27 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 30 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | HPGMG_ARCH := $(if $(PETSC_ARCH),$(PETSC_ARCH),build) 2 | 3 | all : 4 | ./configure --arch=$(HPGMG_ARCH) 5 | $(MAKE) -C $(HPGMG_ARCH) 6 | @echo "Build complete in $(HPGMG_ARCH). 
Use make -C $(HPGMG_ARCH) test to test." 7 | 8 | test : all 9 | $(MAKE) -C $(HPGMG_ARCH) test 10 | 11 | clean : 12 | $(MAKE) -C $(HPGMG_ARCH) clean 13 | 14 | .PHONY: all test clean 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPGMG-FV CUDA Async 2 | 3 | ## Build 4 | 5 | In `build.sh` script you need to set the GPUDirect Async path. 6 | You can download all GPUDirect Async libraries and headers from here [GDAsync suite](https://github.com/e-ago/gdasync) 7 | 8 | [HPGMG](https://hpgmg.org) is an HPC benchmarking effort and supercomputing ranking metric based on geometric multigrid methods. NVIDIA reworked the original implementation moving from a CPU implementation to an [hybrid CPU-GPU solution](https://bitbucket.org/nsakharnykh/hpgmg-cuda). 9 | In this repository, starting from the NVIDIA solution, we leverage the communications with [GPUDirect Async](https://github.com/gpudirect/libgdsync), recently released by NVIDIA. 10 | 11 | For further information about Async and benchmarks, please refer to: 12 | 13 | - ["GPUDirect Async: exploring GPU synchronous communication techniques for InfiniBand clusters"](https://www.sciencedirect.com/science/article/pii/S0743731517303386), E. Agostini, D. Rossetti, S. Potluri. Journal of Parallel and Distributed Computing, Vol. 114, Pages 28-45, April 2018 14 | - ["Offloading communication control logic in GPU accelerated applications"](http://ieeexplore.ieee.org/document/7973709), E. Agostini, D. Rossetti, S. Potluri. 
Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid’ 17), IEEE Conference Publications, Pages 248-257, Nov 2016 15 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | # find MPI compiler 2 | CC=`which mpicc` 3 | #CC=`which mpiicc` 4 | CXX=`which mpic++` 5 | 6 | # find NVCC compiler 7 | NVCC=`which nvcc` 8 | 9 | # set gpu architectures to compile for 10 | CUDA_ARCH="-gencode code=sm_35,arch=compute_35 " 11 | CUDA_ARCH="$CUDA_ARCH -gencode code=sm_60,arch=compute_60 " 12 | #CUDA_ARCH="$CUDA_ARCH -gencode code=sm_70,arch=compute_70 " 13 | 14 | # main tile size 15 | OPTS="-DBLOCKCOPY_TILE_I=32 " 16 | OPTS=" $OPTS -DBLOCKCOPY_TILE_J=4 " 17 | OPTS=" $OPTS -DBLOCKCOPY_TILE_K=8 " 18 | 19 | # special tile size for boundary conditions 20 | OPTS=" $OPTS -DBOUNDARY_TILE_I=64 " 21 | OPTS=" $OPTS -DBOUNDARY_TILE_J=16 " 22 | OPTS=" $OPTS -DBOUNDARY_TILE_K=16 " 23 | 24 | # max number of solves after warmup 25 | OPTS=" $OPTS -DMAX_SOLVES=100 " 26 | 27 | # host level threshold: number of grid elements 28 | OPTS=" $OPTS -DHOST_LEVEL_SIZE_THRESHOLD=10000 " 29 | 30 | # max number of solves after warmup 31 | #OPTS=" $OPTS -DMAX_SOLVES=10 " 32 | 33 | # unified memory allocation options 34 | OPTS=" $OPTS -DCUDA_UM_ALLOC " 35 | #cudaHostAlloc 36 | OPTS=" $OPTS -DCUDA_UM_ZERO_COPY " 37 | 38 | # MPI buffers allocation policy 39 | #cudaHostAlloc 40 | OPTS=" $OPTS -DMPI_ALLOC_ZERO_COPY " 41 | #cudaMalloc 42 | #OPTS=" $OPTS -DMPI_ALLOC_PINNED " 43 | 44 | :<=2&&argv[1]?argv[1]:"");CHKERRQ(ierr); 43 | ierr = PetscFunctionListView(actionlist,PETSC_VIEWER_STDERR_WORLD);CHKERRQ(ierr); 44 | goto out; 45 | } 46 | ierr = PetscFunctionListFind(actionlist,argv[1],action);CHKERRQ(ierr); 47 | if (!*action) { 48 | ierr = PetscViewerASCIIPrintf(PETSC_VIEWER_STDERR_WORLD,"Unknown action 
'%s':",argc>=2&&argv[1]?argv[1]:"");CHKERRQ(ierr); 49 | ierr = PetscFunctionListView(actionlist,PETSC_VIEWER_STDERR_WORLD);CHKERRQ(ierr); 50 | goto out; 51 | } 52 | out: 53 | ierr = PetscFunctionListDestroy(&actionlist);CHKERRQ(ierr); 54 | PetscFunctionReturn(0); 55 | } 56 | 57 | int main(int argc, char *argv[]) 58 | { 59 | PetscErrorCode ierr,(*action)(void); 60 | 61 | PetscInitialize(&argc,&argv,NULL,help); 62 | ierr = ActionParse(argc,argv,&action);CHKERRQ(ierr); 63 | if (!action) { 64 | PetscFinalize(); 65 | return 1; 66 | } 67 | ierr = (*action)();CHKERRQ(ierr); 68 | PetscFinalize(); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /finite-element/fefas.h: -------------------------------------------------------------------------------- 1 | #ifndef _fefas_h 2 | #define _fefas_h 3 | 4 | #include 5 | #include 6 | #include "op/fefas-op.h" 7 | 8 | typedef struct Grid_private *Grid; 9 | 10 | typedef enum {DOMAIN_INTERIOR=0x1,DOMAIN_EXTERIOR=0x2,DOMAIN_CLOSURE=0x3} DomainMode; 11 | 12 | PetscErrorCode GridCreate(MPI_Comm comm,const PetscInt M[3],const PetscInt p[3],PetscInt cmax,Grid *grid); 13 | PetscErrorCode GridDestroy(Grid *grid); 14 | PetscErrorCode GridView(Grid grid); 15 | PetscErrorCode GridGetNumLevels(Grid grid,PetscInt *nlevels); 16 | PetscInt GridLevelFromM(const PetscInt M[3]); 17 | PetscErrorCode DMCreateFE(Grid grid,PetscInt fedegree,PetscInt dof,DM *dmfe); 18 | PetscErrorCode DMDestroyFE(DM *dm); 19 | PetscErrorCode DMFESetUniformCoordinates(DM dm,const PetscReal L[]); 20 | PetscErrorCode DMFEGetUniformCoordinates(DM dm,PetscReal L[]); 21 | PetscErrorCode DMFEGetInfo(DM dm,PetscInt *fedegree,PetscInt *level,PetscInt mlocal[],PetscInt Mglobal[],PetscInt procs[]); 22 | PetscErrorCode DMFEGetTensorEval(DM dm,PetscInt *P,PetscInt *Q,const PetscReal **B,const PetscReal **D,const PetscReal **x,const PetscReal **w,const PetscReal **w3); 23 | PetscErrorCode DMFEGetNumElements(DM dm,PetscInt *nelems); 
24 | PetscErrorCode DMFEExtractElements(DM dm,const PetscScalar *u,PetscInt elem,PetscInt ne,PetscScalar *y); 25 | PetscErrorCode DMFESetElements(DM dm,PetscScalar *u,PetscInt elem,PetscInt ne,InsertMode imode,DomainMode dmode,const PetscScalar *y); 26 | PetscErrorCode DMFECoarsen(DM dm,DM *dmcoarse); 27 | PetscErrorCode DMFEInject(DM dm,Vec Uf,Vec Uc); 28 | PetscErrorCode DMFEInterpolate(DM dm,Vec Uc,Vec Uf); 29 | PetscErrorCode DMFERestrict(DM dm,Vec Uf,Vec Uc); 30 | PetscErrorCode DMFEZeroBoundaries(DM dm,Vec U); 31 | PetscErrorCode DMCoordDistort(DM dm,const PetscReal L[]); 32 | 33 | typedef struct MG_private *MG; 34 | PetscErrorCode MGCreate(Op op,DM dm,PetscInt nlevels,MG *newmg); 35 | PetscErrorCode MGDestroy(MG *mg); 36 | PetscErrorCode MGMonitorSet(MG mg,PetscBool mon); 37 | PetscErrorCode MGSetUpPC(MG mg); 38 | PetscErrorCode MGFCycle(Op op,MG mg,PetscInt presmooths,PetscInt postsmooths,Vec B,Vec U); 39 | 40 | PetscInt SampleGridNumLevels(const PetscInt p[]); 41 | int64_t SampleGridNumElements(const PetscInt p[]); 42 | PetscErrorCode SampleGridRangeCreate(PetscMPIInt nranks,PetscInt minlocal,PetscInt maxlocal,PetscInt maxsamples,PetscInt *nsamples,PetscInt **gridsizes); 43 | PetscErrorCode ProcessGridFindSquarest(PetscMPIInt nranks,PetscInt squarest[3]); 44 | 45 | PetscErrorCode MemoryGetUsage(double *heapused,double *heapavail); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /finite-element/hpgmg-analyze.py: -------------------------------------------------------------------------------- 1 | def parse_logfile(fname): 2 | import re 3 | FP = r'([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)' 4 | PERFLINE = [] 5 | PERFLINE.append(re.compile(r'Q2 G''\[([\d ]{4})([\d ]{4})([\d ]{4})\] P\[ *(\d+) +(\d+) +(\d+)\] '+FP+r' s +'+FP+r' GF +'+FP+r' MEq/s')) 6 | PERFLINE.append(re.compile(r'Q2 G''\[([\d ]{5})([\d ]{5})([\d ]{5})\] P\[ *(\d+) +(\d+) +(\d+)\] '+FP+r' s +'+FP+r' GF +'+FP+r' MEq/s')) 7 | HOSTLINE = 
re.compile(r'.*on a ([a-z\-_0-9]+) named [^ ]+ with (\d+) processors') 8 | Dofs = [] 9 | GFlops = [] 10 | MEqs = [] 11 | Procs = None 12 | HostName = 'unknown' 13 | with open(fname) as f: 14 | while 'Starting performance sampling' not in next(f): 15 | pass 16 | while True: 17 | line = next(f) 18 | for perfline in PERFLINE: 19 | m = re.match(perfline,line) 20 | if m: break 21 | if not m: break 22 | g0,g1,g2, p0,p1,p2, time, gflops, meqs = m.groups() 23 | g = (float(g0)*2+1)*(float(g1)*2+1)*(float(g2)*2+1) 24 | p = int(p0)*int(p1)*int(p2) 25 | Dofs.append(g) 26 | GFlops.append(float(gflops)) 27 | MEqs.append(float(meqs)) 28 | if Procs is None: 29 | Procs = p 30 | elif p != Procs: 31 | raise RuntimeError('Procs varies within file "%s"' % (fname,)) 32 | while True: 33 | line = next(f) 34 | m = re.match(HOSTLINE,line) 35 | if m: 36 | HostName, p = m.groups() 37 | assert int(p) == Procs 38 | break 39 | 40 | return Dofs, GFlops, MEqs, HostName, Procs 41 | 42 | def plot(args): 43 | symbols = iter(['ro', 'bv', 'ks', 'g^', 'bx']) 44 | import matplotlib.pyplot as plt 45 | fig, ax1 = plt.subplots() 46 | plt.title('HPGMG-FE Performance') 47 | if args.perprocess: 48 | plt.xlabel('Number of equations/process') 49 | else: 50 | plt.xlabel('Global number of equations') 51 | ax2 = ax1.twinx() 52 | #ax1.set_autoscaley_on(False) 53 | ax1.set_ylabel('MEquations/second') 54 | all_dofs = [] 55 | all_gflops = [] 56 | all_meqs = [] 57 | max_meqs = 0 58 | for f in args.logfiles: 59 | dofs, gflops, meqs, hostname, procs = parse_logfile(f) 60 | if args.perprocess: 61 | dofs = [d/procs for d in dofs] 62 | all_dofs += dofs 63 | all_gflops += gflops 64 | all_meqs += meqs 65 | if args.loglog: 66 | ax1.loglog(dofs, meqs, next(symbols), label='%s np=%d'%(hostname, procs)) 67 | else: 68 | ax1.semilogx(dofs, meqs, next(symbols), label='%s np=%d'%(hostname, procs)) 69 | flops_per_meqn = all_gflops[-1] / all_meqs[-1] 70 | ax1.set_xlim(0.9*min(all_dofs),1.05*max(all_dofs)) 71 | 
ax2.set_xlim(0.9*min(all_dofs),1.05*max(all_dofs)) 72 | ax2.set_autoscaley_on(False) 73 | if args.loglog: 74 | ax2.set_yscale('log') 75 | ax1.legend(loc='lower right') 76 | else: 77 | ax1.legend(loc='upper left') 78 | ax1.set_ylim(0.9*min(all_meqs),1.1*max(all_meqs)) 79 | ax2.set_ylim(0.9*min(all_meqs)*flops_per_meqn,1.1*max(all_meqs)*flops_per_meqn) 80 | ax2.set_ylabel('GFlop/s') 81 | if args.output: 82 | plt.savefig(args.output) 83 | else: 84 | plt.show() 85 | 86 | if __name__ == "__main__": 87 | import argparse 88 | parser = argparse.ArgumentParser('FE-FAS Performance Analyzer') 89 | parser.add_argument('-o', '--output', type=str, help='Output file') 90 | parser.add_argument('--loglog', action='store_true', help='Use logarithmic y axis (x is always logarithmic)') 91 | parser.add_argument('--perprocess', action='store_true', help='Use problem size per process for x axis') 92 | parser.add_argument('logfiles', nargs='+', type=str, help='List of files to process, usually including -log_summary') 93 | args = parser.parse_args() 94 | plot(args) 95 | -------------------------------------------------------------------------------- /finite-element/local.mk: -------------------------------------------------------------------------------- 1 | hpgmg-fe-y.c += $(call thisdir, \ 2 | fefas.c \ 3 | fefas-test.c \ 4 | fmg.c \ 5 | grid.c \ 6 | memusage.c \ 7 | sampler.c \ 8 | tensor.c \ 9 | tensor-fma.c \ 10 | tensor-qpx.c \ 11 | ) 12 | 13 | include $(call incsubdirs,op) 14 | -------------------------------------------------------------------------------- /finite-element/memusage.c: -------------------------------------------------------------------------------- 1 | #include "fefas.h" 2 | 3 | #ifdef __bgq__ 4 | # include 5 | #endif 6 | 7 | PetscErrorCode MemoryGetUsage(double *heapused,double *heapavail) { 8 | PetscFunctionBegin; 9 | *heapused = -1; 10 | *heapavail = -1; 11 | #ifdef __bgq__ 12 | { 13 | uint64_t heap,avail; 14 | Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP,&heap); 15 
| Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAPAVAIL,&avail); 16 | *heapused = (double)heap; 17 | *heapavail = (double)avail; 18 | } 19 | #else 20 | { 21 | PetscErrorCode ierr; 22 | ierr = PetscMemoryGetCurrentUsage(heapused);CHKERRQ(ierr); 23 | } 24 | #endif 25 | PetscFunctionReturn(0); 26 | } 27 | -------------------------------------------------------------------------------- /finite-element/op/fefas-op.h: -------------------------------------------------------------------------------- 1 | #ifndef _fefas_op_h 2 | #define _fefas_op_h 3 | 4 | #include 5 | #include "../tensor.h" 6 | 7 | typedef struct Op_private *Op; 8 | 9 | MPI_Comm OpComm(Op); 10 | PetscErrorCode OpCreateFromOptions(MPI_Comm,Op*); 11 | PetscErrorCode OpDestroy(Op*); 12 | PetscErrorCode OpSetDof(Op,PetscInt); 13 | PetscErrorCode OpGetDof(Op,PetscInt*); 14 | PetscErrorCode OpSetFEDegree(Op,PetscInt); 15 | PetscErrorCode OpGetFEDegree(Op,PetscInt*); 16 | PetscErrorCode OpSetContext(Op,void*); 17 | PetscErrorCode OpGetContext(Op,void*); 18 | PetscErrorCode OpSetApply(Op,PetscErrorCode (*)(Op,DM,Vec,Vec)); 19 | PetscErrorCode OpSetPointwiseSolution(Op,PetscErrorCode (*)(Op,const PetscReal[],const PetscReal[],PetscScalar[])); 20 | PetscErrorCode OpSetPointwiseForcing(Op,PetscErrorCode (*)(Op,const PetscReal[],const PetscReal[],PetscScalar[])); 21 | typedef PetscErrorCode (*OpPointwiseElementFunction)(Op,PetscInt,PetscInt,const PetscScalar[],const PetscReal[],const PetscScalar[],PetscScalar[]); 22 | PetscErrorCode OpSetPointwiseElement(Op,OpPointwiseElementFunction,PetscInt); 23 | PetscErrorCode OpSetAffineOnly(Op op,PetscBool affine); 24 | PetscErrorCode OpGetAffineOnly(Op op,PetscBool *affine); 25 | PetscErrorCode OpSetDestroy(Op,PetscErrorCode (*)(Op)); 26 | PetscErrorCode OpRegister(const char name[],PetscErrorCode (*f)(Op)); 27 | PetscErrorCode OpInitializePackage(void); 28 | PetscErrorCode OpFinalizePackage(void); 29 | PetscErrorCode OpDestroy(Op*); 30 | PetscErrorCode OpApply(Op op,DM dm,Vec U,Vec F); 
#!/usr/bin/env python

import re

# Matches the definition line of a per-operator constructor, e.g.
# "PetscErrorCode OpCreate_Poisson1(Op op)"; group 1 is the suffix after
# "OpCreate_".  Anchored at column 0 (re.match), so indented mentions are
# ignored.
CREATE_RE = re.compile(r'PetscErrorCode OpCreate_(\w+) ?\(')

def mangle(name):
    """Map a C suffix such as 'Poisson_1' to its (option-name, C-name) pair,
    e.g. ('poisson-1', 'Poisson_1')."""
    return name.lower().replace('_', '-'), name

def build_ops(files):
    """Scan the given C source files and return one (option-name, C-suffix)
    pair for every OpCreate_* constructor definition found, in file order."""
    ops = []
    for src in files:
        with open(src) as f:
            for line in f:
                m = CREATE_RE.match(line)
                if m:
                    ops.append(mangle(m.group(1)))
    return ops

def genregister(outname, files):
    """Generate a C source file 'outname' that declares every operator
    constructor found in 'files' and registers each one with OpRegister
    inside OpRegisterAll_Generated()."""
    ops = build_ops(files)
    with open(outname, 'w') as out:
        # NOTE(review): the original template's #include target was lost in
        # extraction; <op/fefas-op.h> matches the -I$(HPGMG_FE_DIR) flag added
        # for register.o in op/local.mk -- confirm against the build.
        # The generated function takes (void): an empty parameter list in a
        # C definition declares an unprototyped function.
        out.write("""#include <op/fefas-op.h>

%(opdecl)s

PetscErrorCode OpRegisterAll_Generated(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  %(opreg)s
  PetscFunctionReturn(0);
}
""" % dict(opdecl='\n'.join('PetscErrorCode OpCreate_%s(Op);' % (o[1],) for o in ops),
           opreg='\n  '.join(['ierr = OpRegister("%s",OpCreate_%s);CHKERRQ(ierr);' % o for o in ops])))

if __name__ == '__main__':
    import sys
    genregister(sys.argv[1], sys.argv[2:])
op-impls.c := $(wildcard $(call thisdir,op-*.c)) 2 | genregister := $(call thisdir,genregister.py) 3 | register.c := $(OBJDIR)/register.c 4 | 5 | hpgmg-fe-y.c += $(call thisdir, \ 6 | op.c \ 7 | ) $(op-impls.c) $(register.c) 8 | 9 | $(register.c) : $(genregister) $(op-impls.c) | $$(@D)/.DIR 10 | $(PYTHON) $(genregister) $@ $(op-impls.c) 11 | 12 | HPGMG_FE_DIR := $(call thisdir,..) 13 | $(OBJDIR)/register.o : HPGMG_CPPFLAGS += -I$(HPGMG_FE_DIR) 14 | -------------------------------------------------------------------------------- /finite-element/pointwise.h: -------------------------------------------------------------------------------- 1 | #ifndef _pointwise_h 2 | #define _pointwise_h 3 | 4 | #include 5 | #include "fefas-align.h" 6 | 7 | static PetscErrorCode PointwiseJacobianInvert(PetscInt ne,PetscInt Q,const PetscReal w[Q],PetscScalar dx[3][3][Q][ne],PetscScalar wdxdet[Q][ne]) 8 | { 9 | PetscInt i,j,k,e; 10 | 11 | for (i=0; i 6 | 7 | #ifndef __FMA__ 8 | # define _mm256_fmadd_pd(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b),c) 9 | #endif 10 | 11 | #define NE 4 12 | 13 | static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) 14 | { 15 | 16 | PetscFunctionBegin; 17 | if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} 18 | { 19 | PetscReal R[Q][P],S[Q][P],T[Q][P]; 20 | const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx; 21 | PetscScalar (*y)[P*P*P][NE] = (PetscScalar(*)[Q*Q*Q][NE])yy; 22 | PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align; 23 | 24 | for (PetscInt i=0; ine == 4) { 105 | PetscInt P = ten->P,Q = ten->Q; 106 | switch (ten->dof) { 107 | case 1: // Scalar problems with Q1 or Q2 elements 108 | if (P == 2 && Q == 2) ten->Contract = TensorContract_FMA_4_1_2_2; 109 | else if (P == 3 && Q == 3) ten->Contract = TensorContract_FMA_4_1_3_3; 110 | break; 111 | case 3: // 
Coordinates or elasticity 112 | if (P == 2 && Q == 2) ten->Contract = TensorContract_FMA_4_3_2_2; 113 | else if (P == 3 && Q == 3) ten->Contract = TensorContract_FMA_4_3_3_3; 114 | break; 115 | } 116 | } 117 | #endif 118 | PetscFunctionReturn(0); 119 | } 120 | -------------------------------------------------------------------------------- /finite-element/tensor.c: -------------------------------------------------------------------------------- 1 | #include "tensorimpl.h" 2 | 3 | static inline PetscErrorCode TensorContract_Inline(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) 4 | { 5 | 6 | PetscFunctionBegin; 7 | if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} 8 | { 9 | PetscReal R[Q][P],S[Q][P],T[Q][P]; 10 | const PetscScalar (*restrict x)[P*P*P][ne]_align = (const PetscScalar(*)[P*P*P][ne])xx; 11 | PetscScalar (*restrict y)[P*P*P][ne]_align = (PetscScalar(*)[Q*Q*Q][ne])yy; 12 | PetscScalar u[dof][Q*P*P][ne]_align,v[dof][Q*Q*P][ne]_align; 13 | 14 | for (PetscInt i=0; ine,dof = ten->dof,P = ten->P,Q = ten->Q; 65 | return TensorContract_Inline(ne,dof,P,Q,Rf,Sf,Tf,tmode,xx,yy); 66 | } 67 | 68 | static PetscErrorCode TensorContract_Ref_4_1_2_2(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 69 | return TensorContract_Inline(4,1,2,2,Rf,Sf,Tf,tmode,xx,yy); 70 | } 71 | static PetscErrorCode TensorContract_Ref_4_3_2_2(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 72 | return TensorContract_Inline(4,3,2,2,Rf,Sf,Tf,tmode,xx,yy); 73 | } 74 | static PetscErrorCode TensorContract_Ref_4_1_3_3(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 75 | return 
TensorContract_Inline(4,1,3,3,Rf,Sf,Tf,tmode,xx,yy);
}
static PetscErrorCode TensorContract_Ref_4_3_3_3(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) {
  return TensorContract_Inline(4,3,3,3,Rf,Sf,Tf,tmode,xx,yy);
}

/* Architecture-specific kernel selectors.  NOTE(review): in tensor-fma.c the
   AVX/FMA selector replaces ten->Contract only for ne==4 and the common
   dof/P/Q combinations; presumably the QPX one behaves analogously -- confirm
   against tensor-qpx.c, which is outside this view. */
PetscErrorCode TensorSelect_AVX(Tensor);
PetscErrorCode TensorSelect_QPX(Tensor);

/* Create a tensor-contraction context for batches of ne elements with dof
   components per node and a P-point to Q-point 1-D basis.  Starts with the
   generic reference kernel, installs a compile-time-specialized reference
   variant for the common (ne=4, dof=1|3, P=Q=2|3) cases, then lets the
   AVX/QPX selectors override the choice. */
PetscErrorCode TensorCreate(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,Tensor *ten) {
  Tensor t;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscNew(&t);CHKERRQ(ierr);
  t->ne = ne;
  t->dof = dof;
  t->P = P;
  t->Q = Q;
  /* Generic fallback; works for any (ne,dof,P,Q). */
  t->Contract = TensorContract_Ref;
  if (ne == 4) {
    switch (dof) {
    case 1: // Scalar problems with Q1 or Q2 elements
      if (P == 2 && Q == 2) t->Contract = TensorContract_Ref_4_1_2_2;
      else if (P == 3 && Q == 3) t->Contract = TensorContract_Ref_4_1_3_3;
      break;
    case 3: // Coordinates or elasticity
      if (P == 2 && Q == 2) t->Contract = TensorContract_Ref_4_3_2_2;
      else if (P == 3 && Q == 3) t->Contract = TensorContract_Ref_4_3_3_3;
      break;
    }
  }
  /* Selector order matters: QPX runs last and may override AVX's choice. */
  ierr = TensorSelect_AVX(t);CHKERRQ(ierr);
  ierr = TensorSelect_QPX(t);CHKERRQ(ierr);
  *ten = t;
  PetscFunctionReturn(0);
}

/* Free a context allocated by TensorCreate. */
PetscErrorCode TensorDestroy(Tensor *ten) {
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFree(*ten);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Dispatch to whichever kernel was selected at creation time. */
PetscErrorCode TensorContract(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) {
  return (*ten->Contract)(ten,Rf,Sf,Tf,tmode,xx,yy);
}
_tensor_h 2 | #define _tensor_h 3 | 4 | #include 5 | #include "fefas-align.h" 6 | 7 | typedef enum {TENSOR_EVAL,TENSOR_TRANSPOSE} TensorMode; 8 | 9 | typedef struct Tensor_private *Tensor; 10 | 11 | PetscErrorCode TensorCreate(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,Tensor *ten); 12 | PetscErrorCode TensorDestroy(Tensor *ten); 13 | PetscErrorCode TensorContract(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /finite-element/tensorimpl.h: -------------------------------------------------------------------------------- 1 | #ifndef _tensorimpl_h 2 | #define _tensorimpl_h 3 | 4 | #include "tensor.h" 5 | 6 | struct Tensor_private { 7 | PetscInt ne; 8 | PetscInt dof; 9 | PetscInt P; 10 | PetscInt Q; 11 | PetscErrorCode (*Contract)(Tensor,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]); 12 | }; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /finite-element/test/Makefile: -------------------------------------------------------------------------------- 1 | # Run tests 2 | # 3 | # Copyright (c) 2011-2012 Mathias Lafeldt 4 | # Copyright (c) 2005-2012 Git project 5 | # Copyright (c) 2005-2012 Junio C Hamano 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 2 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 
16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program. If not, see http://www.gnu.org/licenses/ . 19 | 20 | SHELL_PATH ?= $(SHELL) 21 | SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) 22 | RM ?= rm -f 23 | PROVE ?= prove 24 | AGGREGATE_SCRIPT ?= aggregate-results.sh 25 | DEFAULT_TEST_TARGET ?= test 26 | 27 | T = $(wildcard t[0-9]*.sh) 28 | 29 | all: $(DEFAULT_TEST_TARGET) 30 | 31 | test: pre-clean 32 | $(MAKE) aggregate-results-and-cleanup 33 | 34 | prove: pre-clean 35 | @echo "*** prove ***"; $(PROVE) --exec '$(SHELL_PATH_SQ)' $(PROVE_OPTS) $(T) :: $(TEST_OPTS) 36 | $(MAKE) clean-except-prove-cache 37 | 38 | $(T): 39 | @echo "*** $@ ***"; '$(SHELL_PATH_SQ)' $@ $(TEST_OPTS) 40 | 41 | pre-clean: 42 | $(RM) -r test-results 43 | 44 | clean-except-prove-cache: 45 | $(RM) -r 'trash directory'.* test-results 46 | 47 | clean: clean-except-prove-cache 48 | $(RM) .prove 49 | 50 | aggregate-results-and-cleanup: $(T) 51 | $(MAKE) aggregate-results 52 | $(MAKE) clean 53 | 54 | aggregate-results: 55 | for f in test-results/*.counts; do \ 56 | echo "$$f"; \ 57 | done | '$(SHELL_PATH_SQ)' '$(AGGREGATE_SCRIPT)' 58 | 59 | .PHONY: all test prove $(T) pre-clean clean 60 | .PHONY: aggregate-results-and-cleanup aggregate-results 61 | -------------------------------------------------------------------------------- /finite-element/test/aggregate-results.sh: -------------------------------------------------------------------------------- 1 | ../sharness/aggregate-results.sh -------------------------------------------------------------------------------- /finite-element/test/hpgmg-sharness.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./sharness.sh 4 | 5 | # Public: Run parallel executable and compare output 6 | # 7 | # When the test passed, an "ok" message is printed and the number of successful 8 | # tests is incremented. 
When it failed, a "not ok" message is printed and the
# number of failed tests is incremented.
#
# With --immediate, exit test immediately upon the first failed test.
#
# Usually takes four arguments:
# $1 - Test description
# $2 - Number of processes
# $3 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $4 - Expected output
#
# With five arguments, the first will be taken to be a prerequisite:
# $1 - Comma-separated list of test prerequisites. The test will be skipped if
#      not all of the given prerequisites are set. To negate a prerequisite,
#      put a "!" in front of it.
# $2 - Test description
# $3 - Number of processes
# $4 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $5 - Expected output
#
# Returns nothing.
test_expect_stdout() {
	# With 5 args the first is the prerequisite list; shift it away so the
	# remaining handling is identical for both call forms.
	test "$#" = 5 && { test_prereq=$1; shift; } || test_prereq=
	test "$#" = 4 || error "bug in test script: $# not 4 or 5 parameters to test_expect_stdout"

	export test_prereq
	if ! test_skip_ "$@"; then
		# fds 3 and 4 are the verbose-output descriptors set up by sharness --
		# NOTE(review): grounded in sharness convention, confirm in sharness.sh.
		say >&3 "expecting success: $2 $3"
		# $4 is conventionally written as '<newline>...<newline>'; drop the
		# first and last (empty) lines so the reference matches raw output.
		sed '1d;$d' <<<"$4" > reference.out
		diffoutput=
		# git diff --no-index compares two ordinary files outside a repo;
		# --exit-code makes it fail (and fill diffoutput) on any difference.
		if "${MPIEXEC}" -n $2 "${HPGMG_BINDIR}/"$3 > actual.out 2>&4 &&
			diffoutput=$(git diff --exit-code --no-index reference.out actual.out); then
			test_ok_ "$1"
		else
			test_failure_ "$1 $2 $3" "${diffoutput}"

		fi
	fi
	echo >&3 ""
}

# Public: Run parallel executable and check for failure with error message
#
# When the test passed, an "ok" message is printed and the number of successful
# tests is incremented. When it failed, a "not ok" message is printed and the
# number of failed tests is incremented.
#
# With --immediate, exit test immediately upon the first failed test.
#
# Usually takes four arguments:
# $1 - Test description
# $2 - Number of processes
# $3 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $4 - Expected string in error message (stderr)
#
# With five arguments, the first will be taken to be a prerequisite:
# $1 - Comma-separated list of test prerequisites. The test will be skipped if
#      not all of the given prerequisites are set. To negate a prerequisite,
#      put a "!" in front of it.
# $2 - Test description
# $3 - Number of processes
# $4 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $5 - Expected string in error message
#
# Returns nothing.
test_expect_error() {
	# With 5 args the first is the prerequisite list; shift it away so the
	# remaining handling is identical for both call forms.
	test "$#" = 5 && { test_prereq=$1; shift; } || test_prereq=
	# Fixed copy-paste: this message previously named test_expect_stdout.
	test "$#" = 4 || error "bug in test script: $# not 4 or 5 parameters to test_expect_error"

	export test_prereq
	if ! test_skip_ "$@"; then
		say >&3 "checking known breakage: $2 $3"
		# Drop the conventional leading newline of the expected-message arg.
		expected_stderr=$(sed '1d' <<<"$4")
		# Don't check exit code because process managers do not always propagate correctly
		"${MPIEXEC}" -n $2 "${HPGMG_BINDIR}/"$3 > /dev/null 2> actual.err
		# grep -F: fixed-string search, same behavior as the deprecated fgrep.
		if grep -F -q "${expected_stderr}" actual.err; then
			test_ok_ "$1"
		else
			test_failure_ "$1 $2 $3" "Expecting: ${expected_stderr}$(echo && cat actual.err)"
		fi
	fi
	echo >&3 ""
}

# Locate the mpiexec configured into the PETSc build: petscvariables contains
# a line of the form "MPIEXEC = <path>", so awk field 3 is the program path.
MPIEXEC=$(awk '/MPIEXEC/{print $3}' "${PETSC_DIR}/${PETSC_ARCH}/conf/petscvariables")
test_description='Test FE creation and scatters' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE GlobalToLocal fedegree=1' 4 'hpgmg-fe test-fespace -p 2,2,1 -M 4,4,2' ' 8 | Vec Object: 1 MPI processes 9 | type: seq 10 | 0 11 | 1 12 | 2 13 | 3 14 | 4 15 | 5 16 | 12 17 | 13 18 | 14 19 | 6 20 | 7 21 | 8 22 | 9 23 | 10 24 | 11 25 | 21 26 | 22 27 | 23 28 | 30 29 | 31 30 | 32 31 | 33 32 | 34 33 | 35 34 | 48 35 | 49 36 | 50 37 | ' 38 | 39 | test_expect_stdout 'FE Gradient/coordinates fedegree=1' 4 'hpgmg-fe test-fegrad -M 6,2,10 -p 2,1,2 -L 7,11,13' ' 40 | ' 41 | 42 | test_done 43 | -------------------------------------------------------------------------------- /finite-element/test/t030-feinject.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE coarsening and injection' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Inject fedegree=1' 4 'hpgmg-fe test-feinject -M 4,2,6 -p 2,1,2 -L 4,2,6' ' 8 | coarse u[ 0] = 0.0 at 0.0 0.0 0.0 9 | coarse u[ 1] = 2.0 at 0.0 0.0 2.0 10 | coarse u[ 2] = 4.0 at 0.0 0.0 4.0 11 | coarse u[ 3] = 6.0 at 0.0 0.0 6.0 12 | coarse u[ 4] = 2000.0 at 0.0 2.0 0.0 13 | coarse u[ 5] = 2002.0 at 0.0 2.0 2.0 14 | coarse u[ 6] = 2004.0 at 0.0 2.0 4.0 15 | coarse u[ 7] = 2006.0 at 0.0 2.0 6.0 16 | coarse u[ 8] = 2000000.0 at 2.0 0.0 0.0 17 | coarse u[ 9] = 2000002.0 at 2.0 0.0 2.0 18 | coarse u[10] = 2000004.0 at 2.0 0.0 4.0 19 | coarse u[11] = 2000006.0 at 2.0 0.0 6.0 20 | coarse u[12] = 2002000.0 at 2.0 2.0 0.0 21 | coarse u[13] = 2002002.0 at 2.0 2.0 2.0 22 | coarse u[14] = 2002004.0 at 2.0 2.0 4.0 23 | coarse u[15] = 2002006.0 at 2.0 2.0 6.0 24 | coarse u[16] = 4000000.0 at 4.0 0.0 0.0 25 | coarse u[17] = 4000002.0 at 4.0 0.0 2.0 26 | coarse u[18] = 4000004.0 at 4.0 0.0 4.0 27 | coarse u[19] = 4000006.0 at 4.0 0.0 6.0 28 | coarse u[20] = 4002000.0 at 4.0 2.0 0.0 29 | coarse u[21] = 4002002.0 at 4.0 2.0 2.0 30 | coarse u[22] = 4002004.0 at 4.0 2.0 
4.0 31 | coarse u[23] = 4002006.0 at 4.0 2.0 6.0 32 | ' 33 | 34 | test_done 35 | -------------------------------------------------------------------------------- /finite-element/test/t040-feinterp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE interpolation' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Interpolation fedegree=1 serial' 1 'hpgmg-fe test-feinterp -M 6,2,10 -L 6,2,10' ' 8 | |u - I Ihat u|_max = 0 9 | ' 10 | 11 | test_expect_stdout 'FE Interpolation fedegree=1 parallel' 4 'hpgmg-fe test-feinterp -M 6,2,10 -L 6,2,10 -p 2,1,2' ' 12 | |u - I Ihat u|_max = 0 13 | ' 14 | 15 | test_done 16 | -------------------------------------------------------------------------------- /finite-element/test/t045-ferestrict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE restriction' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Restriction fedegree=1 serial' 1 'hpgmg-fe test-ferestrict -M 4,4,6 -L 4,4,6' ' 8 | |u_c - I_h^H u_f|_max = 0 9 | ' 10 | 11 | test_expect_stdout 'FE Restriction fedegree=1 parallel' 4 'hpgmg-fe test-ferestrict -M 6,4,10 -L 6,4,10 -p 2,1,2' ' 12 | |u_c - I_h^H u_f|_max = 0 13 | ' 14 | 15 | test_expect_stdout 'FE Restriction fedegree=1 parallel ragged coarsening' 4 'hpgmg-fe test-ferestrict -M 4,4,12 -L 1,1,1 -p 1,1,4' ' 16 | |u_c - I_h^H u_f|_max = 0 17 | ' 18 | 19 | test_done 20 | -------------------------------------------------------------------------------- /finite-element/test/t100-poisson.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solver' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | # We expect second-order convergence on the residual for fedegree=1. 
8 | test_expect_stdout 'FE Poisson fedegree=1 serial' 1 'hpgmg-fe test-opapply -op_type poisson1 -M 4,8,12 -L 1,1,1 -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 4, 0: 8, 0: 12] of [ 4, 8, 12] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 12 | |A u - F|_max/|F|_max = 0.0978195 13 | ' 14 | 15 | test_expect_stdout 'FE Poisson fedegree=1 serial refined' 1 'hpgmg-fe test-opapply -op_type poisson1 -M 8,16,24 -L 1,1,1 -poisson_solution sine' ' 16 | [0] Level 3: [ 0: 8, 0: 16, 0: 24] of [ 8, 16, 24] on [ 1, 1, 1] 17 | [0] Level 2: [ 0: 4, 0: 8, 0: 12] of [ 4, 8, 12] on [ 1, 1, 1] 18 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 19 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 20 | |A u - F|_max/|F|_max = 0.0253888 21 | ' 22 | 23 | test_expect_stdout 'FE Poisson fedegree=1 parallel refined' 4 'hpgmg-fe test-opapply -op_type poisson1 -M 8,16,24 -L 1,1,1 -p 1,2,2 -cmax 48 -poisson_solution sine' ' 24 | [0] Level 3: [ 0: 8, 0: 8, 0: 12] of [ 8, 16, 24] on [ 1, 2, 2] 25 | [0] Level 2: [ 0: 4, 0: 4, 0: 6] of [ 4, 8, 12] on [ 1, 2, 2] 26 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 27 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 28 | [1] Level 3: [ 0: 8, 0: 8, 12: 24] of [ 8, 16, 24] on [ 1, 2, 2] 29 | [1] Level 2: [ 0: 4, 0: 4, 6: 12] of [ 4, 8, 12] on [ 1, 2, 2] 30 | [2] Level 3: [ 0: 8, 8: 16, 0: 12] of [ 8, 16, 24] on [ 1, 2, 2] 31 | [2] Level 2: [ 0: 4, 4: 8, 0: 6] of [ 4, 8, 12] on [ 1, 2, 2] 32 | [3] Level 3: [ 0: 8, 8: 16, 12: 24] of [ 8, 16, 24] on [ 1, 2, 2] 33 | [3] Level 2: [ 0: 4, 4: 8, 6: 12] of [ 4, 8, 12] on [ 1, 2, 2] 34 | |A u - F|_max/|F|_max = 0.0253888 35 | ' 36 | 37 | test_done 38 | -------------------------------------------------------------------------------- /finite-element/test/t110-poissondiag.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/sh 2 | 3 | test_description='Test Poisson diagonal' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Poisson diagonal fedegree=1 serial' 1 'hpgmg-fe test-opdiagonal -op_type poisson1 -M 8,12,16 -L 1,1,1' ' 8 | [0] Level 2: [ 0: 8, 0: 12, 0: 16] of [ 8, 12, 16] on [ 1, 1, 1] 9 | [0] Level 1: [ 0: 4, 0: 6, 0: 8] of [ 4, 6, 8] on [ 1, 1, 1] 10 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 11 | |D|_1 = 310.139 |D|_2 = 9.12568 |D|_max = 0.268519 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson diagonal fedegree=1 parallel' 4 'hpgmg-fe test-opdiagonal -op_type poisson1 -M 8,12,16 -L 1,1,1 -p 2,1,2' ' 15 | [0] Level 2: [ 0: 4, 0: 12, 0: 8] of [ 8, 12, 16] on [ 2, 1, 2] 16 | [0] Level 1: [ 0: 2, 0: 6, 0: 4] of [ 4, 6, 8] on [ 2, 1, 2] 17 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 18 | [1] Level 2: [ 0: 4, 0: 12, 8: 16] of [ 8, 12, 16] on [ 2, 1, 2] 19 | [1] Level 1: [ 0: 2, 0: 6, 4: 8] of [ 4, 6, 8] on [ 2, 1, 2] 20 | [2] Level 2: [ 4: 8, 0: 12, 0: 8] of [ 8, 12, 16] on [ 2, 1, 2] 21 | [2] Level 1: [ 2: 4, 0: 6, 0: 4] of [ 4, 6, 8] on [ 2, 1, 2] 22 | [3] Level 2: [ 4: 8, 0: 12, 8: 16] of [ 8, 12, 16] on [ 2, 1, 2] 23 | [3] Level 1: [ 2: 4, 0: 6, 4: 8] of [ 4, 6, 8] on [ 2, 1, 2] 24 | |D|_1 = 310.139 |D|_2 = 9.12568 |D|_max = 0.268519 25 | ' 26 | 27 | test_done 28 | -------------------------------------------------------------------------------- /finite-element/test/t120-poissonksp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using KSP' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson KSP solve fedegree=1 serial' 1 'hpgmg-fe test-kspsolve -op_type poisson1 -M 8,12,16 -L 1,1,1 -ksp_converged_reason -ksp_view -ksp_type chebyshev -ksp_chebyshev_eigenvalues 0.2,2 -pc_type jacobi -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 8, 0: 12, 0: 16] of [ 8, 12, 16] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 4, 0: 6, 0: 8] of [ 4, 6, 8] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 12 | Linear solve converged due to CONVERGED_RTOL iterations 20 13 | KSP Object: 1 MPI processes 14 | type: chebyshev 15 | Chebyshev: eigenvalue estimates: min = 0.2, max = 2 16 | maximum iterations=10000, initial guess is zero 17 | tolerances: relative=1e-05, absolute=1e-50, divergence=10000 18 | left preconditioning 19 | using PRECONDITIONED norm type for convergence test 20 | PC Object: 1 MPI processes 21 | type: jacobi 22 | linear system matrix = precond matrix: 23 | Mat Object: 1 MPI processes 24 | type: shell 25 | rows=1989, cols=1989 26 | |v-u|_2/|u|_2 = 0.0393899 27 | ' 28 | 29 | test_expect_stdout 'FE Poisson KSP solve fedegree=1 parallel' 4 'hpgmg-fe test-kspsolve -op_type poisson1 -M 8,12,16 -L 1,1,1 -ksp_converged_reason -ksp_view -ksp_type chebyshev -ksp_chebyshev_eigenvalues 0.2,2 -pc_type jacobi -p 1,2,2 -poisson_solution sine' ' 30 | [0] Level 2: [ 0: 8, 0: 6, 0: 8] of [ 8, 12, 16] on [ 1, 2, 2] 31 | [0] Level 1: [ 0: 4, 0: 3, 0: 4] of [ 4, 6, 8] on [ 1, 2, 2] 32 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 33 | [1] Level 2: [ 0: 8, 0: 6, 8: 16] of [ 8, 12, 16] on [ 1, 2, 2] 34 | [1] Level 1: [ 0: 4, 0: 3, 4: 8] of [ 4, 6, 8] on [ 1, 2, 2] 35 | [2] Level 2: [ 0: 8, 6: 12, 0: 8] of [ 8, 12, 16] on [ 1, 2, 2] 36 | [2] Level 1: [ 0: 4, 3: 6, 0: 4] of [ 4, 6, 8] on [ 1, 2, 2] 37 | [3] Level 2: [ 0: 8, 6: 12, 8: 16] of [ 8, 12, 16] on [ 1, 2, 2] 38 | [3] Level 1: [ 0: 4, 3: 6, 4: 8] of [ 4, 6, 8] on [ 1, 2, 2] 39 | 
Linear solve converged due to CONVERGED_RTOL iterations 20 40 | KSP Object: 4 MPI processes 41 | type: chebyshev 42 | Chebyshev: eigenvalue estimates: min = 0.2, max = 2 43 | maximum iterations=10000, initial guess is zero 44 | tolerances: relative=1e-05, absolute=1e-50, divergence=10000 45 | left preconditioning 46 | using PRECONDITIONED norm type for convergence test 47 | PC Object: 4 MPI processes 48 | type: jacobi 49 | linear system matrix = precond matrix: 50 | Mat Object: 4 MPI processes 51 | type: shell 52 | rows=1989, cols=1989 53 | |v-u|_2/|u|_2 = 0.0393899 54 | ' 55 | 56 | test_done 57 | -------------------------------------------------------------------------------- /finite-element/test/t200-mgv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using MG V-cycles' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson MG V-cycle solve fedegree=1 serial' 1 'hpgmg-fe mgv -op_type poisson1 -M 16,20,24 -L 1,1,1 -smooth 2,2 -mg_eig_target 2,0.2 -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 16, 0: 20, 0: 24] of [ 16, 20, 24] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 8, 0: 10, 0: 12] of [ 8, 10, 12] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 4, 0: 5, 0: 6] of [ 4, 5, 6] on [ 1, 1, 1] 12 | V(2,2) 0: |e|_2/|u|_2 1.50e-02 |r|_2/|f|_2 2.25e-01 13 | V(2,2) 1: |e|_2/|u|_2 1.06e-02 |r|_2/|f|_2 5.40e-02 14 | V(2,2) 2: |e|_2/|u|_2 1.27e-02 |r|_2/|f|_2 1.31e-02 15 | V(2,2) 3: |e|_2/|u|_2 1.34e-02 |r|_2/|f|_2 3.22e-03 16 | V(2,2) 4: |e|_2/|u|_2 1.35e-02 |r|_2/|f|_2 7.91e-04 17 | ' 18 | 19 | test_expect_stdout 'FE Poisson MG V-cycle solve fedegree=1 parallel' 4 'hpgmg-fe mgv -op_type poisson1 -M 16,20,24 -L 1,1,1 -p 1,2,2 -cmax 240 -smooth 2,2 -mg_eig_target 2,0.2 -poisson_solution sine' ' 20 | [0] Level 2: [ 0: 16, 0: 10, 0: 12] of [ 16, 20, 24] on [ 1, 2, 2] 21 | [0] Level 1: [ 0: 8, 0: 5, 0: 6] of [ 8, 10, 12] on [ 1, 2, 2] 22 
| [0] Level 0: [ 0: 4, 0: 5, 0: 6] of [ 4, 5, 6] on [ 1, 1, 1] 23 | [1] Level 2: [ 0: 16, 0: 10, 12: 24] of [ 16, 20, 24] on [ 1, 2, 2] 24 | [1] Level 1: [ 0: 8, 0: 5, 6: 12] of [ 8, 10, 12] on [ 1, 2, 2] 25 | [2] Level 2: [ 0: 16, 10: 20, 0: 12] of [ 16, 20, 24] on [ 1, 2, 2] 26 | [2] Level 1: [ 0: 8, 5: 10, 0: 6] of [ 8, 10, 12] on [ 1, 2, 2] 27 | [3] Level 2: [ 0: 16, 10: 20, 12: 24] of [ 16, 20, 24] on [ 1, 2, 2] 28 | [3] Level 1: [ 0: 8, 5: 10, 6: 12] of [ 8, 10, 12] on [ 1, 2, 2] 29 | V(2,2) 0: |e|_2/|u|_2 1.50e-02 |r|_2/|f|_2 2.25e-01 30 | V(2,2) 1: |e|_2/|u|_2 1.06e-02 |r|_2/|f|_2 5.40e-02 31 | V(2,2) 2: |e|_2/|u|_2 1.27e-02 |r|_2/|f|_2 1.31e-02 32 | V(2,2) 3: |e|_2/|u|_2 1.34e-02 |r|_2/|f|_2 3.22e-03 33 | V(2,2) 4: |e|_2/|u|_2 1.35e-02 |r|_2/|f|_2 7.91e-04 34 | ' 35 | 36 | test_done 37 | -------------------------------------------------------------------------------- /finite-element/test/t220-fmg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using FMG' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson FMG solve fedegree=1 serial' 1 'hpgmg-fe fmg -op_type poisson1 -M 8,16,24 -smooth 3,3 -mg_eig_target 2,0.2 -poisson_solution sine' ' 9 | F(3,3) 0: |e|_2/|u|_2 2.26e-02 |r|_2/|f|_2 3.37e-02 10 | V(3,3) 1: |e|_2/|u|_2 2.58e-02 |r|_2/|f|_2 2.05e-03 11 | V(3,3) 2: |e|_2/|u|_2 2.60e-02 |r|_2/|f|_2 1.25e-04 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson FMG solve fedegree=1 parallel' 4 'hpgmg-fe fmg -op_type poisson1 -M 8,16,24 -p 1,2,2 -smooth 3,3 -mg_eig_target 2,0.2 -poisson_solution sine' ' 15 | F(3,3) 0: |e|_2/|u|_2 2.26e-02 |r|_2/|f|_2 3.37e-02 16 | V(3,3) 1: |e|_2/|u|_2 2.58e-02 |r|_2/|f|_2 2.05e-03 17 | V(3,3) 2: |e|_2/|u|_2 2.60e-02 |r|_2/|f|_2 1.25e-04 18 | ' 19 | 20 | test_done 21 | -------------------------------------------------------------------------------- /finite-element/test/t230-fmg-poisson2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Q2 Poisson solve using FMG' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at fourth order (superconvergent at Lagrange nodes) 8 | test_expect_stdout 'FE Poisson FMG solve fedegree=2 serial' 1 'hpgmg-fe fmg -op_type poisson2 -M 4,4,6 -smooth 4,3' ' 9 | F(4,3) 0: |e|_2/|u|_2 9.08e-03 |r|_2/|f|_2 3.35e-04 10 | V(4,3) 1: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 8.27e-07 11 | V(4,3) 2: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 5.54e-09 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson FMG solve fedegree=2 parallel' 4 'hpgmg-fe fmg -op_type poisson2 -M 4,4,6 -smooth 4,3 -p 1,2,2' ' 15 | F(4,3) 0: |e|_2/|u|_2 9.08e-03 |r|_2/|f|_2 3.35e-04 16 | V(4,3) 1: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 8.27e-07 17 | V(4,3) 2: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 5.54e-09 18 | ' 19 | 20 | test_done 21 | -------------------------------------------------------------------------------- /finite-element/test/t60-sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test grid creation' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'list of samples' 1 'hpgmg-fe test-sampler -local 100,1e9 -maxsamples 10 -nranks 192' ' 8 | Processors: [ 4 6 8] = 192 9 | Filtered Grid: L11 [4096 6144 6144] = 154618822656 10 | Filtered Grid: L12 [4096 4096 4096] = 68719476736 11 | Filtered Grid: L10 [2048 2048 3072] = 12884901888 12 | Filtered Grid: L 9 [1024 1024 1536] = 1610612736 13 | Filtered Grid: L 8 [ 512 512 768] = 201326592 14 | Filtered Grid: L 7 [ 256 256 384] = 25165824 15 | Filtered Grid: L 6 [ 128 128 192] = 3145728 16 | Filtered Grid: L 5 [ 64 64 96] = 393216 17 | Filtered Grid: L 4 [ 32 32 48] = 49152 18 | Filtered Grid: L 3 [ 24 24 32] = 18432 19 | ' 20 | 21 | test_done 22 | -------------------------------------------------------------------------------- /finite-volume/README: -------------------------------------------------------------------------------- 1 | *** Copyright Notice *** 2 | 3 | Copyright (c) 2014-2015, NVIDIA CORPORATION. 
All rights reserved. 4 | 5 | The U.S. Department of Energy funded the development of this software 6 | under subcontract B609478 with Lawrence Livermore National Security (LLNS). 7 | 8 | HPGMG, Copyright (c) 2014, The Regents of the University of 9 | California, through Lawrence Berkeley National Laboratory (subject to 10 | receipt of any required approvals from the U.S. Dept. of Energy). All 11 | rights reserved. 12 | 13 | If you have questions about your rights to use or distribute this 14 | software, please contact Berkeley Lab's Technology Transfer Department 15 | at TTD@lbl.gov. 16 | 17 | NOTICE. This software is owned by the U.S. Department of Energy. As 18 | such, the U.S. Government has been granted for itself and others 19 | acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide 20 | license in the Software to reproduce, prepare derivative works, and 21 | perform publicly and display publicly. Beginning five (5) years after 22 | the date permission to assert copyright is obtained from the U.S. 23 | Department of Energy, and subject to any subsequent five (5) year 24 | renewals, the U.S. Government is granted for itself and others acting 25 | on its behalf a paid-up, nonexclusive, irrevocable, worldwide license 26 | in the Software to reproduce, prepare derivative works, distribute 27 | copies to the public, perform publicly and display publicly, and to 28 | permit others to do so. 29 | **************************** 30 | 31 | This directory contains the current HPGMG finite-volume benchmark. 32 | 33 | Please see ./source/README for details on how to compiler, run, 34 | optimize, and examine the output of the hpgmg finite-volume benchmark. 
35 | 36 | Example job scripts are in the ./example_jobs directory 37 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.biou.00008: -------------------------------------------------------------------------------- 1 | #PBS -N HPGMG-FV 2 | #PBS -q parallel 3 | #PBS -l nodes=2:ppn=32,walltime=00:30:00 4 | #PBS -m abe 5 | #PBS -V 6 | 7 | cd $PBS_O_WORKDIR 8 | 9 | module load openmpi/1.8.1-gcc 10 | #module load openmpi/1.6.5-gcc 11 | 12 | 13 | # Hybrid MPI + OpenMP 14 | export OMP_NUM_THREADS=8 15 | mpiexec -report-bindings -np 8 --map-by node -bind-to numa ./run.power7 7 1 16 | #mpiexec -report-bindings -np 1 -npernode 1 -cpus-per-proc 8 -bind-to-core ./run.power7 7 1 17 | #mpiexec -report-bindings -np 8 -npernode 4 -cpus-per-proc 8 -bind-to-core ./run.power7 7 1 18 | 19 | # flat MPI 20 | export OMP_NUM_THREADS=1 21 | mpiexec -report-bindings -np 64 --map-by node ./run.power7 6 1 22 | #mpiexec -report-bindings -np 8 -npernode 8 -cpus-per-proc 1 -bind-to-core ./run.power7 6 1 23 | #mpiexec -report-bindings -np 64 -npernode 32 -cpus-per-proc 1 -bind-to-core ./run.power7 6 1 24 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00064: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q debug 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00064 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=32:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 
1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00128: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00128 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=64:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 128 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 4 18 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 8 19 | mpirun -np 32 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 2 20 | mpirun -np 16 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 4 21 | 22 | mpirun -np 125 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 26 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 27 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00512: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00512 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=256:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 
10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | mpirun -np 125 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 22 | mpirun -np 216 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 343 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 512 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.01728: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.01728 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=864:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | mpirun -np 125 -report-bindings -npernode 2 
-npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 22 | mpirun -np 216 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 343 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 512 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | mpirun -np 729 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 26 | mpirun -np 1000 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 27 | mpirun -np 1331 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 28 | mpirun -np 1728 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 29 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.00064: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.00064 4 | #PBS -q debug 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=768 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | #export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | #export OMP_NUM_THREADS=12 14 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 9 1 15 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 8 1 16 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 6 1 18 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 5 1 19 | 20 | export OMP_NUM_THREADS=12 21 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 23 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 24 | 25 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 26 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 27 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 28 | 29 | #export OMP_NUM_THREADS=1 
30 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 31 | #aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 32 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 33 | # 34 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | #aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 36 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 37 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.00512: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.00512 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=6144 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 20 | aprun -n 384 -N 2 -S 1 -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | export OMP_NUM_THREADS=12 23 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 64 24 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 64 25 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 64 26 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 64 27 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 64 28 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 64 29 | aprun -n 384 -N 2 -S 1 -cc numa_node ./run.edison 7 64 30 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 64 31 | 32 | 33 | #export OMP_NUM_THREADS=1 34 | #aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 36 | #aprun 
-n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 37 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 38 | #aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 39 | #aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 40 | #aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 41 | #aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 42 | 43 | 44 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.01024: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.01024 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=12288 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | #export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 20 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 1 23 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 24 | export OMP_NUM_THREADS=1 25 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 26 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 27 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 28 | aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 29 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 30 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 31 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 32 | aprun -n 4096 -N 16 -S 8 -cc numa_node 
./run.edison.flat 6 1 33 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 34 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | 36 | 37 | export OMP_NUM_THREADS=12 38 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 8 39 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 8 40 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 8 41 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 8 42 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 8 43 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 8 44 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 8 45 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 8 46 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 8 47 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 8 48 | export OMP_NUM_THREADS=1 49 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 8 50 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 51 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 52 | aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 53 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 54 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 55 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 56 | aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 57 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 58 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 59 | 60 | 61 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.08000: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.08000 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:59:00 7 | #PBS -l mppwidth=98304 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 64000 -N 16 -S 8 -cc 
numa_node ./run.edison.flat 6 1 16 | aprun -n 46656 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 17 | aprun -n 32768 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 18 | aprun -n 27000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 19 | aprun -n 21952 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 20 | aprun -n 13824 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 21 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 22 | 23 | 24 | export OMP_NUM_THREADS=12 25 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 26 | aprun -n 6859 -N 2 -S 1 -cc numa_node ./run.edison 7 1 27 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 7 1 28 | aprun -n 4913 -N 2 -S 1 -cc numa_node ./run.edison 7 1 29 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 7 1 30 | aprun -n 3375 -N 2 -S 1 -cc numa_node ./run.edison 7 1 31 | aprun -n 2744 -N 2 -S 1 -cc numa_node ./run.edison 7 1 32 | aprun -n 2197 -N 2 -S 1 -cc numa_node ./run.edison 7 1 33 | aprun -n 1728 -N 2 -S 1 -cc numa_node ./run.edison 7 1 34 | aprun -n 1331 -N 2 -S 1 -cc numa_node ./run.edison 7 1 35 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 36 | 37 | 38 | export OMP_NUM_THREADS=1 39 | aprun -n 64000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 40 | aprun -n 46656 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 41 | aprun -n 32768 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 42 | aprun -n 27000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 43 | aprun -n 21952 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 44 | aprun -n 13824 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 45 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 46 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 47 | aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 48 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 49 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 50 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 51 | aprun -n 512 -N 16 -S 
8 -cc numa_node ./run.edison.flat 6 1 52 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 53 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 54 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 55 | 56 | 57 | export OMP_NUM_THREADS=12 58 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 59 | aprun -n 6859 -N 2 -S 1 -cc numa_node ./run.edison 7 1 60 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 7 1 61 | aprun -n 4913 -N 2 -S 1 -cc numa_node ./run.edison 7 1 62 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 7 1 63 | aprun -n 3375 -N 2 -S 1 -cc numa_node ./run.edison 7 1 64 | aprun -n 2744 -N 2 -S 1 -cc numa_node ./run.edison 7 1 65 | aprun -n 2197 -N 2 -S 1 -cc numa_node ./run.edison 7 1 66 | aprun -n 1728 -N 2 -S 1 -cc numa_node ./run.edison 7 1 67 | aprun -n 1331 -N 2 -S 1 -cc numa_node ./run.edison 7 1 68 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 69 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 1 70 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 71 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 1 72 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 73 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 74 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 75 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 76 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 77 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 78 | 79 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.10648: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.10648 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:59:00 7 | #PBS -l mppwidth=129024 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | 12 | export OMP_NUM_THREADS=12 13 | aprun -n 9216 -N 2 -S 1 -cc numa_node ./run.edison 7 12 14 | aprun -n 10648 -N 2 -S 1 -cc 
numa_node ./run.edison 7 8 15 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 11 16 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 17 17 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 25 18 | aprun -n 10368 -N 2 -S 1 -cc numa_node ./run.edison 7 36 19 | aprun -n 10240 -N 2 -S 1 -cc numa_node ./run.edison 7 50 20 | aprun -n 10648 -N 2 -S 1 -cc numa_node ./run.edison 7 64 21 | 22 | aprun -n 10648 -N 2 -S 1 -cc numa_node ./run.edison 8 8 23 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 8 8 24 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 8 8 25 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 8 8 26 | 27 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.4096.strong: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.4096.strong2 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:15:00 7 | #PBS -l mppwidth=49152 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 1 -N 1 -S 1 -ss -cc numa_node ./run.edison.flat 7 64 16 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 7 8 17 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 18 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 19 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 20 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.pstate: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.01024.pstate 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=12288 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export 
UGNI_CDM_MDD_DEDICATED=2 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 20 | aprun -n 343 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 22 | aprun -n 729 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 23 | aprun -n 1000 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 24 | export OMP_NUM_THREADS=12 25 | aprun -n 1 -N 1 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 26 | aprun -n 8 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 27 | aprun -n 27 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 28 | aprun -n 64 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 29 | aprun -n 125 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 30 | aprun -n 216 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 31 | aprun -n 343 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 32 | aprun -n 512 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 33 | aprun -n 729 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 34 | aprun -n 1000 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 35 | 36 | export OMP_NUM_THREADS=1 37 | aprun -n 8 -N 8 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 38 | aprun -n 64 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 39 | aprun -n 216 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 40 | aprun -n 512 -N 
16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 41 | aprun -n 1000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 42 | aprun -n 1728 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 43 | aprun -n 2744 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 44 | aprun -n 4096 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 45 | aprun -n 5832 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 46 | aprun -n 8000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 47 | export OMP_NUM_THREADS=1 48 | aprun -n 8 -N 8 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 49 | aprun -n 64 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 50 | aprun -n 216 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 51 | aprun -n 512 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 52 | aprun -n 1000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 53 | aprun -n 1728 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 54 | aprun -n 2744 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 55 | aprun -n 4096 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 56 | aprun -n 5832 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 57 | aprun -n 8000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 58 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.strong: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.strong 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=49152 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 16 | aprun -n 16 -N 16 -S 8 -ss -cc 
numa_node ./run.edison.flat 7 4 17 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 18 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 19 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 20 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 21 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 22 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 23 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 24 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 25 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 26 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 27 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 28 | 29 | export OMP_NUM_THREADS=1 30 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 31 | aprun -n 16 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 4 32 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 33 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 34 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 35 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 36 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 37 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 38 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 39 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 40 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 41 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 42 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 43 | 44 | export OMP_NUM_THREADS=1 45 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 46 | aprun -n 16 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 4 47 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 48 | aprun -n 64 -N 16 -S 8 -ss 
-cc numa_node ./run.edison.flat 7 1 49 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 50 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 51 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 52 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 53 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 54 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 55 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 56 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 57 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 58 | 59 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.01000: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.01000 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=6144 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 27 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 23 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 24 | export OMP_NUM_THREADS=6 25 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 26 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 27 -d 6 -N 4 -S 
1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 32 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 33 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 34 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 35 | export OMP_NUM_THREADS=6 36 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 37 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 38 | aprun -n 27 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 39 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 40 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 41 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 42 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 43 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 44 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 45 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 46 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.04096: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.04096 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=24576 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 1331 -d 6 -N 4 -S 
1 -ss -cc numa_node ./run.hopper 7 1 19 | export OMP_NUM_THREADS=6 20 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 23 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 24 | aprun -n 1331 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 25 | export OMP_NUM_THREADS=6 26 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 1331 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.09261: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.09261 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=55584 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 5832 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.13824: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.13824 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=82944 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 
13 | export OMP_NUM_THREADS=6 14 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 5832 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.21952: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.21952 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=131712 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 21296 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 4 15 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 8 16 | aprun -n 20736 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 18 17 | aprun -n 18432 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 6 18 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 12 19 | #aprun -n 23328 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 2 20 | #aprun -n 25088 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 7 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.special: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.special4 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:19:00 7 | #PBS -l mppwidth=131712 8 | #PBS -W x=FLAGS:ADVRES:samw.755 9 | 10 | set -x 11 | cd $PBS_O_WORKDIR 12 | module swap PrgEnv-pgi PrgEnv-intel 13 | 14 | export OMP_NUM_THREADS=6 15 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc 
numa_node ./run.hopper 7 1 16 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | 23 | export OMP_NUM_THREADS=6 24 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 25 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 26 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | 32 | export OMP_NUM_THREADS=6 33 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 34 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 35 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 36 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 37 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 38 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 39 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 40 | 41 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.titan: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -o results.titan 3 | #PBS -q debug 4 | #PBS -j oe 5 | #PBS -l walltime=0:10:00 6 | #PBS -l nodes=1 7 | 8 | source $MODULESHOME/init/bash 9 | module load cudatoolkit 10 | export PMI_NO_FORK=1 11 | 12 | set -x 13 | cd $PBS_O_WORKDIR 14 | export OMP_NUM_THREADS=4 15 | 16 | # 
normal run 17 | aprun -n 1 -N 1 -d 4 ./build/bin/hpgmg-fv 7 8 18 | 19 | # generate nvprof timeline 20 | #aprun -b -n 1 -N 1 -d 4 nvprof --unified-memory-profiling per-process-device -o timeline.%p.nvp ./build/bin/hpgmg-fv 7 8 21 | -------------------------------------------------------------------------------- /finite-volume/local.mk: -------------------------------------------------------------------------------- 1 | include $(call incsubdirs,source) 2 | -------------------------------------------------------------------------------- /finite-volume/source/TODO: -------------------------------------------------------------------------------- 1 | - cubical problem size -> rectahedral problem size ... init problem, restriction rules, etc... 2 | - rectahedral problem size -> arbitrary problem shape... 3 | - more efficient ghost zone exchange (box intersection algebra) when communicating edges and corners 4 | -------------------------------------------------------------------------------- /finite-volume/source/compile: -------------------------------------------------------------------------------- 1 | 2 | 3 | #======================================================================================================================= 4 | # mira 5 | #======================================================================================================================= 6 | soft add +mpiwrapper-xl 7 | qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active ./run.bgq 7 1 8 | qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8 9 | 10 | 11 | mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c 
mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -DBLOCKCOPY_TILE_K=1 -DBLOCKCOPY_TILE_J=32 -o run.bgq.1x32 -DUSE_HPM -L/soft/perftools/hpctw/lib -L/soft/perftools/bgpm/lib -lmpihpm_smp -lbgpm 12 | 13 | 14 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 15 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:ofa -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 16 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:dapl -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 17 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/boundary_fd.h: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 
15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | // Divide thread block into batches of threads (e.g. quads), each batch operates on one HPGMG tile/block 30 | #define BLOCK_SIZE 128 // number of threads per thread block 31 | #define NUM_BATCH 8 // number of batches per thread block 32 | 33 | __constant__ int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; 34 | __constant__ int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; 35 | __constant__ int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; 36 | 37 | template 38 | __global__ void apply_BCs_v1_kernel(level_type level, int x_id, int shape){ 39 | // thread exit conditions 40 | int batchid = blockIdx.x*num_batch + threadIdx.x/batch_size; 41 | if(batchid >= level.boundary_condition.num_blocks[shape]) return; 42 | 43 | // one CUDA thread block operates on 'batch_size' HPGMG tiles/blocks 44 | blockCopy_type block = level.boundary_condition.blocks[shape][batchid]; 45 | 46 | double scale = 1.0; 47 | if( faces[block.subtype])scale=-1.0; 48 | if( edges[block.subtype])scale= 1.0; 49 | if(corners[block.subtype])scale=-1.0; 50 | 51 | int i,j,k; 52 | const int box = 
block.read.box; 53 | const int dim_i = block.dim.i; 54 | const int dim_j = block.dim.j; 55 | const int dim_k = block.dim.k; 56 | const int ilo = block.read.i; 57 | const int jlo = block.read.j; 58 | const int klo = block.read.k; 59 | const int normal = 26-block.subtype; // invert the normal vector 60 | 61 | // hard code for box to box BC's 62 | const int jStride = level.my_boxes[box].jStride; 63 | const int kStride = level.my_boxes[box].kStride; 64 | double * __restrict__ x = level.my_boxes[box].vectors[x_id] + level.my_boxes[box].ghosts*(1+jStride+kStride); 65 | 66 | // convert normal vector into pointer offsets... 67 | const int di = (((normal % 3) )-1); 68 | const int dj = (((normal % 9)/3)-1); 69 | const int dk = (((normal / 9) )-1); 70 | const int stride = di + dj*jStride + dk*kStride; 71 | 72 | for(int gid=threadIdx.x%batch_size; gid<<>>(level,x_id,shape); 83 | 84 | extern "C" 85 | void cuda_apply_BCs_v1(level_type level, int x_id, int shape) 86 | { 87 | int block = BLOCK_SIZE; 88 | int grid = (level.boundary_condition.num_blocks[shape]+NUM_BATCH-1)/NUM_BATCH; 89 | if (grid <= 0) return; 90 | 91 | int log_dim = (int)log2((double)level.dim.i); 92 | KERNEL_LEVEL(log_dim, shape); 93 | CUDA_ERROR 94 | } 95 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 
9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | */ 28 | 29 | // available device functions 30 | void cuda_smooth(level_type level, int x_id, int rhs_id, double a, double b, int s, double *c, double *d); 31 | void cuda_residual(level_type d_level, int res_id, int x_id, int rhs_id, double a, double b); 32 | void cuda_rebuild(level_type level, int x_id, int Aii_id, int sumAbsAij_id, double a, double b); 33 | 34 | void cuda_restriction(level_type d_level_c, int id_c, level_type d_level_f, int id_f, communicator_type restriction, int restrictionType, int block_type); 35 | 36 | void cuda_interpolation_p0(level_type d_level_f, int id_f, double prescale_f, level_type d_level_c, int id_c, communicator_type interpolation, int block_type); 37 | void cuda_interpolation_p1(level_type d_level_f, int id_f, double prescale_f, level_type d_level_c, int id_c, communicator_type interpolation, int block_type); 38 | void cuda_interpolation_v2(level_type level_f, int id_f, double prescale_f, level_type level_c, int id_c, communicator_type interpolation, int block_type); 39 | void cuda_interpolation_v4(level_type level_f, int id_f, double prescale_f, level_type level_c, int id_c, communicator_type interpolation, int block_type); 40 | 41 | void cuda_apply_BCs_v1(level_type level, int x_id, int shape); 42 | void cuda_apply_BCs_v2(level_type level, int x_id, int shape); 43 | void cuda_apply_BCs_v4(level_type level, int x_id, int shape); 44 | void cuda_extrapolate_betas(level_type level, int shape); 45 | 46 | void cuda_zero_vector(level_type d_level, int id); 47 | void cuda_scale_vector(level_type d_level, int id_c, double scale_a, int id_a); 48 | void cuda_shift_vector(level_type d_level, int id_c, double shift_a, int id_a); 49 | void cuda_mul_vectors(level_type d_level, int id_c, double scale, int id_a, int id_b); 50 | void cuda_add_vectors(level_type d_level, int id_c, double scale_a, int id_a, double scale_b, int id_b); 51 | double cuda_sum(level_type d_level, int id); 52 | double cuda_max_abs(level_type d_level, int id); 53 | 
void cuda_color_vector(level_type d_level, int id_a, int colors_in_each_dim, int icolor, int jcolor, int kcolor); 54 | 55 | void cuda_copy_block(level_type d_level, int id, communicator_type exchange_ghosts, int block_type, cudaStream_t stream); 56 | void cuda_increment_block(level_type d_level, int id, double prescale, communicator_type exchange_ghosts, int block_type); 57 | void cuda_fused_copy_block(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 58 | void cuda_fused_copy_block_send(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 59 | void cuda_fused_copy_block_receive(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 60 | 61 | #include "extra.h" 62 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include 
"iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An a "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. 
Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief An a "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 
104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | 
#else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 | } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/host/spinlock.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if defined(_WIN32) || defined(_WIN64) 38 | #include 39 | #include 40 | #undef small // Windows is terrible for polluting macro namespace 41 | 42 | /** 43 | * Compiler read/write barrier 44 | */ 45 | #pragma intrinsic(_ReadWriteBarrier) 46 | 47 | #endif 48 | 49 | #include "../util_namespace.cuh" 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | #if defined(_MSC_VER) 59 | 60 | // Microsoft VC++ 61 | typedef long Spinlock; 62 | 63 | #else 64 | 65 | // GNU g++ 66 | typedef int Spinlock; 67 | 68 | /** 69 | * Compiler read/write barrier 70 | */ 71 | __forceinline__ void _ReadWriteBarrier() 72 | { 73 | __sync_synchronize(); 74 | } 75 | 76 | /** 77 | * Atomic exchange 78 | */ 79 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 80 | { 81 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 82 | _ReadWriteBarrier(); 83 | return __sync_lock_test_and_set(Target, Value); 84 | } 85 | 86 | /** 87 | * Pause instruction to prevent excess processor bus usage 88 | */ 89 | __forceinline__ void YieldProcessor() 90 | { 91 | #ifndef __arm__ 92 | asm volatile("pause\n": : :"memory"); 93 | #endif // __arm__ 94 | } 95 | 96 | #endif // defined(_MSC_VER) 97 | 98 | /** 99 | * Return when the specified spinlock has been acquired 100 | */ 101 | __forceinline__ void Lock(volatile Spinlock *lock) 102 | { 103 | while (1) 104 | { 105 | if (!_InterlockedExchange(lock, 1)) return; 106 | while (*lock) YieldProcessor(); 107 | } 108 | } 109 | 110 | 111 | /** 112 | * Release the specified spinlock 113 | */ 114 | __forceinline__ void Unlock(volatile Spinlock *lock) 115 | { 116 | _ReadWriteBarrier(); 117 
| *lock = 0; 118 | } 119 | 120 | 121 | } // CUB namespace 122 | CUB_NS_POSTFIX // Optional outer namespace(s) 123 | 124 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Common C/C++ macro utilities
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilModule
 * @{
 */

#ifndef CUB_ALIGN
    #if defined(_WIN32) || defined(_WIN64)
        /// Align struct
        /// Bug fix: the MSVC branch previously hard-coded __declspec(align(32)),
        /// silently ignoring the requested \p bytes. Honor the argument so both
        /// branches agree.
        #define CUB_ALIGN(bytes) __declspec(align(bytes))
    #else
        /// Align struct
        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
    #endif
#endif

#ifndef CUB_MAX
    /// Select maximum(a, b)
    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
#endif

#ifndef CUB_MIN
    /// Select minimum(a, b)
    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
#endif

#ifndef CUB_QUOTIENT_FLOOR
    /// Quotient of x/y rounded down to nearest integer
    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
#endif

#ifndef CUB_QUOTIENT_CEILING
    /// Quotient of x/y rounded up to nearest integer
    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
#endif

#ifndef CUB_ROUND_UP_NEAREST
    /// x rounded up to the nearest multiple of y
    /// Bug fix: parenthesize the trailing (y); the bare "* y" mis-expands for
    /// expression arguments, e.g. CUB_ROUND_UP_NEAREST(7, 2 + 3).
    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
#endif

#ifndef CUB_ROUND_DOWN_NEAREST
    /// x rounded down to the nearest multiple of y
    /// Bug fix: parenthesize the trailing (y) for the same reason as above.
    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
#endif


#ifndef CUB_STATIC_ASSERT
    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
        #define CUB_CAT_(a, b) a ## b
        #define CUB_CAT(a, b) CUB_CAT_(a, b)
    #endif // DOXYGEN_SHOULD_SKIP_THIS

    /// Static assert: declares an array type whose size is negative (ill-formed)
    /// when \p cond is false; works pre-C++11.
    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
#endif

/** @} */       // end group UtilModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
--------------------------------------------------------------------------------
/finite-volume/source/cuda/cub/util_namespace.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /finite-volume/source/debug.c: -------------------------------------------------------------------------------- 1 | #include "stdlib.h" 2 | #include "stdio.h" 3 | #include "debug.h" 4 | 5 | int mpi_comm_rank = 0; 6 | 7 | int dbg_enabled() 8 | { 9 | static int dbg_is_enabled = -1; 10 | if (-1 == dbg_is_enabled) { 11 | const char *env = getenv("HPGMG_ENABLE_DEBUG"); 12 | if (env) { 13 | int en = atoi(env); 14 | dbg_is_enabled = !!en; 15 | printf("HPGMG_ENABLE_DEBUG=%s\n", env); 16 | } else 17 | dbg_is_enabled = 0; 18 | } 19 | return dbg_is_enabled; 20 | } 21 | -------------------------------------------------------------------------------- /finite-volume/source/debug.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | extern int mpi_comm_rank; 6 | 7 | #define STRDBG stderr 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | int dbg_enabled(); 13 | 14 | #define DBG(FMT, ARGS...) 
\ 15 | do { \ 16 | if (dbg_enabled()) { \ 17 | fprintf(STRDBG, "[%d] [%d] HPGMG %s(): " FMT, \ 18 | getpid(), mpi_comm_rank, __FUNCTION__ , ## ARGS); \ 19 | fflush(STRDBG); \ 20 | } \ 21 | } while(0) 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #ifdef PROFILE_NVTX_RANGES 28 | #include "nvToolsExt.h" 29 | 30 | #define COMM_COL 1 31 | #define SM_COL 2 32 | #define SML_COL 3 33 | #define OP_COL 4 34 | #define COMP_COL 5 35 | #define SOLVE_COL 6 36 | #define WARMUP_COL 7 37 | #define EXEC_COL 8 38 | 39 | #define SEND_COL 9 40 | #define WAIT_COL 10 41 | #define KERNEL_COL 11 42 | 43 | 44 | #define PUSH_RANGE(name,cid) \ 45 | do { \ 46 | const uint32_t colors[] = { \ 47 | 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff, 0xff000000, 0xff0000ff, 0x55ff3300, 0xff660000, 0x66330000 \ 48 | }; \ 49 | const int num_colors = sizeof(colors)/sizeof(colors[0]); \ 50 | int color_id = cid%num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } while(0) 60 | 61 | #define PUSH_RANGE_STR(cid, FMT, ARGS...) 
\ 62 | do { \ 63 | char str[128]; \ 64 | snprintf(str, sizeof(str), FMT, ## ARGS); \ 65 | PUSH_RANGE(str, cid); \ 66 | } while(0) 67 | 68 | 69 | #define POP_RANGE do { nvtxRangePop(); } while(0) 70 | 71 | #else 72 | #define PUSH_RANGE(name,cid) 73 | #define POP_RANGE 74 | #endif 75 | -------------------------------------------------------------------------------- /finite-volume/source/defines.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // Lu = a*alpha[]*u[] - b*divergence beta[]*gradient u[] 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | #ifndef DEFINES_H 9 | #define DEFINES_H 10 | //------------------------------------------------------------------------------------------------------------------------------ 11 | #define VECTOR_TEMP 0 // 12 | #define VECTOR_UTRUE 1 // exact solution used to generate f 13 | #define VECTOR_F_MINUS_AV 2 // cell centered residual (f-Av) 14 | //------------------------------------------------------------------------------------------------------------------------------ 15 | #define VECTOR_F 3 // original right-hand side (Au=f), cell centered 16 | #define VECTOR_U 4 // numerical solution 17 | #define VECTOR_ALPHA 5 // cell centered coefficient 18 | #define VECTOR_BETA_I 6 // face centered coefficient (n.b. element 0 is the left face of the ghost zone element) 19 | #define VECTOR_BETA_J 7 // face centered coefficient (n.b. element 0 is the back face of the ghost zone element) 20 | #define VECTOR_BETA_K 8 // face centered coefficient (n.b. 
element 0 is the bottom face of the ghost zone element) 21 | //------------------------------------------------------------------------------------------------------------------ 22 | #define VECTOR_DINV 9 // cell centered relaxation parameter (e.g. inverse of the diagonal) 23 | #define VECTOR_L1INV 10 // cell centered relaxation parameter (e.g. inverse of the L1 norm of each row) 24 | #define VECTOR_VALID 11 // cell centered array noting which cells are actually present 25 | //------------------------------------------------------------------------------------------------------------------ 26 | #define VECTORS_RESERVED 12 // total number of vectors and the starting location for any auxillary bottom solver vectors 27 | //------------------------------------------------------------------------------------------------------------------------------ 28 | #endif 29 | -------------------------------------------------------------------------------- /finite-volume/source/local.mk: -------------------------------------------------------------------------------- 1 | hpgmg-fv-y.c += $(call thisdir, \ 2 | debug.c \ 3 | timers.c \ 4 | level.c \ 5 | operators.fv4.c \ 6 | mg.c \ 7 | solvers.c \ 8 | hpgmg-fv.c \ 9 | ) 10 | 11 | #Useless in case of libmpcomm.so 12 | #hpgmg-fv-y.cc += $(call thisdir, comm.cc) 13 | 14 | hpgmg-fv-y.cu += $(call thisdir, \ 15 | cuda/operators.fv4.cu \ 16 | ) 17 | -------------------------------------------------------------------------------- /finite-volume/source/mg.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef MG_H 7 | #define MG_H 8 | 
//------------------------------------------------------------------------------------------------------------------------------ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | //------------------------------------------------------------------------------------------------------------------------------ 15 | #ifndef MG_AGGLOMERATION_START 16 | #define MG_AGGLOMERATION_START 8 // i.e. start the distributed v-cycle when boxes are smaller than 8^3 17 | #endif 18 | #ifndef MG_DEFAULT_BOTTOM_NORM 19 | #define MG_DEFAULT_BOTTOM_NORM 1e-3 20 | #endif 21 | //------------------------------------------------------------------------------------------------------------------------------ 22 | typedef struct { 23 | int num_ranks; // total number of MPI ranks for MPI_COMM_WORLD 24 | int my_rank; // my MPI rank for MPI_COMM_WORLD 25 | int num_levels; // depth of the v-cycle 26 | level_type ** levels; // array of pointers to levels 27 | 28 | struct { 29 | double MGBuild; // total time spent building the coefficients... 
30 | double MGSolve; // total time spent in MGSolve 31 | }timers; 32 | int MGSolves_performed; 33 | } mg_type; 34 | 35 | 36 | //------------------------------------------------------------------------------------------------------------------------------ 37 | void MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim); 38 | void MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); 39 | void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); 40 | void MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, double b, double dtol, double rtol); 41 | void MGDestroy(mg_type *all_grids); 42 | void MGPrintTiming(mg_type *all_grids, int fromLevel); 43 | void MGResetTimers(mg_type *all_grids); 44 | void richardson_error(mg_type *all_grids, int levelh, int u_id); 45 | //------------------------------------------------------------------------------------------------------------------------------ 46 | #endif 47 | -------------------------------------------------------------------------------- /finite-volume/source/operators.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef OPERATORS_H 7 | #define OPERATORS_H 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | #define RESTRICT_CELL 0 10 | #define RESTRICT_FACE_I 1 11 | #define RESTRICT_FACE_J 2 12 | #define RESTRICT_FACE_K 3 13 | 
//------------------------------------------------------------------------------------------------------------------------------ 14 | int stencil_get_radius(); 15 | int stencil_get_shape(); 16 | //------------------------------------------------------------------------------------------------------------------------------ 17 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b); 18 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b); 19 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b); 20 | void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b); 21 | void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim); 22 | //------------------------------------------------------------------------------------------------------------------------------ 23 | void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType); 24 | void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used inside a v-cycle 25 | void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used in the f-cycle to create a new initial guess for the next finner v-cycle 26 | //------------------------------------------------------------------------------------------------------------------------------ 27 | void exchange_boundary(level_type * level, int id_a, int shape); 28 | void force_comm_flush(); 29 | 30 | void apply_BCs_p1(level_type * level, int x_id, int shape); // piecewise (cell centered) linear 31 | void apply_BCs_p2(level_type * level, int x_id, int shape); // piecewise (cell centered) quadratic 32 | void apply_BCs_v1(level_type * level, int x_id, int shape); // volumetric linear 33 | void apply_BCs_v2(level_type * level, int x_id, int shape); // volumetric quadratic 34 | 
void apply_BCs_v4(level_type * level, int x_id, int shape); // volumetric quartic 35 | void extrapolate_betas(level_type * level); 36 | //------------------------------------------------------------------------------------------------------------------------------ 37 | double dot(level_type * level, int id_a, int id_b); 38 | double norm(level_type * level, int id_a); 39 | double mean(level_type * level, int id_a); 40 | double error(level_type * level, int id_a, int id_b); 41 | void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b); 42 | void scale_vector( level_type * level, int id_c, double scale_a, int id_a); 43 | void zero_vector( level_type * level, int id_a); 44 | void shift_vector( level_type * level, int id_c, int id_a, double shift_a); 45 | void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b); 46 | void invert_vector( level_type * level, int id_c, double scale_a, int id_a); 47 | void init_vector( level_type * level, int id_a, double scalar); 48 | //------------------------------------------------------------------------------------------------------------------------------ 49 | void color_vector(level_type * level, int id, int colors, int icolor, int jcolor, int kcolor); 50 | void random_vector(level_type * level, int id); 51 | //------------------------------------------------------------------------------------------------------------------------------ 52 | void initialize_problem(level_type * level, double hLevel, double a, double b); 53 | void initialize_valid_region(level_type * level); 54 | //------------------------------------------------------------------------------------------------------------------------------ 55 | #endif 56 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/aggregate.mpi/jacobi.c: -------------------------------------------------------------------------------- 1 | 
//------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include "../timer.h" 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 10 | if(NUM_SMOOTHS&1){ 11 | printf("error - NUM_SMOOTHS must be even...\n"); 12 | exit(0); 13 | } 14 | 15 | 16 | int box,s; 17 | int ghosts = level->box_ghosts; 18 | int starShaped = stencil_is_star_shaped(); 19 | int communicationAvoiding = ghosts > stencil_get_radius(); 20 | 21 | #ifdef USE_L1JACOBI 22 | double weight = 1.0; 23 | #else 24 | double weight = 2.0/3.0; 25 | #endif 26 | 27 | 28 | // if communication-avoiding, need updated RHS for stencils in ghost zones 29 | if(communicationAvoiding)exchange_boundary(level,rhs_id,0); 30 | 31 | for(s=0;snum_my_boxes;box++){ 41 | int i,j,k,ss; 42 | const int jStride = level->my_boxes[box].jStride; 43 | const int kStride = level->my_boxes[box].kStride; 44 | const int dim = level->my_boxes[box].dim; 45 | const double h2inv = 1.0/(level->h*level->h); 46 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 47 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 48 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 49 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 50 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 51 | const double * 
__restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 52 | #ifdef USE_L1JACOBI 53 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 54 | #else 55 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 56 | #endif 57 | int ghostsToOperateOn=ghosts-1; 58 | for(ss=s;ssmy_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 62 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);} 63 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); 64 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 65 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 66 | for(k=0-ghostsToOperateOn;kcycles.smooth += (uint64_t)(CycleTime()-_timeStart); 76 | } // s-loop 77 | } 78 | 79 | //------------------------------------------------------------------------------------------------------------------------------ 80 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/apply_op.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax 7 | // exchange the boundary of x in preparation for Ax 8 | exchange_boundary(level,x_id,stencil_get_shape()); 9 | apply_BCs(level,x_id,stencil_get_shape()); 10 | 11 | // now do Ax proper... 
12 | double _timeStart = getTime(); 13 | const int ghosts = level->box_ghosts; 14 | const int jStride = level->box_jStride; 15 | const int kStride = level->box_kStride; 16 | const int dim = level->box_dim; 17 | const double h2inv = 1.0/(level->h*level->h); 18 | int box; 19 | 20 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 21 | for(box=0;boxnum_my_boxes;box++){ 22 | int i,j,k; 23 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 24 | double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); 25 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 26 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); 30 | 31 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 32 | for(k=0;ktimers.apply_op += (double)(getTime()-_timeStart); 40 | } 41 | //------------------------------------------------------------------------------------------------------------------------------ 42 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/jacobi.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | 
//------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 9 | if(NUM_SMOOTHS&1){ 10 | fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); 11 | exit(0); 12 | } 13 | 14 | #ifdef USE_L1JACOBI 15 | double weight = 1.0; 16 | #else 17 | double weight = 2.0/3.0; 18 | #endif 19 | 20 | int box,s; 21 | for(s=0;sbox_ghosts; 29 | const int jStride = level->box_jStride; 30 | const int kStride = level->box_kStride; 31 | const int dim = level->box_dim; 32 | const double h2inv = 1.0/(level->h*level->h); 33 | 34 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 35 | for(box=0;boxnum_my_boxes;box++){ 36 | int i,j,k; 37 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 41 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 42 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 43 | #ifdef USE_L1JACOBI 44 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 45 | #else 46 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 47 | #endif 48 | const double * __restrict__ x_n; 49 | double * __restrict__ x_np1; 50 | if((s&1)==0){x_n = 
level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 51 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} 52 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); 53 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 54 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 55 | for(k=0;ktimers.smooth += (double)(getTime()-_timeStart); 64 | } // s-loop 65 | } 66 | 67 | //------------------------------------------------------------------------------------------------------------------------------ 68 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/residual.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // calculate res_id = rhs_id - A(x_id) 7 | 8 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ 9 | // exchange the boundary for x in prep for Ax... 10 | exchange_boundary(level,x_id,stencil_get_shape()); 11 | apply_BCs(level,x_id,stencil_get_shape()); 12 | 13 | // now do residual/restriction proper... 14 | double _timeStart = getTime(); 15 | const int ghosts = level->box_ghosts; 16 | const int jStride = level->box_jStride; 17 | const int kStride = level->box_kStride; 18 | const int dim = level->box_dim; 19 | const double h2inv = 1.0/(level->h*level->h); 20 | int box; 21 | 22 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 23 | for(box=0;boxnum_my_boxes;box++){ 24 | int i,j,k; 25 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 26 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 32 | double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); 33 | 34 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 35 | for(k=0;ktimers.residual += (double)(getTime()-_timeStart); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/symgs.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ 7 | int box,s; 8 | 9 | for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth 10 | exchange_boundary(level,phi_id,stencil_get_shape()); 11 | apply_BCs(level,phi_id,stencil_get_shape()); 12 | 13 | // now do ghosts communication-avoiding smooths on each box... 
14 | double _timeStart = getTime(); 15 | const int ghosts = level->box_ghosts; 16 | const int jStride = level->box_jStride; 17 | const int kStride = level->box_kStride; 18 | const int dim = level->box_dim; 19 | const double h2inv = 1.0/(level->h*level->h); 20 | 21 | #ifdef _OPENMP 22 | #pragma omp parallel for 23 | #endif 24 | for(box=0;boxnum_my_boxes;box++){ 25 | int i,j,k; 26 | double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 27 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 32 | const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 33 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 34 | 35 | 36 | if( (s&0x1)==0 ){ // forward sweep... 
hard to thread 37 | for(k=0;k=0;k--){ 46 | for(j=dim-1;j>=0;j--){ 47 | for(i=dim-1;i>=0;i--){ 48 | int ijk = i + j*jStride + k*kStride; 49 | double Ax = apply_op_ijk(phi); 50 | phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); 51 | }}} 52 | } 53 | 54 | } // boxes 55 | level->timers.smooth += (double)(getTime()-_timeStart); 56 | } // s-loop 57 | } 58 | 59 | 60 | //------------------------------------------------------------------------------------------------------------------------------ 61 | -------------------------------------------------------------------------------- /finite-volume/source/operators/apply_op.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // Applies the linear operator specified in the apply_op_ijk macro to vector x_id and stores the result in Ax_id 7 | // This requires exchanging a ghost zone and/or enforcing a boundary condition. 8 | // NOTE, Ax_id and x_id must be distinct 9 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ 10 | // exchange the boundary of x in preparation for Ax 11 | exchange_boundary(level,x_id,stencil_get_shape()); 12 | apply_BCs(level,x_id,stencil_get_shape()); 13 | 14 | // now do Ax proper... 15 | double _timeStart = getTime(); 16 | int block; 17 | 18 | if(level->use_cuda)cudaDeviceSynchronize(); // FIX... 
wait for any other GPU operations on this level to complete 19 | 20 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 21 | for(block=0;blocknum_my_blocks;block++){ 22 | const int box = level->my_blocks[block].read.box; 23 | const int ilo = level->my_blocks[block].read.i; 24 | const int jlo = level->my_blocks[block].read.j; 25 | const int klo = level->my_blocks[block].read.k; 26 | const int ihi = level->my_blocks[block].dim.i + ilo; 27 | const int jhi = level->my_blocks[block].dim.j + jlo; 28 | const int khi = level->my_blocks[block].dim.k + klo; 29 | int i,j,k; 30 | const int jStride = level->my_boxes[box].jStride; 31 | const int kStride = level->my_boxes[box].kStride; 32 | const int ghosts = level->my_boxes[box].ghosts; 33 | const double h2inv = 1.0/(level->h*level->h); 34 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 35 | double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); 36 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 37 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); 41 | 42 | for(k=klo;ktimers.apply_op += (double)(getTime()-_timeStart); 50 | } 51 | //------------------------------------------------------------------------------------------------------------------------------ 52 | -------------------------------------------------------------------------------- /finite-volume/source/operators/jacobi.c: 
-------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 9 | if(NUM_SMOOTHS&1){ 10 | fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); 11 | exit(0); 12 | } 13 | 14 | #ifdef USE_L1JACOBI 15 | double weight = 1.0; 16 | #else 17 | double weight = 2.0/3.0; 18 | #endif 19 | 20 | int block,s; 21 | for(s=0;suse_cuda) { 29 | cuda_smooth(*level, x_id, rhs_id, a, b, s, NULL, NULL); 30 | } 31 | else { 32 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 33 | for(block=0;blocknum_my_blocks;block++){ 34 | const int box = level->my_blocks[block].read.box; 35 | const int ilo = level->my_blocks[block].read.i; 36 | const int jlo = level->my_blocks[block].read.j; 37 | const int klo = level->my_blocks[block].read.k; 38 | const int ihi = level->my_blocks[block].dim.i + ilo; 39 | const int jhi = level->my_blocks[block].dim.j + jlo; 40 | const int khi = level->my_blocks[block].dim.k + klo; 41 | int i,j,k; 42 | const int ghosts = level->box_ghosts; 43 | const int jStride = level->my_boxes[box].jStride; 44 | const int kStride = level->my_boxes[box].kStride; 45 | const double h2inv = 1.0/(level->h*level->h); 46 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 47 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 48 | const double * __restrict__ beta_i = 
level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 49 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 50 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 51 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 52 | #ifdef USE_L1JACOBI 53 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 54 | #else 55 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 56 | #endif 57 | const double * __restrict__ x_n; 58 | double * __restrict__ x_np1; 59 | if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 60 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} 61 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); 62 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 63 | 64 | for(k=klo;ktimers.smooth += (double)(getTime()-_timeStart); 75 | } // s-loop 76 | } 77 | 78 | //------------------------------------------------------------------------------------------------------------------------------ 79 | -------------------------------------------------------------------------------- /finite-volume/source/operators/residual.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // This routines calculates the residual 
(res=rhs-Ax) using the linear operator specified in the apply_op_ijk macro 7 | // This requires exchanging a ghost zone and/or enforcing a boundary condition. 8 | // NOTE, x_id must be distinct from rhs_id and res_id 9 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ 10 | // exchange the boundary for x in prep for Ax... 11 | exchange_boundary(level,x_id,stencil_get_shape()); 12 | apply_BCs(level,x_id,stencil_get_shape()); 13 | 14 | // now do residual/restriction proper... 15 | double _timeStart = getTime(); 16 | int block; 17 | 18 | if (level->use_cuda) { 19 | cuda_residual(*level, res_id, x_id, rhs_id, a, b); 20 | } 21 | else { 22 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 23 | for(block=0;blocknum_my_blocks;block++){ 24 | const int box = level->my_blocks[block].read.box; 25 | const int ilo = level->my_blocks[block].read.i; 26 | const int jlo = level->my_blocks[block].read.j; 27 | const int klo = level->my_blocks[block].read.k; 28 | const int ihi = level->my_blocks[block].dim.i + ilo; 29 | const int jhi = level->my_blocks[block].dim.j + jlo; 30 | const int khi = level->my_blocks[block].dim.k + klo; 31 | int i,j,k; 32 | const int jStride = level->my_boxes[box].jStride; 33 | const int kStride = level->my_boxes[box].kStride; 34 | const int ghosts = level->my_boxes[box].ghosts; 35 | const double h2inv = 1.0/(level->h*level->h); 36 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 37 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 41 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 42 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 43 | double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); 44 | 45 | for(k=klo;ktimers.residual += (double)(getTime()-_timeStart); 55 | } 56 | 57 | -------------------------------------------------------------------------------- /finite-volume/source/operators/symgs.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ 7 | int box,s; 8 | 9 | for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth 10 | exchange_boundary(level,phi_id,stencil_get_shape()); 11 | apply_BCs(level,phi_id,stencil_get_shape()); 12 | 13 | double _timeStart = getTime(); 14 | #ifdef _OPENMP 15 | #pragma omp parallel for private(box) 16 | #endif 17 | for(box=0;boxnum_my_boxes;box++){ 18 | int i,j,k; 19 | const int ghosts = 
level->box_ghosts; 20 | const int jStride = level->my_boxes[box].jStride; 21 | const int kStride = level->my_boxes[box].kStride; 22 | const int dim = level->my_boxes[box].dim; 23 | const double h2inv = 1.0/(level->h*level->h); 24 | double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 25 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 26 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 32 | 33 | 34 | if( (s&0x1)==0 ){ // forward sweep... 
hard to thread 35 | for(k=0;k=0;k--){ 44 | for(j=dim-1;j>=0;j--){ 45 | for(i=dim-1;i>=0;i--){ 46 | int ijk = i + j*jStride + k*kStride; 47 | double Ax = apply_op_ijk(phi); 48 | phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); 49 | }}} 50 | } 51 | 52 | } // boxes 53 | level->timers.smooth += (double)(getTime()-_timeStart); 54 | } // s-loop 55 | } 56 | 57 | 58 | //------------------------------------------------------------------------------------------------------------------------------ 59 | -------------------------------------------------------------------------------- /finite-volume/source/solvers.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | //------------------------------------------------------------------------------------------------------------------------------ 12 | #include "timers.h" 13 | #include "defines.h" 14 | #include "level.h" 15 | #include "operators.h" 16 | //------------------------------------------------------------------------------------------------------------------------------ 17 | #ifdef USE_BICGSTAB 18 | #include "solvers/bicgstab.c" 19 | #elif USE_CG 20 | #include "solvers/cg.c" 21 | #elif USE_CABICGSTAB 22 | #include "solvers/cabicgstab.c" 23 | #elif USE_CACG 24 | #include "solvers/cacg.c" 25 | #endif 26 | //------------------------------------------------------------------------------------------------------------------------------ 27 | void IterativeSolver(level_type * level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm){ 28 | if(!level->active)return; 
29 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 30 | if(level->must_subtract_mean==-1){ 31 | level->must_subtract_mean=0; 32 | int alpha_is_zero = (dot(level,VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); 33 | if( (level->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )level->must_subtract_mean = 1; // Poisson with Periodic BCs 34 | } 35 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 36 | #if 0 37 | if( (level->dim.i==1)&&(level->dim.j==1)&&(level->dim.k==1) ){ 38 | // I have reduced the system to 1 equation and 1 unknown and know D^{-1} exactly 39 | // therefore A^{-1} == D^{-1} = 1/a00 40 | // u = A^{-1}f == D^{-1}f 41 | mul_vectors(level,u_id,1.0,VECTOR_DINV,f_id); // u = A^{-1}f = D^{-1}f 42 | if(level->must_subtract_mean == 1){ 43 | double mean_of_u = mean(level,u_id); 44 | shift_vector(level,u_id,u_id,-mean_of_u); 45 | } 46 | return; 47 | } 48 | #endif 49 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 50 | #ifdef USE_BICGSTAB 51 | BiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); 52 | #elif USE_CG 53 | CG(level,u_id,f_id,a,b,desired_reduction_in_norm); 54 | #elif USE_CABICGSTAB 55 | CABiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); 56 | #elif USE_CACG 57 | CACG(level,u_id,f_id,a,b,desired_reduction_in_norm); 58 | #else 59 | // just point relaxation via multiple smooth()'s 60 | if(level->must_subtract_mean == 1){ 61 | double mean_of_u = mean(level,u_id); 62 | shift_vector(level,u_id,u_id,-mean_of_u); 63 | } 64 | residual(level,VECTOR_TEMP,u_id,f_id,a,b); 65 | //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... 
66 | 67 | double norm_of_r0 = norm(level,VECTOR_TEMP); 68 | int s=0,maxSmoothsBottom=200,converged=0; 69 | while( (sKrylov_iterations++; 72 | smooth(level,u_id,f_id,a,b); 73 | if(level->must_subtract_mean == 1){ 74 | double mean_of_u = mean(level,u_id); 75 | shift_vector(level,u_id,u_id,-mean_of_u); 76 | } 77 | residual(level,VECTOR_TEMP,u_id,f_id,a,b); 78 | //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... 79 | 80 | double norm_of_r = norm(level,VECTOR_TEMP); 81 | if(norm_of_r == 0.0){converged=1;break;} 82 | if(norm_of_r < desired_reduction_in_norm*norm_of_r0){converged=1;break;} 83 | } 84 | #endif 85 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 86 | } 87 | 88 | 89 | //------------------------------------------------------------------------------------------------------------------------------ 90 | int IterativeSolver_NumVectors(){ 91 | // additionally number of vectors required by an iterative solver... 92 | #ifdef USE_BICGSTAB 93 | return(8); // BiCGStab requires additional vectors r0,r,p,s,Ap,As 94 | #elif USE_CG 95 | return(5); // CG requires extra vectors r0,r,p,Ap,z 96 | #elif USE_CABICGSTAB 97 | return(4+4*CA_KRYLOV_S); // CABiCGStab requires additional vectors rt,p,r,P[2s+1],R[2s]. 98 | #elif USE_CACG 99 | return(4+2*CA_KRYLOV_S); // CACG requires additional vectors r0,p,r,P[s+1],R[s]. 
100 | #endif 101 | return(0); // simply doing multiple smooths requires no extra vectors 102 | } 103 | //------------------------------------------------------------------------------------------------------------------------------ 104 | -------------------------------------------------------------------------------- /finite-volume/source/solvers.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef SOLVERS_H 7 | #define SOLVERS_H 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | void IterativeSolver(level_type *level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm); 10 | int IterativeSolver_NumVectors(); 11 | //------------------------------------------------------------------------------------------------------------------------------ 12 | #endif 13 | -------------------------------------------------------------------------------- /finite-volume/source/solvers/matmul.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void matmul(level_type * level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){ 7 | // *id_A = m vector_id's (conceptually pointers to 
the rows of a m x level->num_my_boxes*volume matrix) 8 | // *id_B = n vector_id's (conceptually pointers to the columns of a level->num_my_boxes*volume matrix x n) 9 | // *C is a mxn matrix where C[rows][cols] = dot(id_A[rows],id_B[cols]) 10 | 11 | // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?) 12 | // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods 13 | // C_is_symmetric && cols< rows (use id_A) 14 | int mm,nn; 15 | 16 | 17 | double _timeStart = getTime(); 18 | // FIX... rather than performing an all_reduce on the essentially symmetric [G,g], do the all_reduce on the upper triangle and then duplicate (saves BW) 19 | #ifdef _OPENMP 20 | #pragma omp parallel for schedule(static,1) collapse(2) 21 | #endif 22 | for(mm=0;mm=mm){ // upper triangular 25 | int box; 26 | double a_dot_b_level = 0.0; 27 | for(box=0;boxnum_my_boxes;box++){ 28 | int i,j,k; 29 | const int jStride = level->my_boxes[box].jStride; 30 | const int kStride = level->my_boxes[box].kStride; 31 | const int ghosts = level->my_boxes[box].ghosts; 32 | const int dim = level->my_boxes[box].dim; 33 | double * __restrict__ grid_a = level->my_boxes[box].vectors[id_A[mm]] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 34 | double * __restrict__ grid_b = level->my_boxes[box].vectors[id_B[nn]] + ghosts*(1+jStride+kStride); 35 | double a_dot_b_box = 0.0; 36 | for(k=0;ktimers.blas3 += (double)(getTime()-_timeStart); 49 | 50 | #ifdef USE_MPI 51 | double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); 52 | for(mm=0;mmMPI_COMM_ALLREDUCE); 58 | double _timeEndAllReduce = getTime(); 59 | level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); 60 | free(send_buffer); 61 | #endif 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /finite-volume/source/timers.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifdef _OPENMP 7 | // getTime in OpenMP is now defined as a preprocessor macro 8 | //#include "./timers/omp.c" 9 | #elif USE_MPI 10 | // getTime in MPI is now defined as a preprocessor macro 11 | //#include "./timers/mpi.c" 12 | #else 13 | #error no timer found. 
You must compile with MPI, OpenMP, or include a custom timer routine 14 | #endif 15 | -------------------------------------------------------------------------------- /finite-volume/source/timers.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef TIMER_H 7 | #define TIMER_H 8 | 9 | #include 10 | 11 | #ifdef _OPENMP 12 | #include 13 | #define getTime() (omp_get_wtime()) 14 | 15 | #elif USE_MPI 16 | #include 17 | #define getTime() (MPI_Wtime()) 18 | 19 | #else 20 | // user must provide a function getTime and include it in timers.c 21 | // if calibration is necesary, then the user must #define CALIBRATE_TIMER 22 | double getTime(); 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /finite-volume/source/timers/mpi.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | double getTime(){ 9 | return(MPI_Wtime()); // timers are in units of seconds; no conversion is necessary 10 | } 11 | -------------------------------------------------------------------------------- /finite-volume/source/timers/omp.c: -------------------------------------------------------------------------------- 1 | 
//------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | double getTime(){ 9 | return(omp_get_wtime()); // timers are in units of seconds; no conversion is necessary 10 | } 11 | -------------------------------------------------------------------------------- /finite-volume/source/timers/x86.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #define CALIBRATE_TIMER // mg.c will calibrate the timer to determine seconds per cycle 8 | double getTime(){ 9 | uint64_t lo, hi; 10 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 11 | return( 1e-9*((double)( (((uint64_t)hi) << 32) | ((uint64_t)lo) )) ); // timers are in units of seconds; assume 1GHz cycle counter and convert later 12 | } 13 | -------------------------------------------------------------------------------- /local.mk: -------------------------------------------------------------------------------- 1 | include $(call incsubdirs,finite-element finite-volume) 2 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # NVIDIA CORPORATION and its licensors retain all intellectual property 6 | # and proprietary rights in and to this software, related documentation 7 | # and any modifications thereto. Any use, reproduction, disclosure or 8 | # distribution of this software and related documentation without an express 9 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 10 | 11 | if [[ $# -ne 6 ]]; then 12 | echo "Illegal parameters number: hpgmg-fv " 13 | exit 1 14 | fi 15 | 16 | NP=$1 17 | if [[ $NP -lt 2 ]]; then 18 | echo "Illegal procs number: $NP" 19 | exit 1 20 | fi 21 | 22 | export PATH=$PATH 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH 24 | 25 | #Assuming OpenMPI 26 | OMPI_params="$OMPI_params --mca btl openib,self" 27 | OMPI_params="$OMPI_params --mca btl_openib_want_cuda_gdr 1" 28 | OMPI_params="$OMPI_params --mca btl_openib_warn_default_gid_prefix 0" 29 | 30 | #set -x 31 | $MPI_HOME/bin/mpirun -verbose $OMPI_params \ 32 | -x GDS_CQ_MAP_SMART=0 -x GDS_ENABLE_DEBUG=0 \ 33 | -x MP_ENABLE_DEBUG=0 -x HPGMG_ENABLE_DEBUG=0 \ 34 | -x MP_EVENT_ASYNC=0 -x MP_ENABLE_WARN \ 35 | -x LD_LIBRARY_PATH -x PATH \ 36 | -x GDS_DISABLE_WRITE64=0 -x GDS_SIMULATE_WRITE64=0 -x GDS_DISABLE_INLINECOPY=0 -x GDS_DISABLE_WEAK_CONSISTENCY=0 -x GDS_DISABLE_MEMBAR=0 \ 37 | -x OMP_NUM_THREADS=1 -x ASYNC_2_STREAMS=0 \ 38 | -x COMM_USE_COMM=$2 -x COMM_USE_ASYNC_SA=$3 -x COMM_USE_ASYNC_KI=$4 \ 39 | -x MP_DBREC_ON_GPU=0 -x MP_RX_CQ_ON_GPU=0 -x MP_TX_CQ_ON_GPU=0 \ 40 | -x USE_MPI=1 \ 41 | -x CUDA_PASCAL_FORCE_40_BIT=1 \ 42 | -x GDS_FLUSHER_TYPE=0 \ 43 | --map-by node -np $NP -hostfile hostfile ./wrapper.sh ./build/bin/hpgmg-fv $5 $6 44 | 45 | # ./wrapper.sh nvprof -o nvprof-kernel.%q{OMPI_COMM_WORLD_RANK}.nvprof 46 | 47 | echo "Use LibMP=$2" 48 | echo "Use GPUDirect Async, SA model=$3" 49 | echo "Use GPUDirect Async, KI model=$4" 50 | 51 | # Example with 2 processes, using the SA model, 4 and 8 as size 52 | # ./run.sh 2 1 1 0 4 8 53 | 54 | 55 | 
-------------------------------------------------------------------------------- /run_all_hpgmg.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #MPI PROC 4 | PROC=(2 4 8 16) 5 | #LOG2 BOX DIM 6 | SIZE=(4 5 6 7) 7 | #default size >= 4 8 | # 16^3 * 4 = 16384 9 | # 8^3 * 4 = 2048 10 | # 8^3 * 1 = 512 11 | # 4^3 * 1 = 64 12 | # 2^3 * 1 = 8 13 | MODE=(0 1 2 3) #MPI, COMM, ASYNC, GPU 14 | 15 | for var_mode in "${MODE[@]}" 16 | do 17 | for var_size in "${SIZE[@]}" 18 | do 19 | for var_proc in "${PROC[@]}" 20 | do 21 | for num_iter in {1..5} 22 | do 23 | if [ $var_mode -eq 0 ]; then 24 | var_print_mode="MPI" 25 | var_comm=0; 26 | var_async=0; 27 | var_gpu=0; 28 | elif [ $var_mode -eq 1 ]; then 29 | var_print_mode="COMM" 30 | var_comm=1; 31 | var_async=0; 32 | var_gpu=0; 33 | elif [ $var_mode -eq 2 ]; then 34 | var_print_mode="ASYNC" 35 | var_comm=1; 36 | var_async=1; 37 | var_gpu=0; 38 | else 39 | var_print_mode="GPU-initiated" 40 | var_comm=1; 41 | var_async=1; 42 | var_gpu=1; 43 | fi 44 | echo "MODE: $var_print_mode, SIZE: $var_size, PROC: $var_proc, ITER: $num_iter" 45 | file_out="hpgmg-$var_print_mode-s$var_size-p$var_proc.txt" 46 | if [[ $num_iter -eq 1 ]]; then 47 | echo "./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8" &> $file_out 48 | else 49 | printf "\n\n===========================================================================\n\n" &>> $file_out 50 | echo "./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8" &>> $file_out 51 | fi 52 | ./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8 &>> $file_out 53 | 54 | if [[ $num_iter -eq 5 ]]; then 55 | egrep "use cuda" $file_out 56 | egrep "Total by level" $file_out 57 | fi 58 | done 59 | done 60 | done 61 | done -------------------------------------------------------------------------------- /wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 
exe=$1 4 | shift 5 | params=$* 6 | 7 | extra_params= 8 | lrank=$OMPI_COMM_WORLD_LOCAL_RANK 9 | 10 | USE_GPU=0; 11 | USE_CPU=0; 12 | MP_USE_IB_HCA=mlx5_0; 13 | case ${HOSTNAME} in 14 | *dgx*) 15 | # let's pick: 16 | # GPU #0,2,4,6 17 | # HCA #0,1,2,3 18 | if (( $lrank > 4 )); then echo "too many ranks"; exit; fi 19 | hlrank=$(($lrank / 2)) # 0,1 20 | dlrank=$(($lrank * 2)) # 0,2,4,6 21 | #CUDA_VISIBLE_DEVICES=$dlrank 22 | USE_GPU=${dlrank} 23 | USE_CPU=${hlrank} 24 | HCA=mlx5_${lrank} 25 | MP_USE_IB_HCA=${HCA} 26 | OMPI_MCA_btl_openib_if_include=${HCA} 27 | ;; 28 | 29 | *ivy0*) CUDA_VISIBLE_DEVICES=1; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 30 | *ivy1*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 31 | *ivy2*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 32 | *ivy3*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 33 | *brdw0*) CUDA_VISIBLE_DEVICES=3; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 34 | *brdw1*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 35 | *hsw*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 36 | #*hsw1*) USE_GPU=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 37 | #Wilkes 38 | *gpu-e-*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; USE_GPU=0; MP_USE_IB_HCA=mlx5_0; 39 | ;; 40 | esac 41 | 42 | COMM_USE_GPU_ID=$USE_GPU 43 | 44 | echo "" 45 | echo "# ${HOSTNAME}, Local Rank $lrank, GPU:$CUDA_VISIBLE_DEVICES/$USE_GPU CPU:$USE_CPU HCA:$MP_USE_IB_HCA" >&2 46 | 47 | export \ 48 | HPGMG_ENABLE_DEBUG \ 49 | CUDA_VISIBLE_DEVICES CUDA_ERROR_LEVEL CUDA_ERROR_FILE CUDA_FILE_LEVEL CUDA_PASCAL_FORCE_40_BIT \ 50 | MP_USE_IB_HCA USE_IB_HCA USE_CPU USE_GPU COMM_USE_GPU_ID \ 51 | MP_ENABLE_DEBUG MP_ENABLE_WARN GDS_ENABLE_DEBUG \ 52 | MP_DBREC_ON_GPU MP_RX_CQ_ON_GPU MP_TX_CQ_ON_GPU \ 53 | MP_EVENT_ASYNC MP_GUARD_PROGRESS \ 54 | GDS_DISABLE_WRITE64 GDS_DISABLE_INLINECOPY GDS_DISABLE_MEMBAR \ 55 | GDS_DISABLE_WEAK_CONSISTENCY GDS_SIMULATE_WRITE64 \ 56 | COMM_USE_COMM COMM_USE_ASYNC_SA COMM_USE_ASYNC_KI OMP_NUM_THREADS \ 57 | 
OMPI_MCA_btl_openib_if_include \ 58 | GDS_ENABLE_DUMP_MEMOPS \ 59 | USE_MPI \ 60 | LD_LIBRARY_PATH PATH GDS_FLUSHER_TYPE 61 | 62 | #set -x 63 | 64 | if [ ! -z $USE_CPU ]; then 65 | numactl --cpunodebind=${USE_CPU} -l $exe $params $extra_params 66 | else 67 | $exe $params $extra_params 68 | fi 69 | --------------------------------------------------------------------------------