├── .gitignore ├── Benchmarks ├── .DS_Store ├── brdw-pascal-ofed40 │ ├── cpuflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ ├── hpgmg-ASYNC-s7-p2.txt │ │ └── out_cpuflusher.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── nicflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ ├── hpgmg-ASYNC-s7-p2.txt │ │ └── out_nicflusher.txt │ ├── noflusher │ │ ├── hpgmg-ASYNC-s4-p2.txt │ │ ├── hpgmg-ASYNC-s5-p2.txt │ │ ├── hpgmg-ASYNC-s6-p2.txt │ │ └── hpgmg-ASYNC-s7-p2.txt │ ├── out.txt │ └── out_GPU.txt ├── brdw-pascal-ofed42 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── out_brdw42.txt │ └── out_brdw42_ki.txt ├── ivys23_ofed34 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ └── out_bench.txt ├── ivys23_ofed42 │ ├── 
hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── ivy23_ofed42_ki │ │ ├── hpgmg-GPU-initiated-s4-p2.txt │ │ ├── hpgmg-GPU-initiated-s5-p2.txt │ │ ├── hpgmg-GPU-initiated-s6-p2.txt │ │ ├── hpgmg-GPU-initiated-s7-p2.txt │ │ └── out_ivy23_ki.txt │ └── out_ofed42.txt ├── p9_pwr05 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s7-p2.txt │ └── out_pwr09.txt ├── psg-benchmarks_ofed40 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ ├── hpgmg-ASYNC-s7-p8.txt │ ├── hpgmg-COMM-s4-p2.txt │ ├── hpgmg-COMM-s4-p4.txt │ ├── hpgmg-COMM-s4-p8.txt │ ├── hpgmg-COMM-s5-p2.txt │ ├── hpgmg-COMM-s5-p4.txt │ ├── hpgmg-COMM-s5-p8.txt │ ├── hpgmg-COMM-s6-p2.txt │ ├── hpgmg-COMM-s6-p4.txt │ ├── hpgmg-COMM-s6-p8.txt │ ├── hpgmg-COMM-s7-p2.txt │ ├── hpgmg-COMM-s7-p4.txt │ ├── hpgmg-COMM-s7-p8.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── 
hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ ├── hpgmg-MPI-s7-p8.txt │ └── out_ofed40.txt ├── psg-benchmarks_ofed42 │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ ├── hpgmg-ASYNC-s7-p8.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ ├── hpgmg-MPI-s7-p8.txt │ └── out_psg_ofed42.txt └── wilkes1-dec2016 │ ├── Async1 │ ├── hpgmg-ASYNC-s4-p16.txt │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p16.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── 
hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p16.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p16.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ └── hpgmg-ASYNC-s7-p8.txt │ ├── Async2 │ ├── hpgmg-ASYNC-s4-p16.txt │ ├── hpgmg-ASYNC-s4-p2.txt │ ├── hpgmg-ASYNC-s4-p24.txt │ ├── hpgmg-ASYNC-s4-p32.txt │ ├── hpgmg-ASYNC-s4-p4.txt │ ├── hpgmg-ASYNC-s4-p8.txt │ ├── hpgmg-ASYNC-s5-p16.txt │ ├── hpgmg-ASYNC-s5-p2.txt │ ├── hpgmg-ASYNC-s5-p4.txt │ ├── hpgmg-ASYNC-s5-p8.txt │ ├── hpgmg-ASYNC-s6-p16.txt │ ├── hpgmg-ASYNC-s6-p2.txt │ ├── hpgmg-ASYNC-s6-p4.txt │ ├── hpgmg-ASYNC-s6-p8.txt │ ├── hpgmg-ASYNC-s7-p16.txt │ ├── hpgmg-ASYNC-s7-p2.txt │ ├── hpgmg-ASYNC-s7-p4.txt │ └── hpgmg-ASYNC-s7-p8.txt │ ├── GPU │ ├── hpgmg-GPU-initiated-s4-p16.txt │ ├── hpgmg-GPU-initiated-s4-p2.txt │ ├── hpgmg-GPU-initiated-s4-p4.txt │ ├── hpgmg-GPU-initiated-s4-p8.txt │ ├── hpgmg-GPU-initiated-s5-p16.txt │ ├── hpgmg-GPU-initiated-s5-p2.txt │ ├── hpgmg-GPU-initiated-s5-p4.txt │ ├── hpgmg-GPU-initiated-s5-p8.txt │ ├── hpgmg-GPU-initiated-s6-p16.txt │ ├── hpgmg-GPU-initiated-s6-p2.txt │ ├── hpgmg-GPU-initiated-s6-p4.txt │ ├── hpgmg-GPU-initiated-s6-p8.txt │ ├── hpgmg-GPU-initiated-s7-p16.txt │ ├── hpgmg-GPU-initiated-s7-p2.txt │ ├── hpgmg-GPU-initiated-s7-p4.txt │ ├── hpgmg-GPU-initiated-s7-p8.txt │ └── out.txt │ ├── Sync-mine │ ├── hpgmg-MPI-s4-p16.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p24.txt │ ├── hpgmg-MPI-s4-p32.txt │ ├── hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p16.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p16.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p16.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p4.txt │ └── hpgmg-MPI-s7-p8.txt │ ├── Sync-original │ ├── hpgmg-MPI-s4-p16.txt │ ├── hpgmg-MPI-s4-p2.txt │ ├── hpgmg-MPI-s4-p24.txt │ ├── 
hpgmg-MPI-s4-p4.txt │ ├── hpgmg-MPI-s4-p8.txt │ ├── hpgmg-MPI-s5-p16.txt │ ├── hpgmg-MPI-s5-p2.txt │ ├── hpgmg-MPI-s5-p24.txt │ ├── hpgmg-MPI-s5-p4.txt │ ├── hpgmg-MPI-s5-p8.txt │ ├── hpgmg-MPI-s6-p16.txt │ ├── hpgmg-MPI-s6-p2.txt │ ├── hpgmg-MPI-s6-p24.txt │ ├── hpgmg-MPI-s6-p4.txt │ ├── hpgmg-MPI-s6-p8.txt │ ├── hpgmg-MPI-s7-p16.txt │ ├── hpgmg-MPI-s7-p2.txt │ ├── hpgmg-MPI-s7-p24.txt │ ├── hpgmg-MPI-s7-p4.txt │ └── hpgmg-MPI-s7-p8.txt │ └── hpgmg_MPI_9sept.txt ├── HPGMG_Async_manuscript.pdf ├── LICENSE ├── Makefile ├── README.md ├── base.mk ├── build.sh ├── build_titan.sh ├── configure ├── docs ├── HPGMG-logo2.pdf ├── HPGMG-logo2.png ├── HPGMG-logo2.pptx ├── hpgmg-template.pptx ├── ppt │ └── ISC2014.pptx ├── static │ ├── appendix.sty │ ├── comment.sty │ ├── elsart.cls │ ├── hpgmg.tex │ └── thebib.bib └── whitepapers │ ├── hpgmg.bib │ └── hpgmg_intro.tex ├── finite-element ├── fefas-align.h ├── fefas-test.c ├── fefas.c ├── fefas.h ├── fmg.c ├── grid.c ├── hpgmg-analyze.py ├── local.mk ├── memusage.c ├── op │ ├── fefas-op.h │ ├── genregister.py │ ├── local.mk │ ├── op-poisson-qpx.c │ ├── op-poisson1.c │ └── op.c ├── pointwise.h ├── sampler.c ├── sharness │ ├── API.md │ ├── CHANGELOG.md │ ├── COPYING │ ├── README.md │ ├── aggregate-results.sh │ └── sharness.sh ├── tensor-fma.c ├── tensor-qpx.c ├── tensor.c ├── tensor.h ├── tensorimpl.h └── test │ ├── Makefile │ ├── aggregate-results.sh │ ├── hpgmg-sharness.sh │ ├── sharness.sh │ ├── t010-grid.sh │ ├── t020-fespace.sh │ ├── t030-feinject.sh │ ├── t040-feinterp.sh │ ├── t045-ferestrict.sh │ ├── t100-poisson.sh │ ├── t110-poissondiag.sh │ ├── t120-poissonksp.sh │ ├── t200-mgv.sh │ ├── t220-fmg.sh │ ├── t230-fmg-poisson2.sh │ └── t60-sample.sh ├── finite-volume ├── README ├── example_jobs │ ├── job.biou.00008 │ ├── job.carver.00064 │ ├── job.carver.00128 │ ├── job.carver.00512 │ ├── job.carver.01728 │ ├── job.edison.00064 │ ├── job.edison.00512 │ ├── job.edison.01024 │ ├── job.edison.04096 │ ├── job.edison.08000 │ ├── 
job.edison.10648 │ ├── job.edison.4096.strong │ ├── job.edison.pstate │ ├── job.edison.strong │ ├── job.hopper.01000 │ ├── job.hopper.04096 │ ├── job.hopper.09261 │ ├── job.hopper.13824 │ ├── job.hopper.21952 │ ├── job.hopper.special │ └── job.titan ├── local.mk └── source │ ├── README │ ├── TODO │ ├── compile │ ├── cuda │ ├── blockCopy.h │ ├── boundary_fd.h │ ├── boundary_fv.h │ ├── common.h │ ├── cub │ │ ├── agent │ │ │ ├── agent_histogram.cuh │ │ │ ├── agent_radix_sort_downsweep.cuh │ │ │ ├── agent_radix_sort_upsweep.cuh │ │ │ ├── agent_reduce.cuh │ │ │ ├── agent_reduce_by_key.cuh │ │ │ ├── agent_rle.cuh │ │ │ ├── agent_scan.cuh │ │ │ ├── agent_segment_fixup.cuh │ │ │ ├── agent_select_if.cuh │ │ │ ├── agent_spmv.cuh │ │ │ ├── agent_spmv_orig.cuh │ │ │ └── single_pass_scan_operators.cuh │ │ ├── block │ │ │ ├── block_adjacent_difference.cuh │ │ │ ├── block_discontinuity.cuh │ │ │ ├── block_exchange.cuh │ │ │ ├── block_histogram.cuh │ │ │ ├── block_load.cuh │ │ │ ├── block_radix_rank.cuh │ │ │ ├── block_radix_sort.cuh │ │ │ ├── block_raking_layout.cuh │ │ │ ├── block_reduce.cuh │ │ │ ├── block_scan.cuh │ │ │ ├── block_shuffle.cuh │ │ │ ├── block_store.cuh │ │ │ └── specializations │ │ │ │ ├── block_histogram_atomic.cuh │ │ │ │ ├── block_histogram_sort.cuh │ │ │ │ ├── block_reduce_raking.cuh │ │ │ │ ├── block_reduce_raking_commutative_only.cuh │ │ │ │ ├── block_reduce_warp_reductions.cuh │ │ │ │ ├── block_scan_raking.cuh │ │ │ │ ├── block_scan_warp_scans.cuh │ │ │ │ ├── block_scan_warp_scans2.cuh │ │ │ │ └── block_scan_warp_scans3.cuh │ │ ├── cub.cuh │ │ ├── device │ │ │ ├── device_histogram.cuh │ │ │ ├── device_partition.cuh │ │ │ ├── device_radix_sort.cuh │ │ │ ├── device_reduce.cuh │ │ │ ├── device_run_length_encode.cuh │ │ │ ├── device_scan.cuh │ │ │ ├── device_segmented_radix_sort.cuh │ │ │ ├── device_segmented_reduce.cuh │ │ │ ├── device_select.cuh │ │ │ ├── device_spmv.cuh │ │ │ └── dispatch │ │ │ │ ├── dispatch_histogram.cuh │ │ │ │ ├── 
dispatch_radix_sort.cuh │ │ │ │ ├── dispatch_reduce.cuh │ │ │ │ ├── dispatch_reduce_by_key.cuh │ │ │ │ ├── dispatch_rle.cuh │ │ │ │ ├── dispatch_scan.cuh │ │ │ │ ├── dispatch_select_if.cuh │ │ │ │ ├── dispatch_spmv.cuh │ │ │ │ └── dispatch_spmv_orig.cuh │ │ ├── grid │ │ │ ├── grid_barrier.cuh │ │ │ ├── grid_even_share.cuh │ │ │ ├── grid_mapping.cuh │ │ │ └── grid_queue.cuh │ │ ├── host │ │ │ ├── mutex.cuh │ │ │ └── spinlock.cuh │ │ ├── iterator │ │ │ ├── arg_index_input_iterator.cuh │ │ │ ├── cache_modified_input_iterator.cuh │ │ │ ├── cache_modified_output_iterator.cuh │ │ │ ├── constant_input_iterator.cuh │ │ │ ├── counting_input_iterator.cuh │ │ │ ├── discard_output_iterator.cuh │ │ │ ├── tex_obj_input_iterator.cuh │ │ │ ├── tex_ref_input_iterator.cuh │ │ │ └── transform_input_iterator.cuh │ │ ├── thread │ │ │ ├── thread_load.cuh │ │ │ ├── thread_operators.cuh │ │ │ ├── thread_reduce.cuh │ │ │ ├── thread_scan.cuh │ │ │ ├── thread_search.cuh │ │ │ └── thread_store.cuh │ │ ├── util_allocator.cuh │ │ ├── util_arch.cuh │ │ ├── util_debug.cuh │ │ ├── util_device.cuh │ │ ├── util_macro.cuh │ │ ├── util_namespace.cuh │ │ ├── util_ptx.cuh │ │ ├── util_type.cuh │ │ └── warp │ │ │ ├── specializations │ │ │ ├── warp_reduce_shfl.cuh │ │ │ ├── warp_reduce_smem.cuh │ │ │ ├── warp_scan_shfl.cuh │ │ │ └── warp_scan_smem.cuh │ │ │ ├── warp_reduce.cuh │ │ │ └── warp_scan.cuh │ ├── extra.h │ ├── interpolation.h │ ├── interpolation_v2.h │ ├── interpolation_v4.h │ ├── misc.h │ ├── operators.7pt.cu │ ├── operators.fv2.cu │ ├── operators.fv4.cu │ ├── restriction.h │ └── stencils │ │ ├── chebyshev.flux.fv4.h │ │ ├── gsrb.h │ │ ├── residual.base.h │ │ ├── residual.reg.fv2.h │ │ ├── residual.reg.fv4.h │ │ ├── smooth.base.h │ │ ├── smooth.reg.fv2.h │ │ ├── smooth.reg.fv4.h │ │ └── smooth.smem.fv4.h │ ├── debug.c │ ├── debug.h │ ├── defines.h │ ├── hpgmg-fv.c │ ├── level.c │ ├── level.h │ ├── local.mk │ ├── mg.c │ ├── mg.h │ ├── operators.27pt.c │ ├── operators.7pt.c │ ├── operators.fv2.c 
│ ├── operators.fv4.c │ ├── operators.h │ ├── operators.old.c │ ├── operators.old │ ├── aggregate.mpi │ │ ├── chebyshev.c │ │ ├── gsrb.c │ │ └── jacobi.c │ ├── apply_op.c │ ├── chebyshev.c │ ├── gsrb.c │ ├── iterators.c │ ├── jacobi.c │ ├── misc.c │ ├── residual.c │ └── symgs.c │ ├── operators │ ├── apply_op.c │ ├── blockCopy.c │ ├── boundary_fd.c │ ├── boundary_fv.c │ ├── chebyshev.c │ ├── exchange_boundary.c │ ├── gsrb.c │ ├── interpolation_p0.c │ ├── interpolation_p1.c │ ├── interpolation_p2.c │ ├── interpolation_v2.c │ ├── interpolation_v4.c │ ├── jacobi.c │ ├── misc.c │ ├── problem.fv.c │ ├── problem.p4.c │ ├── problem.p6.c │ ├── problem.sine.c │ ├── rebuild.c │ ├── residual.c │ ├── restriction.c │ └── symgs.c │ ├── solvers.c │ ├── solvers.h │ ├── solvers │ ├── bicgstab.c │ ├── cabicgstab.c │ ├── cacg.c │ ├── cg.c │ └── matmul.c │ ├── timers.c │ ├── timers.h │ └── timers │ ├── mpi.c │ ├── omp.c │ └── x86.c ├── hpgmgconf.py ├── local.mk ├── run.sh ├── run_all_hpgmg.sh └── wrapper.sh /.gitignore: -------------------------------------------------------------------------------- 1 | paper/HPGMG-Async.aux 2 | paper/HPGMG-Async.log 3 | paper/HPGMG-Async.pdf 4 | paper/HPGMG-Async.synctex.gz 5 | -------------------------------------------------------------------------------- /Benchmarks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-ago/hpgmg-cuda-async/7896114058909da072e4c3ddb48a188e9d915fb7/Benchmarks/.DS_Store -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/GPU/out.txt: -------------------------------------------------------------------------------- 1 | MODE: GPU-initiated, SIZE: 4, PROC: 2 2 | use cuda 1 0 0 0 0 3 | Total by level 0.004185 0.002306 0.000697 0.000266 0.000080 0.007535 4 | MODE: GPU-initiated, SIZE: 4, PROC: 4 5 | use cuda 1 0 0 0 0 6 | Total by level 0.005535 0.003701 0.001485 0.001090 0.000206 0.012017 7 | 
MODE: GPU-initiated, SIZE: 4, PROC: 8 8 | use cuda 1 0 0 0 0 0 9 | Total by level 0.005816 0.004161 0.001490 0.000921 0.000328 0.000099 0.012814 10 | MODE: GPU-initiated, SIZE: 4, PROC: 16 11 | use cuda 1 0 0 0 0 12 | Total by level 0.006549 0.004591 0.002221 0.002146 0.000773 0.016280 13 | MODE: GPU-initiated, SIZE: 5, PROC: 2 14 | use cuda 1 1 0 0 0 0 15 | Total by level 0.005164 0.008937 0.003429 0.000919 0.000331 0.000100 0.018880 16 | MODE: GPU-initiated, SIZE: 5, PROC: 4 17 | use cuda 1 1 0 0 0 0 18 | Total by level 0.007328 0.011813 0.005473 0.001897 0.001216 0.000246 0.027973 19 | MODE: GPU-initiated, SIZE: 5, PROC: 8 20 | use cuda 1 1 0 0 0 0 0 21 | Total by level 0.007448 0.011978 0.006194 0.001803 0.001159 0.000392 0.000118 0.029091 22 | MODE: GPU-initiated, SIZE: 5, PROC: 16 23 | use cuda 1 1 0 0 0 0 24 | Total by level 0.009244 0.014551 0.006774 0.002919 0.002649 0.000933 0.037070 25 | MODE: GPU-initiated, SIZE: 6, PROC: 2 26 | use cuda 1 1 1 0 0 0 0 27 | Total by level 0.015952 0.000940 0.029312 0.004217 0.001140 0.000388 0.000120 0.052069 28 | MODE: GPU-initiated, SIZE: 6, PROC: 4 29 | use cuda 1 1 1 0 0 0 0 30 | Total by level 0.026787 0.001212 0.044937 0.007178 0.002417 0.001444 0.000292 0.084268 31 | MODE: GPU-initiated, SIZE: 6, PROC: 8 32 | use cuda 1 1 1 0 0 0 0 0 33 | Total by level 0.027799 0.001686 0.045790 0.008204 0.002202 0.001366 0.000454 0.000138 0.087638 34 | MODE: GPU-initiated, SIZE: 6, PROC: 16 35 | use cuda 1 1 1 0 0 0 0 36 | Total by level 0.033302 0.002507 0.054828 0.008998 0.003602 0.003365 0.001057 0.107659 37 | MODE: GPU-initiated, SIZE: 7, PROC: 2 38 | use cuda 1 1 1 1 0 0 0 0 39 | Total by level 0.086922 0.000955 0.001372 0.133889 0.005225 0.001368 0.000451 0.000138 0.230319 40 | MODE: GPU-initiated, SIZE: 7, PROC: 4 41 | use cuda 1 1 1 1 0 0 0 0 42 | Total by level 0.155914 0.001211 0.001808 0.231071 0.008935 0.002912 0.001677 0.000339 0.403867 43 | MODE: GPU-initiated, SIZE: 7, PROC: 8 44 | use cuda 1 1 1 1 0 0 0 0 0 45 | 
Total by level 0.168862 0.001744 0.002545 0.245997 0.010298 0.002673 0.001588 0.000525 0.000160 0.434392 46 | MODE: GPU-initiated, SIZE: 7, PROC: 16 47 | use cuda 1 1 1 1 0 0 0 0 48 | Total by level 0.186298 0.002414 0.003675 0.276368 0.011224 0.004471 0.003649 0.001197 0.489296 49 | 50 | -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/Sync-original/hpgmg-MPI-s6-p24.txt: -------------------------------------------------------------------------------- 1 | [tesla40:19195] Warning: could not find environment variable "MP_EVENT_ASYNC" 2 | [tesla40:19195] Warning: could not find environment variable "MP_ENABLE_WARN" 3 | [tesla40:19195] Warning: could not find environment variable "MP_GUARD_PROGRESS" 4 | [tesla40:19195] Warning: could not find environment variable "CUDA_VISIBLE_DEVICES" 5 | [tesla40:19195] Warning: could not find environment variable "SIZE" 6 | [tesla40:19195] Warning: could not find environment variable "MAX_SIZE" 7 | [tesla40:19195] Warning: could not find environment variable "KERNEL_TIME" 8 | [tesla40:19195] Warning: could not find environment variable "CALC_SIZE" 9 | [tesla40:19195] Warning: could not find environment variable "COMM_COMP_RATIO" 10 | [tesla40:19195] Warning: could not find environment variable "USE_SINGLE_STREAM" 11 | [tesla40:19195] Warning: could not find environment variable "USE_GPU_ASYNC" 12 | [tesla40:19195] Warning: could not find environment variable "COMM_USE_GDRDMA" 13 | COMM_USE_COMM=0 14 | COMM_USE_ASYNC=0 15 | COMM_USE_GPU_COMM=0 16 | -------------------------------------------------------------------------------- /Benchmarks/wilkes1-dec2016/Sync-original/hpgmg-MPI-s7-p24.txt: -------------------------------------------------------------------------------- 1 | [tesla40:19228] Warning: could not find environment variable "MP_EVENT_ASYNC" 2 | [tesla40:19228] Warning: could not find environment variable "MP_ENABLE_WARN" 3 | [tesla40:19228] Warning: could not find 
environment variable "MP_GUARD_PROGRESS" 4 | [tesla40:19228] Warning: could not find environment variable "CUDA_VISIBLE_DEVICES" 5 | [tesla40:19228] Warning: could not find environment variable "SIZE" 6 | [tesla40:19228] Warning: could not find environment variable "MAX_SIZE" 7 | [tesla40:19228] Warning: could not find environment variable "KERNEL_TIME" 8 | [tesla40:19228] Warning: could not find environment variable "CALC_SIZE" 9 | [tesla40:19228] Warning: could not find environment variable "COMM_COMP_RATIO" 10 | [tesla40:19228] Warning: could not find environment variable "USE_SINGLE_STREAM" 11 | [tesla40:19228] Warning: could not find environment variable "USE_GPU_ASYNC" 12 | [tesla40:19228] Warning: could not find environment variable "COMM_USE_GDRDMA" 13 | COMM_USE_COMM=0 14 | COMM_USE_ASYNC=0 15 | COMM_USE_GPU_COMM=0 16 | -------------------------------------------------------------------------------- /HPGMG_Async_manuscript.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-ago/hpgmg-cuda-async/7896114058909da072e4c3ddb48a188e9d915fb7/HPGMG_Async_manuscript.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 2 | 3 | The U.S. Department of Energy funded the development of this software 4 | under subcontract B609478 with Lawrence Livermore National Security (LLNS). 5 | 6 | Copyright (c) 2014, The Regents of the University of California, through 7 | Lawrence Berkeley National Laboratory and UChicago Argonne, LLC. 8 | All rights reserved. 
9 | 10 | Redistribution and use in source and binary forms, with or without modification, 11 | are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 15 | * Redistributions in binary form must reproduce the above copyright notice, this 16 | list of conditions and the following disclaimer in the documentation and/or 17 | other materials provided with the distribution. 18 | * Neither the name of NVIDIA CORPORATION, Lawrence Livermore National 19 | Security, the U.S. Department of Energy, nor the names of its 20 | contributors may be used to endorse or promote products derived 21 | from this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 24 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 27 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 30 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | HPGMG_ARCH := $(if $(PETSC_ARCH),$(PETSC_ARCH),build) 2 | 3 | all : 4 | ./configure --arch=$(HPGMG_ARCH) 5 | $(MAKE) -C $(HPGMG_ARCH) 6 | @echo "Build complete in $(HPGMG_ARCH). 
Use make -C $(HPGMG_ARCH) test to test." 7 | 8 | test : all 9 | $(MAKE) -C $(HPGMG_ARCH) test 10 | 11 | clean : 12 | $(MAKE) -C $(HPGMG_ARCH) clean 13 | 14 | .PHONY: all test clean 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPGMG-FV CUDA Async 2 | 3 | ## Build 4 | 5 | In `build.sh` script you need to set the GPUDirect Async path. 6 | You can download all GPUDirect Async libraries and headers from here [GDAsync suite](https://github.com/e-ago/gdasync) 7 | 8 | [HPGMG](https://hpgmg.org) is an HPC benchmarking effort and supercomputing ranking metric based on geometric multigrid methods. NVIDIA reworked the original implementation moving from a CPU implementation to an [hybrid CPU-GPU solution](https://bitbucket.org/nsakharnykh/hpgmg-cuda). 9 | In this repository, starting from the NVIDIA solution, we leverage the communications with [GPUDirect Async](https://github.com/gpudirect/libgdsync), recently released by NVIDIA. 10 | 11 | For further information about Async and benchmarks, please refer to: 12 | 13 | - ["GPUDirect Async: exploring GPU synchronous communication techniques for InfiniBand clusters"](https://www.sciencedirect.com/science/article/pii/S0743731517303386), E. Agostini, D. Rossetti, S. Potluri. Journal of Parallel and Distributed Computing, Vol. 114, Pages 28-45, April 2018 14 | - ["Offloading communication control logic in GPU accelerated applications"](http://ieeexplore.ieee.org/document/7973709), E. Agostini, D. Rossetti, S. Potluri. 
Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid’ 17), IEEE Conference Publications, Pages 248-257, Nov 2016 15 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | # find MPI compiler 2 | CC=`which mpicc` 3 | #CC=`which mpiicc` 4 | CXX=`which mpic++` 5 | 6 | # find NVCC compiler 7 | NVCC=`which nvcc` 8 | 9 | # set gpu architectures to compile for 10 | CUDA_ARCH="-gencode code=sm_35,arch=compute_35 " 11 | CUDA_ARCH="$CUDA_ARCH -gencode code=sm_60,arch=compute_60 " 12 | #CUDA_ARCH="$CUDA_ARCH -gencode code=sm_70,arch=compute_70 " 13 | 14 | # main tile size 15 | OPTS="-DBLOCKCOPY_TILE_I=32 " 16 | OPTS=" $OPTS -DBLOCKCOPY_TILE_J=4 " 17 | OPTS=" $OPTS -DBLOCKCOPY_TILE_K=8 " 18 | 19 | # special tile size for boundary conditions 20 | OPTS=" $OPTS -DBOUNDARY_TILE_I=64 " 21 | OPTS=" $OPTS -DBOUNDARY_TILE_J=16 " 22 | OPTS=" $OPTS -DBOUNDARY_TILE_K=16 " 23 | 24 | # max number of solves after warmup 25 | OPTS=" $OPTS -DMAX_SOLVES=100 " 26 | 27 | # host level threshold: number of grid elements 28 | OPTS=" $OPTS -DHOST_LEVEL_SIZE_THRESHOLD=10000 " 29 | 30 | # max number of solves after warmup 31 | #OPTS=" $OPTS -DMAX_SOLVES=10 " 32 | 33 | # unified memory allocation options 34 | OPTS=" $OPTS -DCUDA_UM_ALLOC " 35 | #cudaHostAlloc 36 | OPTS=" $OPTS -DCUDA_UM_ZERO_COPY " 37 | 38 | # MPI buffers allocation policy 39 | #cudaHostAlloc 40 | OPTS=" $OPTS -DMPI_ALLOC_ZERO_COPY " 41 | #cudaMalloc 42 | #OPTS=" $OPTS -DMPI_ALLOC_PINNED " 43 | 44 | :<=2&&argv[1]?argv[1]:"");CHKERRQ(ierr); 43 | ierr = PetscFunctionListView(actionlist,PETSC_VIEWER_STDERR_WORLD);CHKERRQ(ierr); 44 | goto out; 45 | } 46 | ierr = PetscFunctionListFind(actionlist,argv[1],action);CHKERRQ(ierr); 47 | if (!*action) { 48 | ierr = PetscViewerASCIIPrintf(PETSC_VIEWER_STDERR_WORLD,"Unknown action 
'%s':",argc>=2&&argv[1]?argv[1]:"");CHKERRQ(ierr); 49 | ierr = PetscFunctionListView(actionlist,PETSC_VIEWER_STDERR_WORLD);CHKERRQ(ierr); 50 | goto out; 51 | } 52 | out: 53 | ierr = PetscFunctionListDestroy(&actionlist);CHKERRQ(ierr); 54 | PetscFunctionReturn(0); 55 | } 56 | 57 | int main(int argc, char *argv[]) 58 | { 59 | PetscErrorCode ierr,(*action)(void); 60 | 61 | PetscInitialize(&argc,&argv,NULL,help); 62 | ierr = ActionParse(argc,argv,&action);CHKERRQ(ierr); 63 | if (!action) { 64 | PetscFinalize(); 65 | return 1; 66 | } 67 | ierr = (*action)();CHKERRQ(ierr); 68 | PetscFinalize(); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /finite-element/fefas.h: -------------------------------------------------------------------------------- 1 | #ifndef _fefas_h 2 | #define _fefas_h 3 | 4 | #include 5 | #include 6 | #include "op/fefas-op.h" 7 | 8 | typedef struct Grid_private *Grid; 9 | 10 | typedef enum {DOMAIN_INTERIOR=0x1,DOMAIN_EXTERIOR=0x2,DOMAIN_CLOSURE=0x3} DomainMode; 11 | 12 | PetscErrorCode GridCreate(MPI_Comm comm,const PetscInt M[3],const PetscInt p[3],PetscInt cmax,Grid *grid); 13 | PetscErrorCode GridDestroy(Grid *grid); 14 | PetscErrorCode GridView(Grid grid); 15 | PetscErrorCode GridGetNumLevels(Grid grid,PetscInt *nlevels); 16 | PetscInt GridLevelFromM(const PetscInt M[3]); 17 | PetscErrorCode DMCreateFE(Grid grid,PetscInt fedegree,PetscInt dof,DM *dmfe); 18 | PetscErrorCode DMDestroyFE(DM *dm); 19 | PetscErrorCode DMFESetUniformCoordinates(DM dm,const PetscReal L[]); 20 | PetscErrorCode DMFEGetUniformCoordinates(DM dm,PetscReal L[]); 21 | PetscErrorCode DMFEGetInfo(DM dm,PetscInt *fedegree,PetscInt *level,PetscInt mlocal[],PetscInt Mglobal[],PetscInt procs[]); 22 | PetscErrorCode DMFEGetTensorEval(DM dm,PetscInt *P,PetscInt *Q,const PetscReal **B,const PetscReal **D,const PetscReal **x,const PetscReal **w,const PetscReal **w3); 23 | PetscErrorCode DMFEGetNumElements(DM dm,PetscInt *nelems); 
24 | PetscErrorCode DMFEExtractElements(DM dm,const PetscScalar *u,PetscInt elem,PetscInt ne,PetscScalar *y); 25 | PetscErrorCode DMFESetElements(DM dm,PetscScalar *u,PetscInt elem,PetscInt ne,InsertMode imode,DomainMode dmode,const PetscScalar *y); 26 | PetscErrorCode DMFECoarsen(DM dm,DM *dmcoarse); 27 | PetscErrorCode DMFEInject(DM dm,Vec Uf,Vec Uc); 28 | PetscErrorCode DMFEInterpolate(DM dm,Vec Uc,Vec Uf); 29 | PetscErrorCode DMFERestrict(DM dm,Vec Uf,Vec Uc); 30 | PetscErrorCode DMFEZeroBoundaries(DM dm,Vec U); 31 | PetscErrorCode DMCoordDistort(DM dm,const PetscReal L[]); 32 | 33 | typedef struct MG_private *MG; 34 | PetscErrorCode MGCreate(Op op,DM dm,PetscInt nlevels,MG *newmg); 35 | PetscErrorCode MGDestroy(MG *mg); 36 | PetscErrorCode MGMonitorSet(MG mg,PetscBool mon); 37 | PetscErrorCode MGSetUpPC(MG mg); 38 | PetscErrorCode MGFCycle(Op op,MG mg,PetscInt presmooths,PetscInt postsmooths,Vec B,Vec U); 39 | 40 | PetscInt SampleGridNumLevels(const PetscInt p[]); 41 | int64_t SampleGridNumElements(const PetscInt p[]); 42 | PetscErrorCode SampleGridRangeCreate(PetscMPIInt nranks,PetscInt minlocal,PetscInt maxlocal,PetscInt maxsamples,PetscInt *nsamples,PetscInt **gridsizes); 43 | PetscErrorCode ProcessGridFindSquarest(PetscMPIInt nranks,PetscInt squarest[3]); 44 | 45 | PetscErrorCode MemoryGetUsage(double *heapused,double *heapavail); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /finite-element/hpgmg-analyze.py: -------------------------------------------------------------------------------- 1 | def parse_logfile(fname): 2 | import re 3 | FP = r'([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)' 4 | PERFLINE = [] 5 | PERFLINE.append(re.compile(r'Q2 G''\[([\d ]{4})([\d ]{4})([\d ]{4})\] P\[ *(\d+) +(\d+) +(\d+)\] '+FP+r' s +'+FP+r' GF +'+FP+r' MEq/s')) 6 | PERFLINE.append(re.compile(r'Q2 G''\[([\d ]{5})([\d ]{5})([\d ]{5})\] P\[ *(\d+) +(\d+) +(\d+)\] '+FP+r' s +'+FP+r' GF +'+FP+r' MEq/s')) 7 | HOSTLINE = 
re.compile(r'.*on a ([a-z\-_0-9]+) named [^ ]+ with (\d+) processors') 8 | Dofs = [] 9 | GFlops = [] 10 | MEqs = [] 11 | Procs = None 12 | HostName = 'unknown' 13 | with open(fname) as f: 14 | while 'Starting performance sampling' not in next(f): 15 | pass 16 | while True: 17 | line = next(f) 18 | for perfline in PERFLINE: 19 | m = re.match(perfline,line) 20 | if m: break 21 | if not m: break 22 | g0,g1,g2, p0,p1,p2, time, gflops, meqs = m.groups() 23 | g = (float(g0)*2+1)*(float(g1)*2+1)*(float(g2)*2+1) 24 | p = int(p0)*int(p1)*int(p2) 25 | Dofs.append(g) 26 | GFlops.append(float(gflops)) 27 | MEqs.append(float(meqs)) 28 | if Procs is None: 29 | Procs = p 30 | elif p != Procs: 31 | raise RuntimeError('Procs varies within file "%s"' % (fname,)) 32 | while True: 33 | line = next(f) 34 | m = re.match(HOSTLINE,line) 35 | if m: 36 | HostName, p = m.groups() 37 | assert int(p) == Procs 38 | break 39 | 40 | return Dofs, GFlops, MEqs, HostName, Procs 41 | 42 | def plot(args): 43 | symbols = iter(['ro', 'bv', 'ks', 'g^', 'bx']) 44 | import matplotlib.pyplot as plt 45 | fig, ax1 = plt.subplots() 46 | plt.title('HPGMG-FE Performance') 47 | if args.perprocess: 48 | plt.xlabel('Number of equations/process') 49 | else: 50 | plt.xlabel('Global number of equations') 51 | ax2 = ax1.twinx() 52 | #ax1.set_autoscaley_on(False) 53 | ax1.set_ylabel('MEquations/second') 54 | all_dofs = [] 55 | all_gflops = [] 56 | all_meqs = [] 57 | max_meqs = 0 58 | for f in args.logfiles: 59 | dofs, gflops, meqs, hostname, procs = parse_logfile(f) 60 | if args.perprocess: 61 | dofs = [d/procs for d in dofs] 62 | all_dofs += dofs 63 | all_gflops += gflops 64 | all_meqs += meqs 65 | if args.loglog: 66 | ax1.loglog(dofs, meqs, next(symbols), label='%s np=%d'%(hostname, procs)) 67 | else: 68 | ax1.semilogx(dofs, meqs, next(symbols), label='%s np=%d'%(hostname, procs)) 69 | flops_per_meqn = all_gflops[-1] / all_meqs[-1] 70 | ax1.set_xlim(0.9*min(all_dofs),1.05*max(all_dofs)) 71 | 
ax2.set_xlim(0.9*min(all_dofs),1.05*max(all_dofs)) 72 | ax2.set_autoscaley_on(False) 73 | if args.loglog: 74 | ax2.set_yscale('log') 75 | ax1.legend(loc='lower right') 76 | else: 77 | ax1.legend(loc='upper left') 78 | ax1.set_ylim(0.9*min(all_meqs),1.1*max(all_meqs)) 79 | ax2.set_ylim(0.9*min(all_meqs)*flops_per_meqn,1.1*max(all_meqs)*flops_per_meqn) 80 | ax2.set_ylabel('GFlop/s') 81 | if args.output: 82 | plt.savefig(args.output) 83 | else: 84 | plt.show() 85 | 86 | if __name__ == "__main__": 87 | import argparse 88 | parser = argparse.ArgumentParser('FE-FAS Performance Analyzer') 89 | parser.add_argument('-o', '--output', type=str, help='Output file') 90 | parser.add_argument('--loglog', action='store_true', help='Use logarithmic y axis (x is always logarithmic)') 91 | parser.add_argument('--perprocess', action='store_true', help='Use problem size per process for x axis') 92 | parser.add_argument('logfiles', nargs='+', type=str, help='List of files to process, usually including -log_summary') 93 | args = parser.parse_args() 94 | plot(args) 95 | -------------------------------------------------------------------------------- /finite-element/local.mk: -------------------------------------------------------------------------------- 1 | hpgmg-fe-y.c += $(call thisdir, \ 2 | fefas.c \ 3 | fefas-test.c \ 4 | fmg.c \ 5 | grid.c \ 6 | memusage.c \ 7 | sampler.c \ 8 | tensor.c \ 9 | tensor-fma.c \ 10 | tensor-qpx.c \ 11 | ) 12 | 13 | include $(call incsubdirs,op) 14 | -------------------------------------------------------------------------------- /finite-element/memusage.c: -------------------------------------------------------------------------------- 1 | #include "fefas.h" 2 | 3 | #ifdef __bgq__ 4 | # include 5 | #endif 6 | 7 | PetscErrorCode MemoryGetUsage(double *heapused,double *heapavail) { 8 | PetscFunctionBegin; 9 | *heapused = -1; 10 | *heapavail = -1; 11 | #ifdef __bgq__ 12 | { 13 | uint64_t heap,avail; 14 | Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP,&heap); 15 
| Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAPAVAIL,&avail); 16 | *heapused = (double)heap; 17 | *heapavail = (double)avail; 18 | } 19 | #else 20 | { 21 | PetscErrorCode ierr; 22 | ierr = PetscMemoryGetCurrentUsage(heapused);CHKERRQ(ierr); 23 | } 24 | #endif 25 | PetscFunctionReturn(0); 26 | } 27 | -------------------------------------------------------------------------------- /finite-element/op/fefas-op.h: -------------------------------------------------------------------------------- 1 | #ifndef _fefas_op_h 2 | #define _fefas_op_h 3 | 4 | #include 5 | #include "../tensor.h" 6 | 7 | typedef struct Op_private *Op; 8 | 9 | MPI_Comm OpComm(Op); 10 | PetscErrorCode OpCreateFromOptions(MPI_Comm,Op*); 11 | PetscErrorCode OpDestroy(Op*); 12 | PetscErrorCode OpSetDof(Op,PetscInt); 13 | PetscErrorCode OpGetDof(Op,PetscInt*); 14 | PetscErrorCode OpSetFEDegree(Op,PetscInt); 15 | PetscErrorCode OpGetFEDegree(Op,PetscInt*); 16 | PetscErrorCode OpSetContext(Op,void*); 17 | PetscErrorCode OpGetContext(Op,void*); 18 | PetscErrorCode OpSetApply(Op,PetscErrorCode (*)(Op,DM,Vec,Vec)); 19 | PetscErrorCode OpSetPointwiseSolution(Op,PetscErrorCode (*)(Op,const PetscReal[],const PetscReal[],PetscScalar[])); 20 | PetscErrorCode OpSetPointwiseForcing(Op,PetscErrorCode (*)(Op,const PetscReal[],const PetscReal[],PetscScalar[])); 21 | typedef PetscErrorCode (*OpPointwiseElementFunction)(Op,PetscInt,PetscInt,const PetscScalar[],const PetscReal[],const PetscScalar[],PetscScalar[]); 22 | PetscErrorCode OpSetPointwiseElement(Op,OpPointwiseElementFunction,PetscInt); 23 | PetscErrorCode OpSetAffineOnly(Op op,PetscBool affine); 24 | PetscErrorCode OpGetAffineOnly(Op op,PetscBool *affine); 25 | PetscErrorCode OpSetDestroy(Op,PetscErrorCode (*)(Op)); 26 | PetscErrorCode OpRegister(const char name[],PetscErrorCode (*f)(Op)); 27 | PetscErrorCode OpInitializePackage(void); 28 | PetscErrorCode OpFinalizePackage(void); 29 | PetscErrorCode OpDestroy(Op*); 30 | PetscErrorCode OpApply(Op op,DM dm,Vec U,Vec F); 
#!/usr/bin/env python

import re

# Matches the definition line of a per-operator constructor, e.g.
# "PetscErrorCode OpCreate_Poisson1(Op op)"; group 1 is the suffix after
# "OpCreate_".  Anchored at column 0 (re.match), so indented mentions are
# ignored.
CREATE_RE = re.compile(r'PetscErrorCode OpCreate_(\w+) ?\(')

def mangle(name):
    """Map a C suffix such as 'Poisson_1' to its (option-name, C-name) pair,
    e.g. ('poisson-1', 'Poisson_1')."""
    return name.lower().replace('_', '-'), name

def build_ops(files):
    """Scan the given C source files and return one (option-name, C-suffix)
    pair for every OpCreate_* constructor definition found, in file order."""
    ops = []
    for src in files:
        with open(src) as f:
            for line in f:
                m = CREATE_RE.match(line)
                if m:
                    ops.append(mangle(m.group(1)))
    return ops

def genregister(outname, files):
    """Generate a C source file 'outname' that declares every operator
    constructor found in 'files' and registers each one with OpRegister
    inside OpRegisterAll_Generated()."""
    ops = build_ops(files)
    with open(outname, 'w') as out:
        # NOTE(review): the original template's #include target was lost in
        # extraction; <op/fefas-op.h> matches the -I$(HPGMG_FE_DIR) flag added
        # for register.o in op/local.mk -- confirm against the build.
        # The generated function takes (void): an empty parameter list in a
        # C definition declares an unprototyped function.
        out.write("""#include <op/fefas-op.h>

%(opdecl)s

PetscErrorCode OpRegisterAll_Generated(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  %(opreg)s
  PetscFunctionReturn(0);
}
""" % dict(opdecl='\n'.join('PetscErrorCode OpCreate_%s(Op);' % (o[1],) for o in ops),
           opreg='\n  '.join(['ierr = OpRegister("%s",OpCreate_%s);CHKERRQ(ierr);' % o for o in ops])))

if __name__ == '__main__':
    import sys
    genregister(sys.argv[1], sys.argv[2:])
op-impls.c := $(wildcard $(call thisdir,op-*.c)) 2 | genregister := $(call thisdir,genregister.py) 3 | register.c := $(OBJDIR)/register.c 4 | 5 | hpgmg-fe-y.c += $(call thisdir, \ 6 | op.c \ 7 | ) $(op-impls.c) $(register.c) 8 | 9 | $(register.c) : $(genregister) $(op-impls.c) | $$(@D)/.DIR 10 | $(PYTHON) $(genregister) $@ $(op-impls.c) 11 | 12 | HPGMG_FE_DIR := $(call thisdir,..) 13 | $(OBJDIR)/register.o : HPGMG_CPPFLAGS += -I$(HPGMG_FE_DIR) 14 | -------------------------------------------------------------------------------- /finite-element/pointwise.h: -------------------------------------------------------------------------------- 1 | #ifndef _pointwise_h 2 | #define _pointwise_h 3 | 4 | #include 5 | #include "fefas-align.h" 6 | 7 | static PetscErrorCode PointwiseJacobianInvert(PetscInt ne,PetscInt Q,const PetscReal w[Q],PetscScalar dx[3][3][Q][ne],PetscScalar wdxdet[Q][ne]) 8 | { 9 | PetscInt i,j,k,e; 10 | 11 | for (i=0; i 6 | 7 | #ifndef __FMA__ 8 | # define _mm256_fmadd_pd(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b),c) 9 | #endif 10 | 11 | #define NE 4 12 | 13 | static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) 14 | { 15 | 16 | PetscFunctionBegin; 17 | if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} 18 | { 19 | PetscReal R[Q][P],S[Q][P],T[Q][P]; 20 | const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx; 21 | PetscScalar (*y)[P*P*P][NE] = (PetscScalar(*)[Q*Q*Q][NE])yy; 22 | PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align; 23 | 24 | for (PetscInt i=0; ine == 4) { 105 | PetscInt P = ten->P,Q = ten->Q; 106 | switch (ten->dof) { 107 | case 1: // Scalar problems with Q1 or Q2 elements 108 | if (P == 2 && Q == 2) ten->Contract = TensorContract_FMA_4_1_2_2; 109 | else if (P == 3 && Q == 3) ten->Contract = TensorContract_FMA_4_1_3_3; 110 | break; 111 | case 3: // 
Coordinates or elasticity 112 | if (P == 2 && Q == 2) ten->Contract = TensorContract_FMA_4_3_2_2; 113 | else if (P == 3 && Q == 3) ten->Contract = TensorContract_FMA_4_3_3_3; 114 | break; 115 | } 116 | } 117 | #endif 118 | PetscFunctionReturn(0); 119 | } 120 | -------------------------------------------------------------------------------- /finite-element/tensor.c: -------------------------------------------------------------------------------- 1 | #include "tensorimpl.h" 2 | 3 | static inline PetscErrorCode TensorContract_Inline(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) 4 | { 5 | 6 | PetscFunctionBegin; 7 | if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} 8 | { 9 | PetscReal R[Q][P],S[Q][P],T[Q][P]; 10 | const PetscScalar (*restrict x)[P*P*P][ne]_align = (const PetscScalar(*)[P*P*P][ne])xx; 11 | PetscScalar (*restrict y)[P*P*P][ne]_align = (PetscScalar(*)[Q*Q*Q][ne])yy; 12 | PetscScalar u[dof][Q*P*P][ne]_align,v[dof][Q*Q*P][ne]_align; 13 | 14 | for (PetscInt i=0; ine,dof = ten->dof,P = ten->P,Q = ten->Q; 65 | return TensorContract_Inline(ne,dof,P,Q,Rf,Sf,Tf,tmode,xx,yy); 66 | } 67 | 68 | static PetscErrorCode TensorContract_Ref_4_1_2_2(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 69 | return TensorContract_Inline(4,1,2,2,Rf,Sf,Tf,tmode,xx,yy); 70 | } 71 | static PetscErrorCode TensorContract_Ref_4_3_2_2(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 72 | return TensorContract_Inline(4,3,2,2,Rf,Sf,Tf,tmode,xx,yy); 73 | } 74 | static PetscErrorCode TensorContract_Ref_4_1_3_3(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { 75 | return 
TensorContract_Inline(4,1,3,3,Rf,Sf,Tf,tmode,xx,yy);
}
static PetscErrorCode TensorContract_Ref_4_3_3_3(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) {
  return TensorContract_Inline(4,3,3,3,Rf,Sf,Tf,tmode,xx,yy);
}

/* Architecture-specific kernel selectors.  NOTE(review): in tensor-fma.c the
   AVX/FMA selector replaces ten->Contract only for ne==4 and the common
   dof/P/Q combinations; presumably the QPX one behaves analogously -- confirm
   against tensor-qpx.c, which is outside this view. */
PetscErrorCode TensorSelect_AVX(Tensor);
PetscErrorCode TensorSelect_QPX(Tensor);

/* Create a tensor-contraction context for batches of ne elements with dof
   components per node and a P-point to Q-point 1-D basis.  Starts with the
   generic reference kernel, installs a compile-time-specialized reference
   variant for the common (ne=4, dof=1|3, P=Q=2|3) cases, then lets the
   AVX/QPX selectors override the choice. */
PetscErrorCode TensorCreate(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,Tensor *ten) {
  Tensor t;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscNew(&t);CHKERRQ(ierr);
  t->ne = ne;
  t->dof = dof;
  t->P = P;
  t->Q = Q;
  /* Generic fallback; works for any (ne,dof,P,Q). */
  t->Contract = TensorContract_Ref;
  if (ne == 4) {
    switch (dof) {
    case 1: // Scalar problems with Q1 or Q2 elements
      if (P == 2 && Q == 2) t->Contract = TensorContract_Ref_4_1_2_2;
      else if (P == 3 && Q == 3) t->Contract = TensorContract_Ref_4_1_3_3;
      break;
    case 3: // Coordinates or elasticity
      if (P == 2 && Q == 2) t->Contract = TensorContract_Ref_4_3_2_2;
      else if (P == 3 && Q == 3) t->Contract = TensorContract_Ref_4_3_3_3;
      break;
    }
  }
  /* Selector order matters: QPX runs last and may override AVX's choice. */
  ierr = TensorSelect_AVX(t);CHKERRQ(ierr);
  ierr = TensorSelect_QPX(t);CHKERRQ(ierr);
  *ten = t;
  PetscFunctionReturn(0);
}

/* Free a context allocated by TensorCreate. */
PetscErrorCode TensorDestroy(Tensor *ten) {
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFree(*ten);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Dispatch to whichever kernel was selected at creation time. */
PetscErrorCode TensorContract(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) {
  return (*ten->Contract)(ten,Rf,Sf,Tf,tmode,xx,yy);
}
_tensor_h 2 | #define _tensor_h 3 | 4 | #include 5 | #include "fefas-align.h" 6 | 7 | typedef enum {TENSOR_EVAL,TENSOR_TRANSPOSE} TensorMode; 8 | 9 | typedef struct Tensor_private *Tensor; 10 | 11 | PetscErrorCode TensorCreate(PetscInt ne,PetscInt dof,PetscInt P,PetscInt Q,Tensor *ten); 12 | PetscErrorCode TensorDestroy(Tensor *ten); 13 | PetscErrorCode TensorContract(Tensor ten,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /finite-element/tensorimpl.h: -------------------------------------------------------------------------------- 1 | #ifndef _tensorimpl_h 2 | #define _tensorimpl_h 3 | 4 | #include "tensor.h" 5 | 6 | struct Tensor_private { 7 | PetscInt ne; 8 | PetscInt dof; 9 | PetscInt P; 10 | PetscInt Q; 11 | PetscErrorCode (*Contract)(Tensor,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]); 12 | }; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /finite-element/test/Makefile: -------------------------------------------------------------------------------- 1 | # Run tests 2 | # 3 | # Copyright (c) 2011-2012 Mathias Lafeldt 4 | # Copyright (c) 2005-2012 Git project 5 | # Copyright (c) 2005-2012 Junio C Hamano 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 2 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 
16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program. If not, see http://www.gnu.org/licenses/ . 19 | 20 | SHELL_PATH ?= $(SHELL) 21 | SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) 22 | RM ?= rm -f 23 | PROVE ?= prove 24 | AGGREGATE_SCRIPT ?= aggregate-results.sh 25 | DEFAULT_TEST_TARGET ?= test 26 | 27 | T = $(wildcard t[0-9]*.sh) 28 | 29 | all: $(DEFAULT_TEST_TARGET) 30 | 31 | test: pre-clean 32 | $(MAKE) aggregate-results-and-cleanup 33 | 34 | prove: pre-clean 35 | @echo "*** prove ***"; $(PROVE) --exec '$(SHELL_PATH_SQ)' $(PROVE_OPTS) $(T) :: $(TEST_OPTS) 36 | $(MAKE) clean-except-prove-cache 37 | 38 | $(T): 39 | @echo "*** $@ ***"; '$(SHELL_PATH_SQ)' $@ $(TEST_OPTS) 40 | 41 | pre-clean: 42 | $(RM) -r test-results 43 | 44 | clean-except-prove-cache: 45 | $(RM) -r 'trash directory'.* test-results 46 | 47 | clean: clean-except-prove-cache 48 | $(RM) .prove 49 | 50 | aggregate-results-and-cleanup: $(T) 51 | $(MAKE) aggregate-results 52 | $(MAKE) clean 53 | 54 | aggregate-results: 55 | for f in test-results/*.counts; do \ 56 | echo "$$f"; \ 57 | done | '$(SHELL_PATH_SQ)' '$(AGGREGATE_SCRIPT)' 58 | 59 | .PHONY: all test prove $(T) pre-clean clean 60 | .PHONY: aggregate-results-and-cleanup aggregate-results 61 | -------------------------------------------------------------------------------- /finite-element/test/aggregate-results.sh: -------------------------------------------------------------------------------- 1 | ../sharness/aggregate-results.sh -------------------------------------------------------------------------------- /finite-element/test/hpgmg-sharness.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./sharness.sh 4 | 5 | # Public: Run parallel executable and compare output 6 | # 7 | # When the test passed, an "ok" message is printed and the number of successful 8 | # tests is incremented. 
When it failed, a "not ok" message is printed and the
# number of failed tests is incremented.
#
# With --immediate, exit test immediately upon the first failed test.
#
# Usually takes four arguments:
# $1 - Test description
# $2 - Number of processes
# $3 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $4 - Expected output
#
# With five arguments, the first will be taken to be a prerequisite:
# $1 - Comma-separated list of test prerequisites. The test will be skipped if
#      not all of the given prerequisites are set. To negate a prerequisite,
#      put a "!" in front of it.
# $2 - Test description
# $3 - Number of processes
# $4 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $5 - Expected output
#
# Returns nothing.
test_expect_stdout() {
	# With 5 args the first is the prerequisite list; shift it away so the
	# remaining handling is identical for both call forms.
	test "$#" = 5 && { test_prereq=$1; shift; } || test_prereq=
	test "$#" = 4 || error "bug in test script: $# not 4 or 5 parameters to test_expect_stdout"

	export test_prereq
	if ! test_skip_ "$@"; then
		# fds 3 and 4 are the verbose-output descriptors set up by sharness --
		# NOTE(review): grounded in sharness convention, confirm in sharness.sh.
		say >&3 "expecting success: $2 $3"
		# $4 is conventionally written as '<newline>...<newline>'; drop the
		# first and last (empty) lines so the reference matches raw output.
		sed '1d;$d' <<<"$4" > reference.out
		diffoutput=
		# git diff --no-index compares two ordinary files outside a repo;
		# --exit-code makes it fail (and fill diffoutput) on any difference.
		if "${MPIEXEC}" -n $2 "${HPGMG_BINDIR}/"$3 > actual.out 2>&4 &&
			diffoutput=$(git diff --exit-code --no-index reference.out actual.out); then
			test_ok_ "$1"
		else
			test_failure_ "$1 $2 $3" "${diffoutput}"

		fi
	fi
	echo >&3 ""
}

# Public: Run parallel executable and check for failure with error message
#
# When the test passed, an "ok" message is printed and the number of successful
# tests is incremented. When it failed, a "not ok" message is printed and the
# number of failed tests is incremented.
#
# With --immediate, exit test immediately upon the first failed test.
#
# Usually takes four arguments:
# $1 - Test description
# $2 - Number of processes
# $3 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $4 - Expected string in error message (stderr)
#
# With five arguments, the first will be taken to be a prerequisite:
# $1 - Comma-separated list of test prerequisites. The test will be skipped if
#      not all of the given prerequisites are set. To negate a prerequisite,
#      put a "!" in front of it.
# $2 - Test description
# $3 - Number of processes
# $4 - Executable name (found in ${HPGMG_BINDIR}/) followed by runtime options
# $5 - Expected string in error message
#
# Returns nothing.
test_expect_error() {
	# With 5 args the first is the prerequisite list; shift it away so the
	# remaining handling is identical for both call forms.
	test "$#" = 5 && { test_prereq=$1; shift; } || test_prereq=
	# Fixed copy-paste: this message previously named test_expect_stdout.
	test "$#" = 4 || error "bug in test script: $# not 4 or 5 parameters to test_expect_error"

	export test_prereq
	if ! test_skip_ "$@"; then
		say >&3 "checking known breakage: $2 $3"
		# Drop the conventional leading newline of the expected-message arg.
		expected_stderr=$(sed '1d' <<<"$4")
		# Don't check exit code because process managers do not always propagate correctly
		"${MPIEXEC}" -n $2 "${HPGMG_BINDIR}/"$3 > /dev/null 2> actual.err
		# grep -F: fixed-string search, same behavior as the deprecated fgrep.
		if grep -F -q "${expected_stderr}" actual.err; then
			test_ok_ "$1"
		else
			test_failure_ "$1 $2 $3" "Expecting: ${expected_stderr}$(echo && cat actual.err)"
		fi
	fi
	echo >&3 ""
}

# Locate the mpiexec configured into the PETSc build: petscvariables contains
# a line of the form "MPIEXEC = <path>", so awk field 3 is the program path.
MPIEXEC=$(awk '/MPIEXEC/{print $3}' "${PETSC_DIR}/${PETSC_ARCH}/conf/petscvariables")
test_description='Test FE creation and scatters' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE GlobalToLocal fedegree=1' 4 'hpgmg-fe test-fespace -p 2,2,1 -M 4,4,2' ' 8 | Vec Object: 1 MPI processes 9 | type: seq 10 | 0 11 | 1 12 | 2 13 | 3 14 | 4 15 | 5 16 | 12 17 | 13 18 | 14 19 | 6 20 | 7 21 | 8 22 | 9 23 | 10 24 | 11 25 | 21 26 | 22 27 | 23 28 | 30 29 | 31 30 | 32 31 | 33 32 | 34 33 | 35 34 | 48 35 | 49 36 | 50 37 | ' 38 | 39 | test_expect_stdout 'FE Gradient/coordinates fedegree=1' 4 'hpgmg-fe test-fegrad -M 6,2,10 -p 2,1,2 -L 7,11,13' ' 40 | ' 41 | 42 | test_done 43 | -------------------------------------------------------------------------------- /finite-element/test/t030-feinject.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE coarsening and injection' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Inject fedegree=1' 4 'hpgmg-fe test-feinject -M 4,2,6 -p 2,1,2 -L 4,2,6' ' 8 | coarse u[ 0] = 0.0 at 0.0 0.0 0.0 9 | coarse u[ 1] = 2.0 at 0.0 0.0 2.0 10 | coarse u[ 2] = 4.0 at 0.0 0.0 4.0 11 | coarse u[ 3] = 6.0 at 0.0 0.0 6.0 12 | coarse u[ 4] = 2000.0 at 0.0 2.0 0.0 13 | coarse u[ 5] = 2002.0 at 0.0 2.0 2.0 14 | coarse u[ 6] = 2004.0 at 0.0 2.0 4.0 15 | coarse u[ 7] = 2006.0 at 0.0 2.0 6.0 16 | coarse u[ 8] = 2000000.0 at 2.0 0.0 0.0 17 | coarse u[ 9] = 2000002.0 at 2.0 0.0 2.0 18 | coarse u[10] = 2000004.0 at 2.0 0.0 4.0 19 | coarse u[11] = 2000006.0 at 2.0 0.0 6.0 20 | coarse u[12] = 2002000.0 at 2.0 2.0 0.0 21 | coarse u[13] = 2002002.0 at 2.0 2.0 2.0 22 | coarse u[14] = 2002004.0 at 2.0 2.0 4.0 23 | coarse u[15] = 2002006.0 at 2.0 2.0 6.0 24 | coarse u[16] = 4000000.0 at 4.0 0.0 0.0 25 | coarse u[17] = 4000002.0 at 4.0 0.0 2.0 26 | coarse u[18] = 4000004.0 at 4.0 0.0 4.0 27 | coarse u[19] = 4000006.0 at 4.0 0.0 6.0 28 | coarse u[20] = 4002000.0 at 4.0 2.0 0.0 29 | coarse u[21] = 4002002.0 at 4.0 2.0 2.0 30 | coarse u[22] = 4002004.0 at 4.0 2.0 
4.0 31 | coarse u[23] = 4002006.0 at 4.0 2.0 6.0 32 | ' 33 | 34 | test_done 35 | -------------------------------------------------------------------------------- /finite-element/test/t040-feinterp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE interpolation' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Interpolation fedegree=1 serial' 1 'hpgmg-fe test-feinterp -M 6,2,10 -L 6,2,10' ' 8 | |u - I Ihat u|_max = 0 9 | ' 10 | 11 | test_expect_stdout 'FE Interpolation fedegree=1 parallel' 4 'hpgmg-fe test-feinterp -M 6,2,10 -L 6,2,10 -p 2,1,2' ' 12 | |u - I Ihat u|_max = 0 13 | ' 14 | 15 | test_done 16 | -------------------------------------------------------------------------------- /finite-element/test/t045-ferestrict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test FE restriction' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Restriction fedegree=1 serial' 1 'hpgmg-fe test-ferestrict -M 4,4,6 -L 4,4,6' ' 8 | |u_c - I_h^H u_f|_max = 0 9 | ' 10 | 11 | test_expect_stdout 'FE Restriction fedegree=1 parallel' 4 'hpgmg-fe test-ferestrict -M 6,4,10 -L 6,4,10 -p 2,1,2' ' 12 | |u_c - I_h^H u_f|_max = 0 13 | ' 14 | 15 | test_expect_stdout 'FE Restriction fedegree=1 parallel ragged coarsening' 4 'hpgmg-fe test-ferestrict -M 4,4,12 -L 1,1,1 -p 1,1,4' ' 16 | |u_c - I_h^H u_f|_max = 0 17 | ' 18 | 19 | test_done 20 | -------------------------------------------------------------------------------- /finite-element/test/t100-poisson.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solver' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | # We expect second-order convergence on the residual for fedegree=1. 
8 | test_expect_stdout 'FE Poisson fedegree=1 serial' 1 'hpgmg-fe test-opapply -op_type poisson1 -M 4,8,12 -L 1,1,1 -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 4, 0: 8, 0: 12] of [ 4, 8, 12] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 12 | |A u - F|_max/|F|_max = 0.0978195 13 | ' 14 | 15 | test_expect_stdout 'FE Poisson fedegree=1 serial refined' 1 'hpgmg-fe test-opapply -op_type poisson1 -M 8,16,24 -L 1,1,1 -poisson_solution sine' ' 16 | [0] Level 3: [ 0: 8, 0: 16, 0: 24] of [ 8, 16, 24] on [ 1, 1, 1] 17 | [0] Level 2: [ 0: 4, 0: 8, 0: 12] of [ 4, 8, 12] on [ 1, 1, 1] 18 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 19 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 20 | |A u - F|_max/|F|_max = 0.0253888 21 | ' 22 | 23 | test_expect_stdout 'FE Poisson fedegree=1 parallel refined' 4 'hpgmg-fe test-opapply -op_type poisson1 -M 8,16,24 -L 1,1,1 -p 1,2,2 -cmax 48 -poisson_solution sine' ' 24 | [0] Level 3: [ 0: 8, 0: 8, 0: 12] of [ 8, 16, 24] on [ 1, 2, 2] 25 | [0] Level 2: [ 0: 4, 0: 4, 0: 6] of [ 4, 8, 12] on [ 1, 2, 2] 26 | [0] Level 1: [ 0: 2, 0: 4, 0: 6] of [ 2, 4, 6] on [ 1, 1, 1] 27 | [0] Level 0: [ 0: 1, 0: 2, 0: 3] of [ 1, 2, 3] on [ 1, 1, 1] 28 | [1] Level 3: [ 0: 8, 0: 8, 12: 24] of [ 8, 16, 24] on [ 1, 2, 2] 29 | [1] Level 2: [ 0: 4, 0: 4, 6: 12] of [ 4, 8, 12] on [ 1, 2, 2] 30 | [2] Level 3: [ 0: 8, 8: 16, 0: 12] of [ 8, 16, 24] on [ 1, 2, 2] 31 | [2] Level 2: [ 0: 4, 4: 8, 0: 6] of [ 4, 8, 12] on [ 1, 2, 2] 32 | [3] Level 3: [ 0: 8, 8: 16, 12: 24] of [ 8, 16, 24] on [ 1, 2, 2] 33 | [3] Level 2: [ 0: 4, 4: 8, 6: 12] of [ 4, 8, 12] on [ 1, 2, 2] 34 | |A u - F|_max/|F|_max = 0.0253888 35 | ' 36 | 37 | test_done 38 | -------------------------------------------------------------------------------- /finite-element/test/t110-poissondiag.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/sh 2 | 3 | test_description='Test Poisson diagonal' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'FE Poisson diagonal fedegree=1 serial' 1 'hpgmg-fe test-opdiagonal -op_type poisson1 -M 8,12,16 -L 1,1,1' ' 8 | [0] Level 2: [ 0: 8, 0: 12, 0: 16] of [ 8, 12, 16] on [ 1, 1, 1] 9 | [0] Level 1: [ 0: 4, 0: 6, 0: 8] of [ 4, 6, 8] on [ 1, 1, 1] 10 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 11 | |D|_1 = 310.139 |D|_2 = 9.12568 |D|_max = 0.268519 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson diagonal fedegree=1 parallel' 4 'hpgmg-fe test-opdiagonal -op_type poisson1 -M 8,12,16 -L 1,1,1 -p 2,1,2' ' 15 | [0] Level 2: [ 0: 4, 0: 12, 0: 8] of [ 8, 12, 16] on [ 2, 1, 2] 16 | [0] Level 1: [ 0: 2, 0: 6, 0: 4] of [ 4, 6, 8] on [ 2, 1, 2] 17 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 18 | [1] Level 2: [ 0: 4, 0: 12, 8: 16] of [ 8, 12, 16] on [ 2, 1, 2] 19 | [1] Level 1: [ 0: 2, 0: 6, 4: 8] of [ 4, 6, 8] on [ 2, 1, 2] 20 | [2] Level 2: [ 4: 8, 0: 12, 0: 8] of [ 8, 12, 16] on [ 2, 1, 2] 21 | [2] Level 1: [ 2: 4, 0: 6, 0: 4] of [ 4, 6, 8] on [ 2, 1, 2] 22 | [3] Level 2: [ 4: 8, 0: 12, 8: 16] of [ 8, 12, 16] on [ 2, 1, 2] 23 | [3] Level 1: [ 2: 4, 0: 6, 4: 8] of [ 4, 6, 8] on [ 2, 1, 2] 24 | |D|_1 = 310.139 |D|_2 = 9.12568 |D|_max = 0.268519 25 | ' 26 | 27 | test_done 28 | -------------------------------------------------------------------------------- /finite-element/test/t120-poissonksp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using KSP' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson KSP solve fedegree=1 serial' 1 'hpgmg-fe test-kspsolve -op_type poisson1 -M 8,12,16 -L 1,1,1 -ksp_converged_reason -ksp_view -ksp_type chebyshev -ksp_chebyshev_eigenvalues 0.2,2 -pc_type jacobi -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 8, 0: 12, 0: 16] of [ 8, 12, 16] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 4, 0: 6, 0: 8] of [ 4, 6, 8] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 12 | Linear solve converged due to CONVERGED_RTOL iterations 20 13 | KSP Object: 1 MPI processes 14 | type: chebyshev 15 | Chebyshev: eigenvalue estimates: min = 0.2, max = 2 16 | maximum iterations=10000, initial guess is zero 17 | tolerances: relative=1e-05, absolute=1e-50, divergence=10000 18 | left preconditioning 19 | using PRECONDITIONED norm type for convergence test 20 | PC Object: 1 MPI processes 21 | type: jacobi 22 | linear system matrix = precond matrix: 23 | Mat Object: 1 MPI processes 24 | type: shell 25 | rows=1989, cols=1989 26 | |v-u|_2/|u|_2 = 0.0393899 27 | ' 28 | 29 | test_expect_stdout 'FE Poisson KSP solve fedegree=1 parallel' 4 'hpgmg-fe test-kspsolve -op_type poisson1 -M 8,12,16 -L 1,1,1 -ksp_converged_reason -ksp_view -ksp_type chebyshev -ksp_chebyshev_eigenvalues 0.2,2 -pc_type jacobi -p 1,2,2 -poisson_solution sine' ' 30 | [0] Level 2: [ 0: 8, 0: 6, 0: 8] of [ 8, 12, 16] on [ 1, 2, 2] 31 | [0] Level 1: [ 0: 4, 0: 3, 0: 4] of [ 4, 6, 8] on [ 1, 2, 2] 32 | [0] Level 0: [ 0: 2, 0: 3, 0: 4] of [ 2, 3, 4] on [ 1, 1, 1] 33 | [1] Level 2: [ 0: 8, 0: 6, 8: 16] of [ 8, 12, 16] on [ 1, 2, 2] 34 | [1] Level 1: [ 0: 4, 0: 3, 4: 8] of [ 4, 6, 8] on [ 1, 2, 2] 35 | [2] Level 2: [ 0: 8, 6: 12, 0: 8] of [ 8, 12, 16] on [ 1, 2, 2] 36 | [2] Level 1: [ 0: 4, 3: 6, 0: 4] of [ 4, 6, 8] on [ 1, 2, 2] 37 | [3] Level 2: [ 0: 8, 6: 12, 8: 16] of [ 8, 12, 16] on [ 1, 2, 2] 38 | [3] Level 1: [ 0: 4, 3: 6, 4: 8] of [ 4, 6, 8] on [ 1, 2, 2] 39 | 
Linear solve converged due to CONVERGED_RTOL iterations 20 40 | KSP Object: 4 MPI processes 41 | type: chebyshev 42 | Chebyshev: eigenvalue estimates: min = 0.2, max = 2 43 | maximum iterations=10000, initial guess is zero 44 | tolerances: relative=1e-05, absolute=1e-50, divergence=10000 45 | left preconditioning 46 | using PRECONDITIONED norm type for convergence test 47 | PC Object: 4 MPI processes 48 | type: jacobi 49 | linear system matrix = precond matrix: 50 | Mat Object: 4 MPI processes 51 | type: shell 52 | rows=1989, cols=1989 53 | |v-u|_2/|u|_2 = 0.0393899 54 | ' 55 | 56 | test_done 57 | -------------------------------------------------------------------------------- /finite-element/test/t200-mgv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using MG V-cycles' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson MG V-cycle solve fedegree=1 serial' 1 'hpgmg-fe mgv -op_type poisson1 -M 16,20,24 -L 1,1,1 -smooth 2,2 -mg_eig_target 2,0.2 -poisson_solution sine' ' 9 | [0] Level 2: [ 0: 16, 0: 20, 0: 24] of [ 16, 20, 24] on [ 1, 1, 1] 10 | [0] Level 1: [ 0: 8, 0: 10, 0: 12] of [ 8, 10, 12] on [ 1, 1, 1] 11 | [0] Level 0: [ 0: 4, 0: 5, 0: 6] of [ 4, 5, 6] on [ 1, 1, 1] 12 | V(2,2) 0: |e|_2/|u|_2 1.50e-02 |r|_2/|f|_2 2.25e-01 13 | V(2,2) 1: |e|_2/|u|_2 1.06e-02 |r|_2/|f|_2 5.40e-02 14 | V(2,2) 2: |e|_2/|u|_2 1.27e-02 |r|_2/|f|_2 1.31e-02 15 | V(2,2) 3: |e|_2/|u|_2 1.34e-02 |r|_2/|f|_2 3.22e-03 16 | V(2,2) 4: |e|_2/|u|_2 1.35e-02 |r|_2/|f|_2 7.91e-04 17 | ' 18 | 19 | test_expect_stdout 'FE Poisson MG V-cycle solve fedegree=1 parallel' 4 'hpgmg-fe mgv -op_type poisson1 -M 16,20,24 -L 1,1,1 -p 1,2,2 -cmax 240 -smooth 2,2 -mg_eig_target 2,0.2 -poisson_solution sine' ' 20 | [0] Level 2: [ 0: 16, 0: 10, 0: 12] of [ 16, 20, 24] on [ 1, 2, 2] 21 | [0] Level 1: [ 0: 8, 0: 5, 0: 6] of [ 8, 10, 12] on [ 1, 2, 2] 22 
| [0] Level 0: [ 0: 4, 0: 5, 0: 6] of [ 4, 5, 6] on [ 1, 1, 1] 23 | [1] Level 2: [ 0: 16, 0: 10, 12: 24] of [ 16, 20, 24] on [ 1, 2, 2] 24 | [1] Level 1: [ 0: 8, 0: 5, 6: 12] of [ 8, 10, 12] on [ 1, 2, 2] 25 | [2] Level 2: [ 0: 16, 10: 20, 0: 12] of [ 16, 20, 24] on [ 1, 2, 2] 26 | [2] Level 1: [ 0: 8, 5: 10, 0: 6] of [ 8, 10, 12] on [ 1, 2, 2] 27 | [3] Level 2: [ 0: 16, 10: 20, 12: 24] of [ 16, 20, 24] on [ 1, 2, 2] 28 | [3] Level 1: [ 0: 8, 5: 10, 6: 12] of [ 8, 10, 12] on [ 1, 2, 2] 29 | V(2,2) 0: |e|_2/|u|_2 1.50e-02 |r|_2/|f|_2 2.25e-01 30 | V(2,2) 1: |e|_2/|u|_2 1.06e-02 |r|_2/|f|_2 5.40e-02 31 | V(2,2) 2: |e|_2/|u|_2 1.27e-02 |r|_2/|f|_2 1.31e-02 32 | V(2,2) 3: |e|_2/|u|_2 1.34e-02 |r|_2/|f|_2 3.22e-03 33 | V(2,2) 4: |e|_2/|u|_2 1.35e-02 |r|_2/|f|_2 7.91e-04 34 | ' 35 | 36 | test_done 37 | -------------------------------------------------------------------------------- /finite-element/test/t220-fmg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Poisson solve using FMG' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at second order 8 | test_expect_stdout 'FE Poisson FMG solve fedegree=1 serial' 1 'hpgmg-fe fmg -op_type poisson1 -M 8,16,24 -smooth 3,3 -mg_eig_target 2,0.2 -poisson_solution sine' ' 9 | F(3,3) 0: |e|_2/|u|_2 2.26e-02 |r|_2/|f|_2 3.37e-02 10 | V(3,3) 1: |e|_2/|u|_2 2.58e-02 |r|_2/|f|_2 2.05e-03 11 | V(3,3) 2: |e|_2/|u|_2 2.60e-02 |r|_2/|f|_2 1.25e-04 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson FMG solve fedegree=1 parallel' 4 'hpgmg-fe fmg -op_type poisson1 -M 8,16,24 -p 1,2,2 -smooth 3,3 -mg_eig_target 2,0.2 -poisson_solution sine' ' 15 | F(3,3) 0: |e|_2/|u|_2 2.26e-02 |r|_2/|f|_2 3.37e-02 16 | V(3,3) 1: |e|_2/|u|_2 2.58e-02 |r|_2/|f|_2 2.05e-03 17 | V(3,3) 2: |e|_2/|u|_2 2.60e-02 |r|_2/|f|_2 1.25e-04 18 | ' 19 | 20 | test_done 21 | -------------------------------------------------------------------------------- /finite-element/test/t230-fmg-poisson2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test Q2 Poisson solve using FMG' 4 | 5 | . 
./hpgmg-sharness.sh 6 | 7 | # Error norm is converging at fourth order (superconvergent at Lagrange nodes) 8 | test_expect_stdout 'FE Poisson FMG solve fedegree=2 serial' 1 'hpgmg-fe fmg -op_type poisson2 -M 4,4,6 -smooth 4,3' ' 9 | F(4,3) 0: |e|_2/|u|_2 9.08e-03 |r|_2/|f|_2 3.35e-04 10 | V(4,3) 1: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 8.27e-07 11 | V(4,3) 2: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 5.54e-09 12 | ' 13 | 14 | test_expect_stdout 'FE Poisson FMG solve fedegree=2 parallel' 4 'hpgmg-fe fmg -op_type poisson2 -M 4,4,6 -smooth 4,3 -p 1,2,2' ' 15 | F(4,3) 0: |e|_2/|u|_2 9.08e-03 |r|_2/|f|_2 3.35e-04 16 | V(4,3) 1: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 8.27e-07 17 | V(4,3) 2: |e|_2/|u|_2 9.17e-03 |r|_2/|f|_2 5.54e-09 18 | ' 19 | 20 | test_done 21 | -------------------------------------------------------------------------------- /finite-element/test/t60-sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test_description='Test grid creation' 4 | 5 | . ./hpgmg-sharness.sh 6 | 7 | test_expect_stdout 'list of samples' 1 'hpgmg-fe test-sampler -local 100,1e9 -maxsamples 10 -nranks 192' ' 8 | Processors: [ 4 6 8] = 192 9 | Filtered Grid: L11 [4096 6144 6144] = 154618822656 10 | Filtered Grid: L12 [4096 4096 4096] = 68719476736 11 | Filtered Grid: L10 [2048 2048 3072] = 12884901888 12 | Filtered Grid: L 9 [1024 1024 1536] = 1610612736 13 | Filtered Grid: L 8 [ 512 512 768] = 201326592 14 | Filtered Grid: L 7 [ 256 256 384] = 25165824 15 | Filtered Grid: L 6 [ 128 128 192] = 3145728 16 | Filtered Grid: L 5 [ 64 64 96] = 393216 17 | Filtered Grid: L 4 [ 32 32 48] = 49152 18 | Filtered Grid: L 3 [ 24 24 32] = 18432 19 | ' 20 | 21 | test_done 22 | -------------------------------------------------------------------------------- /finite-volume/README: -------------------------------------------------------------------------------- 1 | *** Copyright Notice *** 2 | 3 | Copyright (c) 2014-2015, NVIDIA CORPORATION. 
All rights reserved. 4 | 5 | The U.S. Department of Energy funded the development of this software 6 | under subcontract B609478 with Lawrence Livermore National Security (LLNS). 7 | 8 | HPGMG, Copyright (c) 2014, The Regents of the University of 9 | California, through Lawrence Berkeley National Laboratory (subject to 10 | receipt of any required approvals from the U.S. Dept. of Energy). All 11 | rights reserved. 12 | 13 | If you have questions about your rights to use or distribute this 14 | software, please contact Berkeley Lab's Technology Transfer Department 15 | at TTD@lbl.gov. 16 | 17 | NOTICE. This software is owned by the U.S. Department of Energy. As 18 | such, the U.S. Government has been granted for itself and others 19 | acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide 20 | license in the Software to reproduce, prepare derivative works, and 21 | perform publicly and display publicly. Beginning five (5) years after 22 | the date permission to assert copyright is obtained from the U.S. 23 | Department of Energy, and subject to any subsequent five (5) year 24 | renewals, the U.S. Government is granted for itself and others acting 25 | on its behalf a paid-up, nonexclusive, irrevocable, worldwide license 26 | in the Software to reproduce, prepare derivative works, distribute 27 | copies to the public, perform publicly and display publicly, and to 28 | permit others to do so. 29 | **************************** 30 | 31 | This directory contains the current HPGMG finite-volume benchmark. 32 | 33 | Please see ./source/README for details on how to compiler, run, 34 | optimize, and examine the output of the hpgmg finite-volume benchmark. 
35 | 36 | Example job scripts are in the ./example_jobs directory 37 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.biou.00008: -------------------------------------------------------------------------------- 1 | #PBS -N HPGMG-FV 2 | #PBS -q parallel 3 | #PBS -l nodes=2:ppn=32,walltime=00:30:00 4 | #PBS -m abe 5 | #PBS -V 6 | 7 | cd $PBS_O_WORKDIR 8 | 9 | module load openmpi/1.8.1-gcc 10 | #module load openmpi/1.6.5-gcc 11 | 12 | 13 | # Hybrid MPI + OpenMP 14 | export OMP_NUM_THREADS=8 15 | mpiexec -report-bindings -np 8 --map-by node -bind-to numa ./run.power7 7 1 16 | #mpiexec -report-bindings -np 1 -npernode 1 -cpus-per-proc 8 -bind-to-core ./run.power7 7 1 17 | #mpiexec -report-bindings -np 8 -npernode 4 -cpus-per-proc 8 -bind-to-core ./run.power7 7 1 18 | 19 | # flat MPI 20 | export OMP_NUM_THREADS=1 21 | mpiexec -report-bindings -np 64 --map-by node ./run.power7 6 1 22 | #mpiexec -report-bindings -np 8 -npernode 8 -cpus-per-proc 1 -bind-to-core ./run.power7 6 1 23 | #mpiexec -report-bindings -np 64 -npernode 32 -cpus-per-proc 1 -bind-to-core ./run.power7 6 1 24 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00064: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q debug 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00064 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=32:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 
1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00128: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00128 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=64:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 128 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 4 18 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 8 19 | mpirun -np 32 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 2 20 | mpirun -np 16 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 4 21 | 22 | mpirun -np 125 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 26 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 27 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.00512: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.00512 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=256:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 
10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | mpirun -np 125 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 22 | mpirun -np 216 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 343 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 512 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.carver.01728: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -q regular 3 | #PBS -N HPGMG 4 | #PBS -o results.carver.01728 5 | #PBS -j oe 6 | #PBS -l walltime=0:30:00 7 | #PBS -l nodes=864:ppn=8 8 | #####PBS -l pvmem=10GB 9 | 10 | 11 | set -x 12 | cd $PBS_O_WORKDIR 13 | module swap pgi intel 14 | module swap openmpi openmpi-intel 15 | 16 | export OMP_NUM_THREADS=4 17 | mpirun -np 1 -report-bindings -npernode 1 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 18 | mpirun -np 8 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 19 | mpirun -np 27 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 20 | mpirun -np 64 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 21 | mpirun -np 125 -report-bindings -npernode 2 
-npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 22 | mpirun -np 216 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 23 | mpirun -np 343 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 24 | mpirun -np 512 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 25 | mpirun -np 729 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 26 | mpirun -np 1000 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 27 | mpirun -np 1331 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 28 | mpirun -np 1728 -report-bindings -npernode 2 -npersocket 1 -bysocket -bind-to-socket ./run.carver 7 1 29 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.00064: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.00064 4 | #PBS -q debug 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=768 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | #export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | #export OMP_NUM_THREADS=12 14 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 9 1 15 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 8 1 16 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 6 1 18 | #aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 5 1 19 | 20 | export OMP_NUM_THREADS=12 21 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 23 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 24 | 25 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 26 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 27 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 28 | 29 | #export OMP_NUM_THREADS=1 
30 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 31 | #aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 32 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 33 | # 34 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | #aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 36 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 37 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.00512: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.00512 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=6144 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 20 | aprun -n 384 -N 2 -S 1 -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | export OMP_NUM_THREADS=12 23 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 64 24 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 64 25 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 64 26 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 64 27 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 64 28 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 64 29 | aprun -n 384 -N 2 -S 1 -cc numa_node ./run.edison 7 64 30 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 64 31 | 32 | 33 | #export OMP_NUM_THREADS=1 34 | #aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | #aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 36 | #aprun 
-n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 37 | #aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 38 | #aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 39 | #aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 40 | #aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 41 | #aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 42 | 43 | 44 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.01024: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.01024 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=12288 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | #export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 20 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 22 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 1 23 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 24 | export OMP_NUM_THREADS=1 25 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 26 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 27 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 28 | aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 29 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 30 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 31 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 32 | aprun -n 4096 -N 16 -S 8 -cc numa_node 
./run.edison.flat 6 1 33 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 34 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 35 | 36 | 37 | export OMP_NUM_THREADS=12 38 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 8 39 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 8 40 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 8 41 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 8 42 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 8 43 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 8 44 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 8 45 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 8 46 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 8 47 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 8 48 | export OMP_NUM_THREADS=1 49 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 8 50 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 51 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 52 | aprun -n 512 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 53 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 54 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 55 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 56 | aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 57 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 58 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 8 59 | 60 | 61 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.08000: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.08000 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:59:00 7 | #PBS -l mppwidth=98304 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 64000 -N 16 -S 8 -cc 
numa_node ./run.edison.flat 6 1 16 | aprun -n 46656 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 17 | aprun -n 32768 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 18 | aprun -n 27000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 19 | aprun -n 21952 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 20 | aprun -n 13824 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 21 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 22 | 23 | 24 | export OMP_NUM_THREADS=12 25 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 26 | aprun -n 6859 -N 2 -S 1 -cc numa_node ./run.edison 7 1 27 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 7 1 28 | aprun -n 4913 -N 2 -S 1 -cc numa_node ./run.edison 7 1 29 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 7 1 30 | aprun -n 3375 -N 2 -S 1 -cc numa_node ./run.edison 7 1 31 | aprun -n 2744 -N 2 -S 1 -cc numa_node ./run.edison 7 1 32 | aprun -n 2197 -N 2 -S 1 -cc numa_node ./run.edison 7 1 33 | aprun -n 1728 -N 2 -S 1 -cc numa_node ./run.edison 7 1 34 | aprun -n 1331 -N 2 -S 1 -cc numa_node ./run.edison 7 1 35 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 36 | 37 | 38 | export OMP_NUM_THREADS=1 39 | aprun -n 64000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 40 | aprun -n 46656 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 41 | aprun -n 32768 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 42 | aprun -n 27000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 43 | aprun -n 21952 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 44 | aprun -n 13824 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 45 | aprun -n 8000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 46 | aprun -n 5832 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 47 | aprun -n 4096 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 48 | aprun -n 2744 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 49 | aprun -n 1728 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 50 | aprun -n 1000 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 51 | aprun -n 512 -N 16 -S 
8 -cc numa_node ./run.edison.flat 6 1 52 | aprun -n 216 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 53 | aprun -n 64 -N 16 -S 8 -cc numa_node ./run.edison.flat 6 1 54 | aprun -n 8 -N 8 -S 8 -cc numa_node ./run.edison.flat 6 1 55 | 56 | 57 | export OMP_NUM_THREADS=12 58 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 59 | aprun -n 6859 -N 2 -S 1 -cc numa_node ./run.edison 7 1 60 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 7 1 61 | aprun -n 4913 -N 2 -S 1 -cc numa_node ./run.edison 7 1 62 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 7 1 63 | aprun -n 3375 -N 2 -S 1 -cc numa_node ./run.edison 7 1 64 | aprun -n 2744 -N 2 -S 1 -cc numa_node ./run.edison 7 1 65 | aprun -n 2197 -N 2 -S 1 -cc numa_node ./run.edison 7 1 66 | aprun -n 1728 -N 2 -S 1 -cc numa_node ./run.edison 7 1 67 | aprun -n 1331 -N 2 -S 1 -cc numa_node ./run.edison 7 1 68 | aprun -n 1000 -N 2 -S 1 -cc numa_node ./run.edison 7 1 69 | aprun -n 729 -N 2 -S 1 -cc numa_node ./run.edison 7 1 70 | aprun -n 512 -N 2 -S 1 -cc numa_node ./run.edison 7 1 71 | aprun -n 343 -N 2 -S 1 -cc numa_node ./run.edison 7 1 72 | aprun -n 216 -N 2 -S 1 -cc numa_node ./run.edison 7 1 73 | aprun -n 125 -N 2 -S 1 -cc numa_node ./run.edison 7 1 74 | aprun -n 64 -N 2 -S 1 -cc numa_node ./run.edison 7 1 75 | aprun -n 27 -N 2 -S 1 -cc numa_node ./run.edison 7 1 76 | aprun -n 8 -N 2 -S 1 -cc numa_node ./run.edison 7 1 77 | aprun -n 1 -N 1 -S 1 -cc numa_node ./run.edison 7 1 78 | 79 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.10648: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.10648 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:59:00 7 | #PBS -l mppwidth=129024 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | 12 | export OMP_NUM_THREADS=12 13 | aprun -n 9216 -N 2 -S 1 -cc numa_node ./run.edison 7 12 14 | aprun -n 10648 -N 2 -S 1 -cc 
numa_node ./run.edison 7 8 15 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 11 16 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 17 17 | aprun -n 10752 -N 2 -S 1 -cc numa_node ./run.edison 7 25 18 | aprun -n 10368 -N 2 -S 1 -cc numa_node ./run.edison 7 36 19 | aprun -n 10240 -N 2 -S 1 -cc numa_node ./run.edison 7 50 20 | aprun -n 10648 -N 2 -S 1 -cc numa_node ./run.edison 7 64 21 | 22 | aprun -n 10648 -N 2 -S 1 -cc numa_node ./run.edison 8 8 23 | aprun -n 8000 -N 2 -S 1 -cc numa_node ./run.edison 8 8 24 | aprun -n 5832 -N 2 -S 1 -cc numa_node ./run.edison 8 8 25 | aprun -n 4096 -N 2 -S 1 -cc numa_node ./run.edison 8 8 26 | 27 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.4096.strong: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.4096.strong2 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:15:00 7 | #PBS -l mppwidth=49152 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 1 -N 1 -S 1 -ss -cc numa_node ./run.edison.flat 7 64 16 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 7 8 17 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 18 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 19 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 20 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.pstate: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.01024.pstate 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=12288 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export 
UGNI_CDM_MDD_DEDICATED=2 12 | 13 | export OMP_NUM_THREADS=12 14 | aprun -n 1 -N 1 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 15 | aprun -n 8 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 16 | aprun -n 27 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 17 | aprun -n 64 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 18 | aprun -n 125 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 19 | aprun -n 216 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 20 | aprun -n 343 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 21 | aprun -n 512 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 22 | aprun -n 729 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 23 | aprun -n 1000 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 24 | export OMP_NUM_THREADS=12 25 | aprun -n 1 -N 1 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 26 | aprun -n 8 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 27 | aprun -n 27 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 28 | aprun -n 64 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 29 | aprun -n 125 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 30 | aprun -n 216 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 31 | aprun -n 343 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 32 | aprun -n 512 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 33 | aprun -n 729 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 34 | aprun -n 1000 -N 2 -S 1 --p-state 2400000 -ss -cc numa_node ./run.edison 7 1 35 | 36 | export OMP_NUM_THREADS=1 37 | aprun -n 8 -N 8 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 38 | aprun -n 64 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 39 | aprun -n 216 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 40 | aprun -n 512 -N 
16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 41 | aprun -n 1000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 42 | aprun -n 1728 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 43 | aprun -n 2744 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 44 | aprun -n 4096 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 45 | aprun -n 5832 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 46 | aprun -n 8000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 47 | export OMP_NUM_THREADS=1 48 | aprun -n 8 -N 8 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 49 | aprun -n 64 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 50 | aprun -n 216 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 51 | aprun -n 512 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 52 | aprun -n 1000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 53 | aprun -n 1728 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 54 | aprun -n 2744 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 55 | aprun -n 4096 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 56 | aprun -n 5832 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 57 | aprun -n 8000 -N 16 -S 8 --p-state 2400000 -ss -cc numa_node ./run.edison 6 1 58 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.edison.strong: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.edison.strong 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=49152 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | export UGNI_CDM_MDD_DEDICATED=2 12 | 13 | 14 | export OMP_NUM_THREADS=1 15 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 16 | aprun -n 16 -N 16 -S 8 -ss -cc 
numa_node ./run.edison.flat 7 4 17 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 18 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 19 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 20 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 21 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 22 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 23 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 24 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 25 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 26 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 27 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 28 | 29 | export OMP_NUM_THREADS=1 30 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 31 | aprun -n 16 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 4 32 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 33 | aprun -n 64 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 1 34 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 35 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 36 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 37 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 38 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 39 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 40 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 41 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 42 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 43 | 44 | export OMP_NUM_THREADS=1 45 | aprun -n 8 -N 8 -S 8 -ss -cc numa_node ./run.edison.flat 8 1 46 | aprun -n 16 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 4 47 | aprun -n 32 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 7 2 48 | aprun -n 64 -N 16 -S 8 -ss 
-cc numa_node ./run.edison.flat 7 1 49 | aprun -n 128 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 4 50 | aprun -n 256 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 2 51 | aprun -n 512 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 6 1 52 | aprun -n 1024 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 4 53 | aprun -n 2048 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 2 54 | aprun -n 4096 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 5 1 55 | aprun -n 8192 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 4 56 | aprun -n 16384 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 2 57 | aprun -n 32768 -N 16 -S 8 -ss -cc numa_node ./run.edison.flat 4 1 58 | 59 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.01000: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.01000 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=6144 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 27 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 23 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 24 | export OMP_NUM_THREADS=6 25 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 26 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 27 -d 6 -N 4 -S 
1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 32 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 33 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 34 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 35 | export OMP_NUM_THREADS=6 36 | aprun -n 1 -d 6 -ss -cc numa_node ./run.hopper 7 1 37 | aprun -n 8 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 38 | aprun -n 27 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 39 | aprun -n 64 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 40 | aprun -n 125 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 41 | aprun -n 216 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 42 | aprun -n 343 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 43 | aprun -n 512 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 44 | aprun -n 729 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 45 | aprun -n 1000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 46 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.04096: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.04096 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=24576 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 1331 -d 6 -N 4 -S 
1 -ss -cc numa_node ./run.hopper 7 1 19 | export OMP_NUM_THREADS=6 20 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 23 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 24 | aprun -n 1331 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 25 | export OMP_NUM_THREADS=6 26 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 3375 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 2744 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 1728 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 1331 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.09261: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.09261 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=55584 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 5832 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.13824: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.13824 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=82944 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 
13 | export OMP_NUM_THREADS=6 14 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 15 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 16 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 5832 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 4096 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.21952: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.21952 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:29:00 7 | #PBS -l mppwidth=131712 8 | 9 | set -x 10 | cd $PBS_O_WORKDIR 11 | module swap PrgEnv-pgi PrgEnv-intel 12 | 13 | export OMP_NUM_THREADS=6 14 | aprun -n 21296 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 4 15 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 8 16 | aprun -n 20736 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 18 17 | aprun -n 18432 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 6 18 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 12 19 | #aprun -n 23328 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 2 20 | #aprun -n 25088 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 7 21 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.hopper.special: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -N HPGMG 3 | #PBS -o results.hopper.special4 4 | #PBS -q regular 5 | #PBS -j oe 6 | #PBS -l walltime=0:19:00 7 | #PBS -l mppwidth=131712 8 | #PBS -W x=FLAGS:ADVRES:samw.755 9 | 10 | set -x 11 | cd $PBS_O_WORKDIR 12 | module swap PrgEnv-pgi PrgEnv-intel 13 | 14 | export OMP_NUM_THREADS=6 15 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc 
numa_node ./run.hopper 7 1 16 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 17 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 18 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 19 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 20 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 21 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 22 | 23 | export OMP_NUM_THREADS=6 24 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 25 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 26 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 27 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 28 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 29 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 30 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 31 | 32 | export OMP_NUM_THREADS=6 33 | aprun -n 21952 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 34 | aprun -n 19683 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 35 | aprun -n 15625 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 36 | aprun -n 13824 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 37 | aprun -n 10648 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 38 | aprun -n 9261 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 39 | aprun -n 8000 -d 6 -N 4 -S 1 -ss -cc numa_node ./run.hopper 7 1 40 | 41 | -------------------------------------------------------------------------------- /finite-volume/example_jobs/job.titan: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #PBS -o results.titan 3 | #PBS -q debug 4 | #PBS -j oe 5 | #PBS -l walltime=0:10:00 6 | #PBS -l nodes=1 7 | 8 | source $MODULESHOME/init/bash 9 | module load cudatoolkit 10 | export PMI_NO_FORK=1 11 | 12 | set -x 13 | cd $PBS_O_WORKDIR 14 | export OMP_NUM_THREADS=4 15 | 16 | # 
normal run 17 | aprun -n 1 -N 1 -d 4 ./build/bin/hpgmg-fv 7 8 18 | 19 | # generate nvprof timeline 20 | #aprun -b -n 1 -N 1 -d 4 nvprof --unified-memory-profiling per-process-device -o timeline.%p.nvp ./build/bin/hpgmg-fv 7 8 21 | -------------------------------------------------------------------------------- /finite-volume/local.mk: -------------------------------------------------------------------------------- 1 | include $(call incsubdirs,source) 2 | -------------------------------------------------------------------------------- /finite-volume/source/TODO: -------------------------------------------------------------------------------- 1 | - cubical problem size -> rectahedral problem size ... init problem, restriction rules, etc... 2 | - rectahedral problem size -> arbitrary problem shape... 3 | - more efficient ghost zone exchange (box intersection algebra) when communicating edges and corners 4 | -------------------------------------------------------------------------------- /finite-volume/source/compile: -------------------------------------------------------------------------------- 1 | 2 | 3 | #======================================================================================================================= 4 | # mira 5 | #======================================================================================================================= 6 | soft add +mpiwrapper-xl 7 | qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active ./run.bgq 7 1 8 | qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8 9 | 10 | 11 | mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c 
mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -DBLOCKCOPY_TILE_K=1 -DBLOCKCOPY_TILE_J=32 -o run.bgq.1x32 -DUSE_HPM -L/soft/perftools/hpctw/lib -L/soft/perftools/bgpm/lib -lmpihpm_smp -lbgpm 12 | 13 | 14 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 15 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:ofa -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 16 | mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:dapl -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 17 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/boundary_fd.h: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 
15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | // Divide thread block into batches of threads (e.g. quads), each batch operates on one HPGMG tile/block 30 | #define BLOCK_SIZE 128 // number of threads per thread block 31 | #define NUM_BATCH 8 // number of batches per thread block 32 | 33 | __constant__ int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; 34 | __constant__ int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; 35 | __constant__ int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; 36 | 37 | template 38 | __global__ void apply_BCs_v1_kernel(level_type level, int x_id, int shape){ 39 | // thread exit conditions 40 | int batchid = blockIdx.x*num_batch + threadIdx.x/batch_size; 41 | if(batchid >= level.boundary_condition.num_blocks[shape]) return; 42 | 43 | // one CUDA thread block operates on 'batch_size' HPGMG tiles/blocks 44 | blockCopy_type block = level.boundary_condition.blocks[shape][batchid]; 45 | 46 | double scale = 1.0; 47 | if( faces[block.subtype])scale=-1.0; 48 | if( edges[block.subtype])scale= 1.0; 49 | if(corners[block.subtype])scale=-1.0; 50 | 51 | int i,j,k; 52 | const int box = 
block.read.box; 53 | const int dim_i = block.dim.i; 54 | const int dim_j = block.dim.j; 55 | const int dim_k = block.dim.k; 56 | const int ilo = block.read.i; 57 | const int jlo = block.read.j; 58 | const int klo = block.read.k; 59 | const int normal = 26-block.subtype; // invert the normal vector 60 | 61 | // hard code for box to box BC's 62 | const int jStride = level.my_boxes[box].jStride; 63 | const int kStride = level.my_boxes[box].kStride; 64 | double * __restrict__ x = level.my_boxes[box].vectors[x_id] + level.my_boxes[box].ghosts*(1+jStride+kStride); 65 | 66 | // convert normal vector into pointer offsets... 67 | const int di = (((normal % 3) )-1); 68 | const int dj = (((normal % 9)/3)-1); 69 | const int dk = (((normal / 9) )-1); 70 | const int stride = di + dj*jStride + dk*kStride; 71 | 72 | for(int gid=threadIdx.x%batch_size; gid<<>>(level,x_id,shape); 83 | 84 | extern "C" 85 | void cuda_apply_BCs_v1(level_type level, int x_id, int shape) 86 | { 87 | int block = BLOCK_SIZE; 88 | int grid = (level.boundary_condition.num_blocks[shape]+NUM_BATCH-1)/NUM_BATCH; 89 | if (grid <= 0) return; 90 | 91 | int log_dim = (int)log2((double)level.dim.i); 92 | KERNEL_LEVEL(log_dim, shape); 93 | CUDA_ERROR 94 | } 95 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 
9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | */ 28 | 29 | // available device functions 30 | void cuda_smooth(level_type level, int x_id, int rhs_id, double a, double b, int s, double *c, double *d); 31 | void cuda_residual(level_type d_level, int res_id, int x_id, int rhs_id, double a, double b); 32 | void cuda_rebuild(level_type level, int x_id, int Aii_id, int sumAbsAij_id, double a, double b); 33 | 34 | void cuda_restriction(level_type d_level_c, int id_c, level_type d_level_f, int id_f, communicator_type restriction, int restrictionType, int block_type); 35 | 36 | void cuda_interpolation_p0(level_type d_level_f, int id_f, double prescale_f, level_type d_level_c, int id_c, communicator_type interpolation, int block_type); 37 | void cuda_interpolation_p1(level_type d_level_f, int id_f, double prescale_f, level_type d_level_c, int id_c, communicator_type interpolation, int block_type); 38 | void cuda_interpolation_v2(level_type level_f, int id_f, double prescale_f, level_type level_c, int id_c, communicator_type interpolation, int block_type); 39 | void cuda_interpolation_v4(level_type level_f, int id_f, double prescale_f, level_type level_c, int id_c, communicator_type interpolation, int block_type); 40 | 41 | void cuda_apply_BCs_v1(level_type level, int x_id, int shape); 42 | void cuda_apply_BCs_v2(level_type level, int x_id, int shape); 43 | void cuda_apply_BCs_v4(level_type level, int x_id, int shape); 44 | void cuda_extrapolate_betas(level_type level, int shape); 45 | 46 | void cuda_zero_vector(level_type d_level, int id); 47 | void cuda_scale_vector(level_type d_level, int id_c, double scale_a, int id_a); 48 | void cuda_shift_vector(level_type d_level, int id_c, double shift_a, int id_a); 49 | void cuda_mul_vectors(level_type d_level, int id_c, double scale, int id_a, int id_b); 50 | void cuda_add_vectors(level_type d_level, int id_c, double scale_a, int id_a, double scale_b, int id_b); 51 | double cuda_sum(level_type d_level, int id); 52 | double cuda_max_abs(level_type d_level, int id); 53 | 
void cuda_color_vector(level_type d_level, int id_a, int colors_in_each_dim, int icolor, int jcolor, int kcolor); 54 | 55 | void cuda_copy_block(level_type d_level, int id, communicator_type exchange_ghosts, int block_type, cudaStream_t stream); 56 | void cuda_increment_block(level_type d_level, int id, double prescale, communicator_type exchange_ghosts, int block_type); 57 | void cuda_fused_copy_block(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 58 | void cuda_fused_copy_block_send(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 59 | void cuda_fused_copy_block_receive(level_type d_level, int id, communicator_type exchange_ghosts, cudaStream_t stream, comm_dev_descs_t descs); 60 | 61 | #include "extra.h" 62 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include 
"iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An a "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. 
Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief An a "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 
104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | 
#else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 | } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/host/spinlock.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if defined(_WIN32) || defined(_WIN64) 38 | #include 39 | #include 40 | #undef small // Windows is terrible for polluting macro namespace 41 | 42 | /** 43 | * Compiler read/write barrier 44 | */ 45 | #pragma intrinsic(_ReadWriteBarrier) 46 | 47 | #endif 48 | 49 | #include "../util_namespace.cuh" 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | #if defined(_MSC_VER) 59 | 60 | // Microsoft VC++ 61 | typedef long Spinlock; 62 | 63 | #else 64 | 65 | // GNU g++ 66 | typedef int Spinlock; 67 | 68 | /** 69 | * Compiler read/write barrier 70 | */ 71 | __forceinline__ void _ReadWriteBarrier() 72 | { 73 | __sync_synchronize(); 74 | } 75 | 76 | /** 77 | * Atomic exchange 78 | */ 79 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 80 | { 81 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 82 | _ReadWriteBarrier(); 83 | return __sync_lock_test_and_set(Target, Value); 84 | } 85 | 86 | /** 87 | * Pause instruction to prevent excess processor bus usage 88 | */ 89 | __forceinline__ void YieldProcessor() 90 | { 91 | #ifndef __arm__ 92 | asm volatile("pause\n": : :"memory"); 93 | #endif // __arm__ 94 | } 95 | 96 | #endif // defined(_MSC_VER) 97 | 98 | /** 99 | * Return when the specified spinlock has been acquired 100 | */ 101 | __forceinline__ void Lock(volatile Spinlock *lock) 102 | { 103 | while (1) 104 | { 105 | if (!_InterlockedExchange(lock, 1)) return; 106 | while (*lock) YieldProcessor(); 107 | } 108 | } 109 | 110 | 111 | /** 112 | * Release the specified spinlock 113 | */ 114 | __forceinline__ void Unlock(volatile Spinlock *lock) 115 | { 116 | _ReadWriteBarrier(); 117 
| *lock = 0; 118 | } 119 | 120 | 121 | } // CUB namespace 122 | CUB_NS_POSTFIX // Optional outer namespace(s) 123 | 124 | -------------------------------------------------------------------------------- /finite-volume/source/cuda/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Common C/C++ macro utilities
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilModule
 * @{
 */

#ifndef CUB_ALIGN
    #if defined(_WIN32) || defined(_WIN64)
        /// Align struct
        /// Bug fix: the MSVC branch previously hard-coded __declspec(align(32)),
        /// silently ignoring the requested \p bytes. Honor the argument so both
        /// branches agree.
        #define CUB_ALIGN(bytes) __declspec(align(bytes))
    #else
        /// Align struct
        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
    #endif
#endif

#ifndef CUB_MAX
    /// Select maximum(a, b)
    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
#endif

#ifndef CUB_MIN
    /// Select minimum(a, b)
    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
#endif

#ifndef CUB_QUOTIENT_FLOOR
    /// Quotient of x/y rounded down to nearest integer
    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
#endif

#ifndef CUB_QUOTIENT_CEILING
    /// Quotient of x/y rounded up to nearest integer
    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
#endif

#ifndef CUB_ROUND_UP_NEAREST
    /// x rounded up to the nearest multiple of y
    /// Bug fix: parenthesize the trailing (y); the bare "* y" mis-expands for
    /// expression arguments, e.g. CUB_ROUND_UP_NEAREST(7, 2 + 3).
    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
#endif

#ifndef CUB_ROUND_DOWN_NEAREST
    /// x rounded down to the nearest multiple of y
    /// Bug fix: parenthesize the trailing (y) for the same reason as above.
    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
#endif


#ifndef CUB_STATIC_ASSERT
    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
        #define CUB_CAT_(a, b) a ## b
        #define CUB_CAT(a, b) CUB_CAT_(a, b)
    #endif // DOXYGEN_SHOULD_SKIP_THIS

    /// Static assert: declares an array type whose size is negative (ill-formed)
    /// when \p cond is false; works pre-C++11.
    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
#endif

/** @} */       // end group UtilModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
--------------------------------------------------------------------------------
/finite-volume/source/cuda/cub/util_namespace.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /finite-volume/source/debug.c: -------------------------------------------------------------------------------- 1 | #include "stdlib.h" 2 | #include "stdio.h" 3 | #include "debug.h" 4 | 5 | int mpi_comm_rank = 0; 6 | 7 | int dbg_enabled() 8 | { 9 | static int dbg_is_enabled = -1; 10 | if (-1 == dbg_is_enabled) { 11 | const char *env = getenv("HPGMG_ENABLE_DEBUG"); 12 | if (env) { 13 | int en = atoi(env); 14 | dbg_is_enabled = !!en; 15 | printf("HPGMG_ENABLE_DEBUG=%s\n", env); 16 | } else 17 | dbg_is_enabled = 0; 18 | } 19 | return dbg_is_enabled; 20 | } 21 | -------------------------------------------------------------------------------- /finite-volume/source/debug.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | extern int mpi_comm_rank; 6 | 7 | #define STRDBG stderr 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | int dbg_enabled(); 13 | 14 | #define DBG(FMT, ARGS...) 
\ 15 | do { \ 16 | if (dbg_enabled()) { \ 17 | fprintf(STRDBG, "[%d] [%d] HPGMG %s(): " FMT, \ 18 | getpid(), mpi_comm_rank, __FUNCTION__ , ## ARGS); \ 19 | fflush(STRDBG); \ 20 | } \ 21 | } while(0) 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #ifdef PROFILE_NVTX_RANGES 28 | #include "nvToolsExt.h" 29 | 30 | #define COMM_COL 1 31 | #define SM_COL 2 32 | #define SML_COL 3 33 | #define OP_COL 4 34 | #define COMP_COL 5 35 | #define SOLVE_COL 6 36 | #define WARMUP_COL 7 37 | #define EXEC_COL 8 38 | 39 | #define SEND_COL 9 40 | #define WAIT_COL 10 41 | #define KERNEL_COL 11 42 | 43 | 44 | #define PUSH_RANGE(name,cid) \ 45 | do { \ 46 | const uint32_t colors[] = { \ 47 | 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff, 0xff000000, 0xff0000ff, 0x55ff3300, 0xff660000, 0x66330000 \ 48 | }; \ 49 | const int num_colors = sizeof(colors)/sizeof(colors[0]); \ 50 | int color_id = cid%num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } while(0) 60 | 61 | #define PUSH_RANGE_STR(cid, FMT, ARGS...) 
\ 62 | do { \ 63 | char str[128]; \ 64 | snprintf(str, sizeof(str), FMT, ## ARGS); \ 65 | PUSH_RANGE(str, cid); \ 66 | } while(0) 67 | 68 | 69 | #define POP_RANGE do { nvtxRangePop(); } while(0) 70 | 71 | #else 72 | #define PUSH_RANGE(name,cid) 73 | #define POP_RANGE 74 | #endif 75 | -------------------------------------------------------------------------------- /finite-volume/source/defines.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // Lu = a*alpha[]*u[] - b*divergence beta[]*gradient u[] 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | #ifndef DEFINES_H 9 | #define DEFINES_H 10 | //------------------------------------------------------------------------------------------------------------------------------ 11 | #define VECTOR_TEMP 0 // 12 | #define VECTOR_UTRUE 1 // exact solution used to generate f 13 | #define VECTOR_F_MINUS_AV 2 // cell centered residual (f-Av) 14 | //------------------------------------------------------------------------------------------------------------------------------ 15 | #define VECTOR_F 3 // original right-hand side (Au=f), cell centered 16 | #define VECTOR_U 4 // numerical solution 17 | #define VECTOR_ALPHA 5 // cell centered coefficient 18 | #define VECTOR_BETA_I 6 // face centered coefficient (n.b. element 0 is the left face of the ghost zone element) 19 | #define VECTOR_BETA_J 7 // face centered coefficient (n.b. element 0 is the back face of the ghost zone element) 20 | #define VECTOR_BETA_K 8 // face centered coefficient (n.b. 
element 0 is the bottom face of the ghost zone element) 21 | //------------------------------------------------------------------------------------------------------------------ 22 | #define VECTOR_DINV 9 // cell centered relaxation parameter (e.g. inverse of the diagonal) 23 | #define VECTOR_L1INV 10 // cell centered relaxation parameter (e.g. inverse of the L1 norm of each row) 24 | #define VECTOR_VALID 11 // cell centered array noting which cells are actually present 25 | //------------------------------------------------------------------------------------------------------------------ 26 | #define VECTORS_RESERVED 12 // total number of vectors and the starting location for any auxillary bottom solver vectors 27 | //------------------------------------------------------------------------------------------------------------------------------ 28 | #endif 29 | -------------------------------------------------------------------------------- /finite-volume/source/local.mk: -------------------------------------------------------------------------------- 1 | hpgmg-fv-y.c += $(call thisdir, \ 2 | debug.c \ 3 | timers.c \ 4 | level.c \ 5 | operators.fv4.c \ 6 | mg.c \ 7 | solvers.c \ 8 | hpgmg-fv.c \ 9 | ) 10 | 11 | #Useless in case of libmpcomm.so 12 | #hpgmg-fv-y.cc += $(call thisdir, comm.cc) 13 | 14 | hpgmg-fv-y.cu += $(call thisdir, \ 15 | cuda/operators.fv4.cu \ 16 | ) 17 | -------------------------------------------------------------------------------- /finite-volume/source/mg.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef MG_H 7 | #define MG_H 8 | 
//------------------------------------------------------------------------------------------------------------------------------ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | //------------------------------------------------------------------------------------------------------------------------------ 15 | #ifndef MG_AGGLOMERATION_START 16 | #define MG_AGGLOMERATION_START 8 // i.e. start the distributed v-cycle when boxes are smaller than 8^3 17 | #endif 18 | #ifndef MG_DEFAULT_BOTTOM_NORM 19 | #define MG_DEFAULT_BOTTOM_NORM 1e-3 20 | #endif 21 | //------------------------------------------------------------------------------------------------------------------------------ 22 | typedef struct { 23 | int num_ranks; // total number of MPI ranks for MPI_COMM_WORLD 24 | int my_rank; // my MPI rank for MPI_COMM_WORLD 25 | int num_levels; // depth of the v-cycle 26 | level_type ** levels; // array of pointers to levels 27 | 28 | struct { 29 | double MGBuild; // total time spent building the coefficients... 
30 | double MGSolve; // total time spent in MGSolve 31 | }timers; 32 | int MGSolves_performed; 33 | } mg_type; 34 | 35 | 36 | //------------------------------------------------------------------------------------------------------------------------------ 37 | void MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim); 38 | void MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); 39 | void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); 40 | void MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, double b, double dtol, double rtol); 41 | void MGDestroy(mg_type *all_grids); 42 | void MGPrintTiming(mg_type *all_grids, int fromLevel); 43 | void MGResetTimers(mg_type *all_grids); 44 | void richardson_error(mg_type *all_grids, int levelh, int u_id); 45 | //------------------------------------------------------------------------------------------------------------------------------ 46 | #endif 47 | -------------------------------------------------------------------------------- /finite-volume/source/operators.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef OPERATORS_H 7 | #define OPERATORS_H 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | #define RESTRICT_CELL 0 10 | #define RESTRICT_FACE_I 1 11 | #define RESTRICT_FACE_J 2 12 | #define RESTRICT_FACE_K 3 13 | 
//------------------------------------------------------------------------------------------------------------------------------ 14 | int stencil_get_radius(); 15 | int stencil_get_shape(); 16 | //------------------------------------------------------------------------------------------------------------------------------ 17 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b); 18 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b); 19 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b); 20 | void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b); 21 | void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim); 22 | //------------------------------------------------------------------------------------------------------------------------------ 23 | void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType); 24 | void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used inside a v-cycle 25 | void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used in the f-cycle to create a new initial guess for the next finner v-cycle 26 | //------------------------------------------------------------------------------------------------------------------------------ 27 | void exchange_boundary(level_type * level, int id_a, int shape); 28 | void force_comm_flush(); 29 | 30 | void apply_BCs_p1(level_type * level, int x_id, int shape); // piecewise (cell centered) linear 31 | void apply_BCs_p2(level_type * level, int x_id, int shape); // piecewise (cell centered) quadratic 32 | void apply_BCs_v1(level_type * level, int x_id, int shape); // volumetric linear 33 | void apply_BCs_v2(level_type * level, int x_id, int shape); // volumetric quadratic 34 | 
void apply_BCs_v4(level_type * level, int x_id, int shape); // volumetric quartic 35 | void extrapolate_betas(level_type * level); 36 | //------------------------------------------------------------------------------------------------------------------------------ 37 | double dot(level_type * level, int id_a, int id_b); 38 | double norm(level_type * level, int id_a); 39 | double mean(level_type * level, int id_a); 40 | double error(level_type * level, int id_a, int id_b); 41 | void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b); 42 | void scale_vector( level_type * level, int id_c, double scale_a, int id_a); 43 | void zero_vector( level_type * level, int id_a); 44 | void shift_vector( level_type * level, int id_c, int id_a, double shift_a); 45 | void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b); 46 | void invert_vector( level_type * level, int id_c, double scale_a, int id_a); 47 | void init_vector( level_type * level, int id_a, double scalar); 48 | //------------------------------------------------------------------------------------------------------------------------------ 49 | void color_vector(level_type * level, int id, int colors, int icolor, int jcolor, int kcolor); 50 | void random_vector(level_type * level, int id); 51 | //------------------------------------------------------------------------------------------------------------------------------ 52 | void initialize_problem(level_type * level, double hLevel, double a, double b); 53 | void initialize_valid_region(level_type * level); 54 | //------------------------------------------------------------------------------------------------------------------------------ 55 | #endif 56 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/aggregate.mpi/jacobi.c: -------------------------------------------------------------------------------- 1 | 
//------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include "../timer.h" 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 10 | if(NUM_SMOOTHS&1){ 11 | printf("error - NUM_SMOOTHS must be even...\n"); 12 | exit(0); 13 | } 14 | 15 | 16 | int box,s; 17 | int ghosts = level->box_ghosts; 18 | int starShaped = stencil_is_star_shaped(); 19 | int communicationAvoiding = ghosts > stencil_get_radius(); 20 | 21 | #ifdef USE_L1JACOBI 22 | double weight = 1.0; 23 | #else 24 | double weight = 2.0/3.0; 25 | #endif 26 | 27 | 28 | // if communication-avoiding, need updated RHS for stencils in ghost zones 29 | if(communicationAvoiding)exchange_boundary(level,rhs_id,0); 30 | 31 | for(s=0;snum_my_boxes;box++){ 41 | int i,j,k,ss; 42 | const int jStride = level->my_boxes[box].jStride; 43 | const int kStride = level->my_boxes[box].kStride; 44 | const int dim = level->my_boxes[box].dim; 45 | const double h2inv = 1.0/(level->h*level->h); 46 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 47 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 48 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 49 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 50 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 51 | const double * 
__restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 52 | #ifdef USE_L1JACOBI 53 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 54 | #else 55 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 56 | #endif 57 | int ghostsToOperateOn=ghosts-1; 58 | for(ss=s;ssmy_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 62 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);} 63 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); 64 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 65 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 66 | for(k=0-ghostsToOperateOn;kcycles.smooth += (uint64_t)(CycleTime()-_timeStart); 76 | } // s-loop 77 | } 78 | 79 | //------------------------------------------------------------------------------------------------------------------------------ 80 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/apply_op.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax 7 | // exchange the boundary of x in preparation for Ax 8 | exchange_boundary(level,x_id,stencil_get_shape()); 9 | apply_BCs(level,x_id,stencil_get_shape()); 10 | 11 | // now do Ax proper... 
12 | double _timeStart = getTime(); 13 | const int ghosts = level->box_ghosts; 14 | const int jStride = level->box_jStride; 15 | const int kStride = level->box_kStride; 16 | const int dim = level->box_dim; 17 | const double h2inv = 1.0/(level->h*level->h); 18 | int box; 19 | 20 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 21 | for(box=0;boxnum_my_boxes;box++){ 22 | int i,j,k; 23 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 24 | double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); 25 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 26 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); 30 | 31 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 32 | for(k=0;ktimers.apply_op += (double)(getTime()-_timeStart); 40 | } 41 | //------------------------------------------------------------------------------------------------------------------------------ 42 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/jacobi.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | 
//------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 9 | if(NUM_SMOOTHS&1){ 10 | fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); 11 | exit(0); 12 | } 13 | 14 | #ifdef USE_L1JACOBI 15 | double weight = 1.0; 16 | #else 17 | double weight = 2.0/3.0; 18 | #endif 19 | 20 | int box,s; 21 | for(s=0;sbox_ghosts; 29 | const int jStride = level->box_jStride; 30 | const int kStride = level->box_kStride; 31 | const int dim = level->box_dim; 32 | const double h2inv = 1.0/(level->h*level->h); 33 | 34 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 35 | for(box=0;boxnum_my_boxes;box++){ 36 | int i,j,k; 37 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 41 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 42 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 43 | #ifdef USE_L1JACOBI 44 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 45 | #else 46 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 47 | #endif 48 | const double * __restrict__ x_n; 49 | double * __restrict__ x_np1; 50 | if((s&1)==0){x_n = 
level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 51 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} 52 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); 53 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 54 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 55 | for(k=0;ktimers.smooth += (double)(getTime()-_timeStart); 64 | } // s-loop 65 | } 66 | 67 | //------------------------------------------------------------------------------------------------------------------------------ 68 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/residual.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // calculate res_id = rhs_id - A(x_id) 7 | 8 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ 9 | // exchange the boundary for x in prep for Ax... 10 | exchange_boundary(level,x_id,stencil_get_shape()); 11 | apply_BCs(level,x_id,stencil_get_shape()); 12 | 13 | // now do residual/restriction proper... 14 | double _timeStart = getTime(); 15 | const int ghosts = level->box_ghosts; 16 | const int jStride = level->box_jStride; 17 | const int kStride = level->box_kStride; 18 | const int dim = level->box_dim; 19 | const double h2inv = 1.0/(level->h*level->h); 20 | int box; 21 | 22 | PRAGMA_THREAD_ACROSS_BOXES(level,box) 23 | for(box=0;boxnum_my_boxes;box++){ 24 | int i,j,k; 25 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 26 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 32 | double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); 33 | 34 | PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) 35 | for(k=0;ktimers.residual += (double)(getTime()-_timeStart); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /finite-volume/source/operators.old/symgs.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ 7 | int box,s; 8 | 9 | for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth 10 | exchange_boundary(level,phi_id,stencil_get_shape()); 11 | apply_BCs(level,phi_id,stencil_get_shape()); 12 | 13 | // now do ghosts communication-avoiding smooths on each box... 
14 | double _timeStart = getTime(); 15 | const int ghosts = level->box_ghosts; 16 | const int jStride = level->box_jStride; 17 | const int kStride = level->box_kStride; 18 | const int dim = level->box_dim; 19 | const double h2inv = 1.0/(level->h*level->h); 20 | 21 | #ifdef _OPENMP 22 | #pragma omp parallel for 23 | #endif 24 | for(box=0;boxnum_my_boxes;box++){ 25 | int i,j,k; 26 | double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 27 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 32 | const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 33 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 34 | 35 | 36 | if( (s&0x1)==0 ){ // forward sweep... 
hard to thread 37 | for(k=0;k=0;k--){ 46 | for(j=dim-1;j>=0;j--){ 47 | for(i=dim-1;i>=0;i--){ 48 | int ijk = i + j*jStride + k*kStride; 49 | double Ax = apply_op_ijk(phi); 50 | phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); 51 | }}} 52 | } 53 | 54 | } // boxes 55 | level->timers.smooth += (double)(getTime()-_timeStart); 56 | } // s-loop 57 | } 58 | 59 | 60 | //------------------------------------------------------------------------------------------------------------------------------ 61 | -------------------------------------------------------------------------------- /finite-volume/source/operators/apply_op.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // Applies the linear operator specified in the apply_op_ijk macro to vector x_id and stores the result in Ax_id 7 | // This requires exchanging a ghost zone and/or enforcing a boundary condition. 8 | // NOTE, Ax_id and x_id must be distinct 9 | void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ 10 | // exchange the boundary of x in preparation for Ax 11 | exchange_boundary(level,x_id,stencil_get_shape()); 12 | apply_BCs(level,x_id,stencil_get_shape()); 13 | 14 | // now do Ax proper... 15 | double _timeStart = getTime(); 16 | int block; 17 | 18 | if(level->use_cuda)cudaDeviceSynchronize(); // FIX... 
wait for any other GPU operations on this level to complete 19 | 20 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 21 | for(block=0;blocknum_my_blocks;block++){ 22 | const int box = level->my_blocks[block].read.box; 23 | const int ilo = level->my_blocks[block].read.i; 24 | const int jlo = level->my_blocks[block].read.j; 25 | const int klo = level->my_blocks[block].read.k; 26 | const int ihi = level->my_blocks[block].dim.i + ilo; 27 | const int jhi = level->my_blocks[block].dim.j + jlo; 28 | const int khi = level->my_blocks[block].dim.k + klo; 29 | int i,j,k; 30 | const int jStride = level->my_boxes[box].jStride; 31 | const int kStride = level->my_boxes[box].kStride; 32 | const int ghosts = level->my_boxes[box].ghosts; 33 | const double h2inv = 1.0/(level->h*level->h); 34 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 35 | double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); 36 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 37 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); 41 | 42 | for(k=klo;ktimers.apply_op += (double)(getTime()-_timeStart); 50 | } 51 | //------------------------------------------------------------------------------------------------------------------------------ 52 | -------------------------------------------------------------------------------- /finite-volume/source/operators/jacobi.c: 
-------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | //------------------------------------------------------------------------------------------------------------------------------ 8 | void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ 9 | if(NUM_SMOOTHS&1){ 10 | fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); 11 | exit(0); 12 | } 13 | 14 | #ifdef USE_L1JACOBI 15 | double weight = 1.0; 16 | #else 17 | double weight = 2.0/3.0; 18 | #endif 19 | 20 | int block,s; 21 | for(s=0;suse_cuda) { 29 | cuda_smooth(*level, x_id, rhs_id, a, b, s, NULL, NULL); 30 | } 31 | else { 32 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 33 | for(block=0;blocknum_my_blocks;block++){ 34 | const int box = level->my_blocks[block].read.box; 35 | const int ilo = level->my_blocks[block].read.i; 36 | const int jlo = level->my_blocks[block].read.j; 37 | const int klo = level->my_blocks[block].read.k; 38 | const int ihi = level->my_blocks[block].dim.i + ilo; 39 | const int jhi = level->my_blocks[block].dim.j + jlo; 40 | const int khi = level->my_blocks[block].dim.k + klo; 41 | int i,j,k; 42 | const int ghosts = level->box_ghosts; 43 | const int jStride = level->my_boxes[box].jStride; 44 | const int kStride = level->my_boxes[box].kStride; 45 | const double h2inv = 1.0/(level->h*level->h); 46 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 47 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 48 | const double * __restrict__ beta_i = 
level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 49 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 50 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 51 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 52 | #ifdef USE_L1JACOBI 53 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); 54 | #else 55 | const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 56 | #endif 57 | const double * __restrict__ x_n; 58 | double * __restrict__ x_np1; 59 | if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); 60 | x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} 61 | else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); 62 | x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} 63 | 64 | for(k=klo;ktimers.smooth += (double)(getTime()-_timeStart); 75 | } // s-loop 76 | } 77 | 78 | //------------------------------------------------------------------------------------------------------------------------------ 79 | -------------------------------------------------------------------------------- /finite-volume/source/operators/residual.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | // This routines calculates the residual 
(res=rhs-Ax) using the linear operator specified in the apply_op_ijk macro 7 | // This requires exchanging a ghost zone and/or enforcing a boundary condition. 8 | // NOTE, x_id must be distinct from rhs_id and res_id 9 | void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ 10 | // exchange the boundary for x in prep for Ax... 11 | exchange_boundary(level,x_id,stencil_get_shape()); 12 | apply_BCs(level,x_id,stencil_get_shape()); 13 | 14 | // now do residual/restriction proper... 15 | double _timeStart = getTime(); 16 | int block; 17 | 18 | if (level->use_cuda) { 19 | cuda_residual(*level, res_id, x_id, rhs_id, a, b); 20 | } 21 | else { 22 | PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) 23 | for(block=0;blocknum_my_blocks;block++){ 24 | const int box = level->my_blocks[block].read.box; 25 | const int ilo = level->my_blocks[block].read.i; 26 | const int jlo = level->my_blocks[block].read.j; 27 | const int klo = level->my_blocks[block].read.k; 28 | const int ihi = level->my_blocks[block].dim.i + ilo; 29 | const int jhi = level->my_blocks[block].dim.j + jlo; 30 | const int khi = level->my_blocks[block].dim.k + klo; 31 | int i,j,k; 32 | const int jStride = level->my_boxes[box].jStride; 33 | const int kStride = level->my_boxes[box].kStride; 34 | const int ghosts = level->my_boxes[box].ghosts; 35 | const double h2inv = 1.0/(level->h*level->h); 36 | const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 37 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 38 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 39 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 40 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 41 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 42 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 43 | double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); 44 | 45 | for(k=klo;ktimers.residual += (double)(getTime()-_timeStart); 55 | } 56 | 57 | -------------------------------------------------------------------------------- /finite-volume/source/operators/symgs.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ 7 | int box,s; 8 | 9 | for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth 10 | exchange_boundary(level,phi_id,stencil_get_shape()); 11 | apply_BCs(level,phi_id,stencil_get_shape()); 12 | 13 | double _timeStart = getTime(); 14 | #ifdef _OPENMP 15 | #pragma omp parallel for private(box) 16 | #endif 17 | for(box=0;boxnum_my_boxes;box++){ 18 | int i,j,k; 19 | const int ghosts = 
level->box_ghosts; 20 | const int jStride = level->my_boxes[box].jStride; 21 | const int kStride = level->my_boxes[box].kStride; 22 | const int dim = level->my_boxes[box].dim; 23 | const double h2inv = 1.0/(level->h*level->h); 24 | double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point 25 | const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); 26 | const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); 27 | const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); 28 | const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); 29 | const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); 30 | const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); 31 | const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain 32 | 33 | 34 | if( (s&0x1)==0 ){ // forward sweep... 
hard to thread 35 | for(k=0;k=0;k--){ 44 | for(j=dim-1;j>=0;j--){ 45 | for(i=dim-1;i>=0;i--){ 46 | int ijk = i + j*jStride + k*kStride; 47 | double Ax = apply_op_ijk(phi); 48 | phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); 49 | }}} 50 | } 51 | 52 | } // boxes 53 | level->timers.smooth += (double)(getTime()-_timeStart); 54 | } // s-loop 55 | } 56 | 57 | 58 | //------------------------------------------------------------------------------------------------------------------------------ 59 | -------------------------------------------------------------------------------- /finite-volume/source/solvers.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | //------------------------------------------------------------------------------------------------------------------------------ 12 | #include "timers.h" 13 | #include "defines.h" 14 | #include "level.h" 15 | #include "operators.h" 16 | //------------------------------------------------------------------------------------------------------------------------------ 17 | #ifdef USE_BICGSTAB 18 | #include "solvers/bicgstab.c" 19 | #elif USE_CG 20 | #include "solvers/cg.c" 21 | #elif USE_CABICGSTAB 22 | #include "solvers/cabicgstab.c" 23 | #elif USE_CACG 24 | #include "solvers/cacg.c" 25 | #endif 26 | //------------------------------------------------------------------------------------------------------------------------------ 27 | void IterativeSolver(level_type * level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm){ 28 | if(!level->active)return; 
29 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 30 | if(level->must_subtract_mean==-1){ 31 | level->must_subtract_mean=0; 32 | int alpha_is_zero = (dot(level,VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); 33 | if( (level->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )level->must_subtract_mean = 1; // Poisson with Periodic BCs 34 | } 35 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 36 | #if 0 37 | if( (level->dim.i==1)&&(level->dim.j==1)&&(level->dim.k==1) ){ 38 | // I have reduced the system to 1 equation and 1 unknown and know D^{-1} exactly 39 | // therefore A^{-1} == D^{-1} = 1/a00 40 | // u = A^{-1}f == D^{-1}f 41 | mul_vectors(level,u_id,1.0,VECTOR_DINV,f_id); // u = A^{-1}f = D^{-1}f 42 | if(level->must_subtract_mean == 1){ 43 | double mean_of_u = mean(level,u_id); 44 | shift_vector(level,u_id,u_id,-mean_of_u); 45 | } 46 | return; 47 | } 48 | #endif 49 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 50 | #ifdef USE_BICGSTAB 51 | BiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); 52 | #elif USE_CG 53 | CG(level,u_id,f_id,a,b,desired_reduction_in_norm); 54 | #elif USE_CABICGSTAB 55 | CABiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); 56 | #elif USE_CACG 57 | CACG(level,u_id,f_id,a,b,desired_reduction_in_norm); 58 | #else 59 | // just point relaxation via multiple smooth()'s 60 | if(level->must_subtract_mean == 1){ 61 | double mean_of_u = mean(level,u_id); 62 | shift_vector(level,u_id,u_id,-mean_of_u); 63 | } 64 | residual(level,VECTOR_TEMP,u_id,f_id,a,b); 65 | //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... 
66 | 67 | double norm_of_r0 = norm(level,VECTOR_TEMP); 68 | int s=0,maxSmoothsBottom=200,converged=0; 69 | while( (sKrylov_iterations++; 72 | smooth(level,u_id,f_id,a,b); 73 | if(level->must_subtract_mean == 1){ 74 | double mean_of_u = mean(level,u_id); 75 | shift_vector(level,u_id,u_id,-mean_of_u); 76 | } 77 | residual(level,VECTOR_TEMP,u_id,f_id,a,b); 78 | //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... 79 | 80 | double norm_of_r = norm(level,VECTOR_TEMP); 81 | if(norm_of_r == 0.0){converged=1;break;} 82 | if(norm_of_r < desired_reduction_in_norm*norm_of_r0){converged=1;break;} 83 | } 84 | #endif 85 | //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 86 | } 87 | 88 | 89 | //------------------------------------------------------------------------------------------------------------------------------ 90 | int IterativeSolver_NumVectors(){ 91 | // additionally number of vectors required by an iterative solver... 92 | #ifdef USE_BICGSTAB 93 | return(8); // BiCGStab requires additional vectors r0,r,p,s,Ap,As 94 | #elif USE_CG 95 | return(5); // CG requires extra vectors r0,r,p,Ap,z 96 | #elif USE_CABICGSTAB 97 | return(4+4*CA_KRYLOV_S); // CABiCGStab requires additional vectors rt,p,r,P[2s+1],R[2s]. 98 | #elif USE_CACG 99 | return(4+2*CA_KRYLOV_S); // CACG requires additional vectors r0,p,r,P[s+1],R[s]. 
100 | #endif 101 | return(0); // simply doing multiple smooths requires no extra vectors 102 | } 103 | //------------------------------------------------------------------------------------------------------------------------------ 104 | -------------------------------------------------------------------------------- /finite-volume/source/solvers.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef SOLVERS_H 7 | #define SOLVERS_H 8 | //------------------------------------------------------------------------------------------------------------------------------ 9 | void IterativeSolver(level_type *level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm); 10 | int IterativeSolver_NumVectors(); 11 | //------------------------------------------------------------------------------------------------------------------------------ 12 | #endif 13 | -------------------------------------------------------------------------------- /finite-volume/source/solvers/matmul.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | void matmul(level_type * level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){ 7 | // *id_A = m vector_id's (conceptually pointers to 
the rows of a m x level->num_my_boxes*volume matrix) 8 | // *id_B = n vector_id's (conceptually pointers to the columns of a level->num_my_boxes*volume matrix x n) 9 | // *C is a mxn matrix where C[rows][cols] = dot(id_A[rows],id_B[cols]) 10 | 11 | // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?) 12 | // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods 13 | // C_is_symmetric && cols< rows (use id_A) 14 | int mm,nn; 15 | 16 | 17 | double _timeStart = getTime(); 18 | // FIX... rather than performing an all_reduce on the essentially symmetric [G,g], do the all_reduce on the upper triangle and then duplicate (saves BW) 19 | #ifdef _OPENMP 20 | #pragma omp parallel for schedule(static,1) collapse(2) 21 | #endif 22 | for(mm=0;mm=mm){ // upper triangular 25 | int box; 26 | double a_dot_b_level = 0.0; 27 | for(box=0;boxnum_my_boxes;box++){ 28 | int i,j,k; 29 | const int jStride = level->my_boxes[box].jStride; 30 | const int kStride = level->my_boxes[box].kStride; 31 | const int ghosts = level->my_boxes[box].ghosts; 32 | const int dim = level->my_boxes[box].dim; 33 | double * __restrict__ grid_a = level->my_boxes[box].vectors[id_A[mm]] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point 34 | double * __restrict__ grid_b = level->my_boxes[box].vectors[id_B[nn]] + ghosts*(1+jStride+kStride); 35 | double a_dot_b_box = 0.0; 36 | for(k=0;ktimers.blas3 += (double)(getTime()-_timeStart); 49 | 50 | #ifdef USE_MPI 51 | double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); 52 | for(mm=0;mmMPI_COMM_ALLREDUCE); 58 | double _timeEndAllReduce = getTime(); 59 | level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); 60 | free(send_buffer); 61 | #endif 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /finite-volume/source/timers.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifdef _OPENMP 7 | // getTime in OpenMP is now defined as a preprocessor macro 8 | //#include "./timers/omp.c" 9 | #elif USE_MPI 10 | // getTime in MPI is now defined as a preprocessor macro 11 | //#include "./timers/mpi.c" 12 | #else 13 | #error no timer found. 
You must compile with MPI, OpenMP, or include a custom timer routine 14 | #endif 15 | -------------------------------------------------------------------------------- /finite-volume/source/timers.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #ifndef TIMER_H 7 | #define TIMER_H 8 | 9 | #include 10 | 11 | #ifdef _OPENMP 12 | #include 13 | #define getTime() (omp_get_wtime()) 14 | 15 | #elif USE_MPI 16 | #include 17 | #define getTime() (MPI_Wtime()) 18 | 19 | #else 20 | // user must provide a function getTime and include it in timers.c 21 | // if calibration is necesary, then the user must #define CALIBRATE_TIMER 22 | double getTime(); 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /finite-volume/source/timers/mpi.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | double getTime(){ 9 | return(MPI_Wtime()); // timers are in units of seconds; no conversion is necessary 10 | } 11 | -------------------------------------------------------------------------------- /finite-volume/source/timers/omp.c: -------------------------------------------------------------------------------- 1 | 
//------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #include 8 | double getTime(){ 9 | return(omp_get_wtime()); // timers are in units of seconds; no conversion is necessary 10 | } 11 | -------------------------------------------------------------------------------- /finite-volume/source/timers/x86.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------------------------------------------------------ 2 | // Samuel Williams 3 | // SWWilliams@lbl.gov 4 | // Lawrence Berkeley National Lab 5 | //------------------------------------------------------------------------------------------------------------------------------ 6 | #include 7 | #define CALIBRATE_TIMER // mg.c will calibrate the timer to determine seconds per cycle 8 | double getTime(){ 9 | uint64_t lo, hi; 10 | __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); 11 | return( 1e-9*((double)( (((uint64_t)hi) << 32) | ((uint64_t)lo) )) ); // timers are in units of seconds; assume 1GHz cycle counter and convert later 12 | } 13 | -------------------------------------------------------------------------------- /local.mk: -------------------------------------------------------------------------------- 1 | include $(call incsubdirs,finite-element finite-volume) 2 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # NVIDIA CORPORATION and its licensors retain all intellectual property 6 | # and proprietary rights in and to this software, related documentation 7 | # and any modifications thereto. Any use, reproduction, disclosure or 8 | # distribution of this software and related documentation without an express 9 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 10 | 11 | if [[ $# -ne 6 ]]; then 12 | echo "Illegal parameters number: hpgmg-fv " 13 | exit 1 14 | fi 15 | 16 | NP=$1 17 | if [[ $NP -lt 2 ]]; then 18 | echo "Illegal procs number: $NP" 19 | exit 1 20 | fi 21 | 22 | export PATH=$PATH 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH 24 | 25 | #Assuming OpenMPI 26 | OMPI_params="$OMPI_params --mca btl openib,self" 27 | OMPI_params="$OMPI_params --mca btl_openib_want_cuda_gdr 1" 28 | OMPI_params="$OMPI_params --mca btl_openib_warn_default_gid_prefix 0" 29 | 30 | #set -x 31 | $MPI_HOME/bin/mpirun -verbose $OMPI_params \ 32 | -x GDS_CQ_MAP_SMART=0 -x GDS_ENABLE_DEBUG=0 \ 33 | -x MP_ENABLE_DEBUG=0 -x HPGMG_ENABLE_DEBUG=0 \ 34 | -x MP_EVENT_ASYNC=0 -x MP_ENABLE_WARN \ 35 | -x LD_LIBRARY_PATH -x PATH \ 36 | -x GDS_DISABLE_WRITE64=0 -x GDS_SIMULATE_WRITE64=0 -x GDS_DISABLE_INLINECOPY=0 -x GDS_DISABLE_WEAK_CONSISTENCY=0 -x GDS_DISABLE_MEMBAR=0 \ 37 | -x OMP_NUM_THREADS=1 -x ASYNC_2_STREAMS=0 \ 38 | -x COMM_USE_COMM=$2 -x COMM_USE_ASYNC_SA=$3 -x COMM_USE_ASYNC_KI=$4 \ 39 | -x MP_DBREC_ON_GPU=0 -x MP_RX_CQ_ON_GPU=0 -x MP_TX_CQ_ON_GPU=0 \ 40 | -x USE_MPI=1 \ 41 | -x CUDA_PASCAL_FORCE_40_BIT=1 \ 42 | -x GDS_FLUSHER_TYPE=0 \ 43 | --map-by node -np $NP -hostfile hostfile ./wrapper.sh ./build/bin/hpgmg-fv $5 $6 44 | 45 | # ./wrapper.sh nvprof -o nvprof-kernel.%q{OMPI_COMM_WORLD_RANK}.nvprof 46 | 47 | echo "Use LibMP=$2" 48 | echo "Use GPUDirect Async, SA model=$3" 49 | echo "Use GPUDirect Async, KI model=$4" 50 | 51 | # Example with 2 processes, using the SA model, 4 and 8 as size 52 | # ./run.sh 2 1 1 0 4 8 53 | 54 | 55 | 
-------------------------------------------------------------------------------- /run_all_hpgmg.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #MPI PROC 4 | PROC=(2 4 8 16) 5 | #LOG2 BOX DIM 6 | SIZE=(4 5 6 7) 7 | #default size >= 4 8 | # 16^3 * 4 = 16384 9 | # 8^3 * 4 = 2048 10 | # 8^3 * 1 = 512 11 | # 4^3 * 1 = 64 12 | # 2^3 * 1 = 8 13 | MODE=(0 1 2 3) #MPI, COMM, ASYNC, GPU 14 | 15 | for var_mode in "${MODE[@]}" 16 | do 17 | for var_size in "${SIZE[@]}" 18 | do 19 | for var_proc in "${PROC[@]}" 20 | do 21 | for num_iter in {1..5} 22 | do 23 | if [ $var_mode -eq 0 ]; then 24 | var_print_mode="MPI" 25 | var_comm=0; 26 | var_async=0; 27 | var_gpu=0; 28 | elif [ $var_mode -eq 1 ]; then 29 | var_print_mode="COMM" 30 | var_comm=1; 31 | var_async=0; 32 | var_gpu=0; 33 | elif [ $var_mode -eq 2 ]; then 34 | var_print_mode="ASYNC" 35 | var_comm=1; 36 | var_async=1; 37 | var_gpu=0; 38 | else 39 | var_print_mode="GPU-initiated" 40 | var_comm=1; 41 | var_async=1; 42 | var_gpu=1; 43 | fi 44 | echo "MODE: $var_print_mode, SIZE: $var_size, PROC: $var_proc, ITER: $num_iter" 45 | file_out="hpgmg-$var_print_mode-s$var_size-p$var_proc.txt" 46 | if [[ $num_iter -eq 1 ]]; then 47 | echo "./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8" &> $file_out 48 | else 49 | printf "\n\n===========================================================================\n\n" &>> $file_out 50 | echo "./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8" &>> $file_out 51 | fi 52 | ./run.sh $var_proc $var_comm $var_async $var_gpu $var_size 8 &>> $file_out 53 | 54 | if [[ $num_iter -eq 5 ]]; then 55 | egrep "use cuda" $file_out 56 | egrep "Total by level" $file_out 57 | fi 58 | done 59 | done 60 | done 61 | done -------------------------------------------------------------------------------- /wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 
exe=$1 4 | shift 5 | params=$* 6 | 7 | extra_params= 8 | lrank=$OMPI_COMM_WORLD_LOCAL_RANK 9 | 10 | USE_GPU=0; 11 | USE_CPU=0; 12 | MP_USE_IB_HCA=mlx5_0; 13 | case ${HOSTNAME} in 14 | *dgx*) 15 | # let's pick: 16 | # GPU #0,2,4,6 17 | # HCA #0,1,2,3 18 | if (( $lrank > 4 )); then echo "too many ranks"; exit; fi 19 | hlrank=$(($lrank / 2)) # 0,1 20 | dlrank=$(($lrank * 2)) # 0,2,4,6 21 | #CUDA_VISIBLE_DEVICES=$dlrank 22 | USE_GPU=${dlrank} 23 | USE_CPU=${hlrank} 24 | HCA=mlx5_${lrank} 25 | MP_USE_IB_HCA=${HCA} 26 | OMPI_MCA_btl_openib_if_include=${HCA} 27 | ;; 28 | 29 | *ivy0*) CUDA_VISIBLE_DEVICES=1; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 30 | *ivy1*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 31 | *ivy2*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 32 | *ivy3*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 33 | *brdw0*) CUDA_VISIBLE_DEVICES=3; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 34 | *brdw1*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 35 | *hsw*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 36 | #*hsw1*) USE_GPU=0; USE_CPU=0; MP_USE_IB_HCA=mlx5_0;; 37 | #Wilkes 38 | *gpu-e-*) CUDA_VISIBLE_DEVICES=0; USE_CPU=0; USE_GPU=0; MP_USE_IB_HCA=mlx5_0; 39 | ;; 40 | esac 41 | 42 | COMM_USE_GPU_ID=$USE_GPU 43 | 44 | echo "" 45 | echo "# ${HOSTNAME}, Local Rank $lrank, GPU:$CUDA_VISIBLE_DEVICES/$USE_GPU CPU:$USE_CPU HCA:$MP_USE_IB_HCA" >&2 46 | 47 | export \ 48 | HPGMG_ENABLE_DEBUG \ 49 | CUDA_VISIBLE_DEVICES CUDA_ERROR_LEVEL CUDA_ERROR_FILE CUDA_FILE_LEVEL CUDA_PASCAL_FORCE_40_BIT \ 50 | MP_USE_IB_HCA USE_IB_HCA USE_CPU USE_GPU COMM_USE_GPU_ID \ 51 | MP_ENABLE_DEBUG MP_ENABLE_WARN GDS_ENABLE_DEBUG \ 52 | MP_DBREC_ON_GPU MP_RX_CQ_ON_GPU MP_TX_CQ_ON_GPU \ 53 | MP_EVENT_ASYNC MP_GUARD_PROGRESS \ 54 | GDS_DISABLE_WRITE64 GDS_DISABLE_INLINECOPY GDS_DISABLE_MEMBAR \ 55 | GDS_DISABLE_WEAK_CONSISTENCY GDS_SIMULATE_WRITE64 \ 56 | COMM_USE_COMM COMM_USE_ASYNC_SA COMM_USE_ASYNC_KI OMP_NUM_THREADS \ 57 | 
OMPI_MCA_btl_openib_if_include \ 58 | GDS_ENABLE_DUMP_MEMOPS \ 59 | USE_MPI \ 60 | LD_LIBRARY_PATH PATH GDS_FLUSHER_TYPE 61 | 62 | #set -x 63 | 64 | if [ ! -z $USE_CPU ]; then 65 | numactl --cpunodebind=${USE_CPU} -l $exe $params $extra_params 66 | else 67 | $exe $params $extra_params 68 | fi 69 | --------------------------------------------------------------------------------