├── .clang-format ├── .cmake-format.py ├── .github ├── CODEOWNERS └── workflows │ ├── cuda_githubactions_build.yml │ ├── cuda_githubactions_build_beta.yml │ └── rocm_githubactions_build.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── NEWS ├── QUDAConfig.cmake.in ├── README.md ├── ci ├── docker │ └── Dockerfile.build └── pipeline.yml ├── cmake ├── CPM.cmake ├── FindEigen.cmake ├── FindLibDL.cmake ├── find_target_cuda_dependencies.cmake └── find_target_hip_dependencies.cmake ├── doc ├── CMakeLists.txt └── Doxyfile.in ├── include ├── accelerator.h ├── array.h ├── blas_3d.h ├── blas_helper.cuh ├── blas_lapack.h ├── blas_quda.h ├── clover_backup.h ├── clover_field.h ├── clover_field_order.h ├── color_spinor.h ├── color_spinor_field.h ├── color_spinor_field_order.h ├── comm_key.h ├── comm_quda.h ├── communicator_quda.h ├── complex_quda.h ├── contract_quda.h ├── convert.h ├── dbldbl.h ├── declare_enum.h ├── deflation.h ├── device.h ├── device_vector.h ├── dirac_quda.h ├── domain_decomposition.h ├── domain_decomposition_helper.cuh ├── domain_wall_helper.h ├── double_single.h ├── dslash.h ├── dslash_helper.cuh ├── dslash_quda.h ├── dslash_shmem.h ├── eigen_helper.h ├── eigensolve_quda.h ├── enum_quda.h ├── enum_quda_fortran.h ├── expand_list.hpp ├── externals │ ├── .clang-format │ ├── CLI11.hpp │ └── json.hpp ├── field_cache.h ├── float_vector.h ├── gamma.cuh ├── gauge_backup.h ├── gauge_field.h ├── gauge_field_order.h ├── gauge_fix_ovr_hit_devf.cuh ├── gauge_path_helper.cuh ├── gauge_path_quda.h ├── gauge_tools.h ├── gauge_update_quda.h ├── hw_quda.h ├── index_helper.cuh ├── inline_ptx.h ├── instantiate.h ├── instantiate_dslash.h ├── int_factor_array.hpp ├── int_list.hpp ├── invert_quda.h ├── invert_x_update.h ├── json_helper.h ├── kernel_helper.h ├── kernels │ ├── blas_3d.cuh │ ├── blas_core.cuh │ ├── block_orthogonalize.cuh │ ├── block_transpose.cuh │ ├── clover_compute.cuh │ ├── clover_deriv.cuh │ ├── clover_invert.cuh │ ├── clover_outer_product.cuh │ ├── 
clover_sigma_outer_product.cuh │ ├── clover_trace.cuh │ ├── coarse_op_kernel.cuh │ ├── coarse_op_kernel_mma.cuh │ ├── coarse_op_preconditioned.cuh │ ├── coarse_op_preconditioned_mma.cuh │ ├── color_spinor_pack.cuh │ ├── color_spinor_project_domain_decomp.cuh │ ├── contraction.cuh │ ├── copy_clover.cuh │ ├── copy_color_spinor.cuh │ ├── copy_color_spinor_mg.cuh │ ├── copy_field_offset.cuh │ ├── copy_gauge.cuh │ ├── copy_gauge_extended.cuh │ ├── covariant_derivative.cuh │ ├── device_vector_axpby.cuh │ ├── dslash_clover_helper.cuh │ ├── dslash_coarse.cuh │ ├── dslash_coarse_mma.cuh │ ├── dslash_domain_wall_4d.cuh │ ├── dslash_domain_wall_4d_fused_m5.cuh │ ├── dslash_domain_wall_5d.cuh │ ├── dslash_domain_wall_m5.cuh │ ├── dslash_gamma_helper.cuh │ ├── dslash_mdw_fused.cuh │ ├── dslash_mobius_eofa.cuh │ ├── dslash_ndeg_twisted_clover.cuh │ ├── dslash_ndeg_twisted_clover_preconditioned.cuh │ ├── dslash_ndeg_twisted_mass.cuh │ ├── dslash_ndeg_twisted_mass_preconditioned.cuh │ ├── dslash_pack.cuh │ ├── dslash_shmem_helper.cuh │ ├── dslash_staggered.cuh │ ├── dslash_twisted_clover_preconditioned.cuh │ ├── dslash_twisted_mass.cuh │ ├── dslash_twisted_mass_preconditioned.cuh │ ├── dslash_wilson.cuh │ ├── dslash_wilson_clover.cuh │ ├── dslash_wilson_clover_hasenbusch_twist.cuh │ ├── dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh │ ├── dslash_wilson_clover_preconditioned.cuh │ ├── evec_project.cuh │ ├── extract_gauge_ghost.cuh │ ├── extract_gauge_ghost_extended.cuh │ ├── field_strength_tensor.cuh │ ├── gauge_ape.cuh │ ├── gauge_det_trace.cuh │ ├── gauge_fix_fft.cuh │ ├── gauge_fix_ovr.cuh │ ├── gauge_force.cuh │ ├── gauge_heatbath.cuh │ ├── gauge_hyp.cuh │ ├── gauge_loop_trace.cuh │ ├── gauge_noise.cuh │ ├── gauge_phase.cuh │ ├── gauge_plaq.cuh │ ├── gauge_plaqrect.cuh │ ├── gauge_polyakov_loop.cuh │ ├── gauge_qcharge.cuh │ ├── gauge_random.cuh │ ├── gauge_stout.cuh │ ├── gauge_update.cuh │ ├── gauge_utils.cuh │ ├── gauge_wilson_flow.cuh │ ├── hisq_paths_force.cuh │ 
├── laplace.cuh │ ├── llfat.cuh │ ├── madwf_tensor.cuh │ ├── madwf_transfer.cuh │ ├── momentum.cuh │ ├── multi_blas_core.cuh │ ├── multi_reduce_core.cuh │ ├── pgauge_exchange.cuh │ ├── pgauge_init.cuh │ ├── prolongator.cuh │ ├── prolongator_mma.cuh │ ├── random_init.cuh │ ├── reduce_core.cuh │ ├── reduce_init.cuh │ ├── restrictor.cuh │ ├── restrictor_mma.cuh │ ├── spin_duplicate.cuh │ ├── spin_taste.cuh │ ├── spinor_dilute.cuh │ ├── spinor_noise.cuh │ ├── spinor_reweight.cuh │ ├── staggered_coarse_op_kernel.cuh │ ├── staggered_kd_apply_xinv_kernel.cuh │ ├── staggered_kd_reorder_xinv_kernel.cuh │ ├── staggered_outer_product.cuh │ ├── staggered_prolong_restrict.cuh │ ├── staggered_quark_smearing.cuh │ ├── staggered_two_link.cuh │ ├── transform_reduce.cuh │ ├── unitarize_force.cuh │ └── unitarize_links.cuh ├── ks_force_quda.h ├── ks_improved_force.h ├── ks_qsmear.h ├── lattice_field.h ├── layout_hyper.h ├── linalg.cuh ├── llfat_quda.h ├── madwf_ml.h ├── madwf_param.h ├── malloc_quda.h ├── matrix_field.h ├── matrix_tile.cuh ├── milc_interface_internal.hpp ├── momentum.h ├── monitor.h ├── mpi_comm_handle.h ├── multi_blas_helper.cuh ├── multigrid.h ├── multigrid_helper.cuh ├── numa_affinity.h ├── object.h ├── pgauge_monte.h ├── polynomial.h ├── power_of_two_array.h ├── qio_field.h ├── quda.h ├── quda_api.h ├── quda_arch.h ├── quda_arpack_interface.h ├── quda_constants.h ├── quda_define.h.in ├── quda_fortran.h ├── quda_internal.h ├── quda_matrix.h ├── quda_milc_interface.h ├── quda_ptr.h ├── random_quda.h ├── reducer.h ├── reference_wrapper_helper.h ├── register_traits.h ├── reliable_updates.h ├── shmem_helper.cuh ├── shmem_pack_helper.cuh ├── spin_taste.h ├── split_grid.h ├── staggered_kd_build_xinv.h ├── staggered_oprod.h ├── su3_project.cuh ├── svd_quda.h ├── targets │ ├── cuda │ │ ├── FFT_Plans.h │ │ ├── aos.h │ │ ├── atomic_helper.h │ │ ├── atomic_helper_cas.h │ │ ├── block_reduce_helper.h │ │ ├── block_reduction_kernel.h │ │ ├── constant_kernel_arg.h │ │ ├── 
device.in.hpp │ │ ├── externals │ │ │ ├── generics │ │ │ │ ├── detail │ │ │ │ │ ├── alias.h │ │ │ │ │ └── array.h │ │ │ │ ├── ldg.h │ │ │ │ └── shfl.h │ │ │ ├── jitify.hpp │ │ │ └── trove │ │ │ │ ├── aos.h │ │ │ │ ├── array.h │ │ │ │ ├── block.h │ │ │ │ ├── detail │ │ │ │ ├── dismember.h │ │ │ │ └── fallback.h │ │ │ │ ├── memory.h │ │ │ │ ├── print_array.h │ │ │ │ ├── ptr.h │ │ │ │ ├── rotate.h │ │ │ │ ├── shfl.h │ │ │ │ ├── static_gcd.h │ │ │ │ ├── static_mod_inverse.h │ │ │ │ ├── transpose.h │ │ │ │ ├── utility.h │ │ │ │ └── warp.h │ │ ├── fast_intdiv.h │ │ ├── jitify_helper.h │ │ ├── jitify_options.hpp.in │ │ ├── kernel.h │ │ ├── load_store.h │ │ ├── math_helper.cuh │ │ ├── math_helper.h │ │ ├── mdw_dslash5_tensor_core.cuh │ │ ├── mma_tensor_op │ │ │ ├── gemm.cuh │ │ │ ├── gmem_loader.cuh │ │ │ ├── hmma_m16n16k4_sm70.cuh │ │ │ ├── hmma_m16n8k8_sm70.cuh │ │ │ ├── hmma_m16n8k8_sm80.cuh │ │ │ ├── hmma_tfloat32_sm80.cuh │ │ │ ├── mma_dispatch.cuh │ │ │ ├── mma_instruction.cuh │ │ │ ├── shared_memory_pattern.cuh │ │ │ ├── simt.cuh │ │ │ ├── simt_half.cuh │ │ │ ├── smma_m16n16k4_sm70.cuh │ │ │ ├── smma_m16n8_sm80.cuh │ │ │ └── smma_m16n8k8_sm70.cuh │ │ ├── pipeline.cuh │ │ ├── quda_cuda_api.h │ │ ├── quda_fp16.cuh │ │ ├── random_helper.h │ │ ├── reduce_helper.h │ │ ├── reduction_kernel.h │ │ ├── shared_memory_cache_helper.h │ │ ├── shared_memory_helper.h │ │ ├── target_device.h │ │ ├── thread_local_cache.h │ │ ├── tma_helper.hpp │ │ ├── tunable_kernel.h │ │ └── warp_collective.h │ ├── generic │ │ ├── FFT_Plans.h │ │ ├── aos.h │ │ ├── block_reduce_helper.h │ │ ├── block_reduction_kernel_host.h │ │ ├── fast_intdiv.h │ │ ├── kernel_host.h │ │ ├── kernel_ops.h │ │ ├── kernel_ops_target.h │ │ ├── load_store.h │ │ ├── math_helper.h │ │ ├── mrg32k3a.h │ │ ├── random_helper.h │ │ ├── reduce_helper.h │ │ ├── reduction_kernel_host.h │ │ ├── shared_memory_cache_helper.h │ │ ├── thread_array.h │ │ └── thread_local_cache.h │ └── hip │ │ ├── FFT_Plans.h │ │ ├── atomic_helper.h │ │ 
├── block_reduce_helper.h │ │ ├── block_reduction_kernel.h │ │ ├── constant_kernel_arg.h │ │ ├── kernel.h │ │ ├── load_store.h │ │ ├── math_helper.cuh │ │ ├── math_helper.h │ │ ├── quda_hip_api.h │ │ ├── random_helper.h │ │ ├── reduce_helper.h │ │ ├── reduction_kernel.h │ │ ├── shared_memory_cache_helper.h │ │ ├── shared_memory_helper.h │ │ ├── target_device.h │ │ ├── tunable_kernel.h │ │ └── warp_collective.h ├── timer.h ├── transfer.h ├── transform_reduce.h ├── tunable_block_reduction.h ├── tunable_nd.h ├── tunable_reduction.h ├── tune_key.h ├── tune_quda.h ├── uint_to_char.h ├── unitarization_links.h ├── util_quda.h ├── vector_io.h └── worker.h ├── jenkins ├── bqcd.config.cmake ├── milc.config.cmake └── twistedmass.config.cmake ├── lib ├── .directory ├── CMakeLists.txt ├── blas_3d.cu ├── blas_quda.cu ├── block_orthogonalize.in.cpp ├── block_orthogonalize.in.cu ├── block_transpose.in.cu ├── check_params.h ├── checksum.cu ├── clover_deriv_quda.cu ├── clover_field.cpp ├── clover_force.cpp ├── clover_invert.cu ├── clover_outer_product.cu ├── clover_quda.cu ├── clover_sigma_outer_product.cu ├── clover_trace_quda.cu ├── coarse_op.cuh ├── coarse_op.in.cpp ├── coarse_op.in.cu ├── coarse_op_mma_launch.h ├── coarse_op_preconditioned.in.cpp ├── coarse_op_preconditioned.in.cu ├── coarse_op_preconditioned_mma_launch.h ├── coarsecoarse_op.hpp ├── coarsecoarse_op.in.cpp ├── coarsecoarse_op.in.cu ├── coarsecoarse_op_mma.in.cu ├── color_spinor_field.cpp ├── color_spinor_pack.in.cu ├── color_spinor_project_domain_decomp.cu ├── color_spinor_util.in.cu ├── comm_common.cpp ├── communicator_mpi.cpp ├── communicator_qmp.cpp ├── communicator_single.cpp ├── communicator_stack.cpp ├── contract.cu ├── copy_clover.cu ├── copy_clover_offset.cu ├── copy_color_spinor.cpp ├── copy_color_spinor.cuh ├── copy_color_spinor_dd.cu ├── copy_color_spinor_dh.cu ├── copy_color_spinor_dq.cu ├── copy_color_spinor_ds.cu ├── copy_color_spinor_hd.cu ├── copy_color_spinor_hh.cu ├── copy_color_spinor_hq.cu ├── 
copy_color_spinor_hs.cu ├── copy_color_spinor_mg.in.hpp ├── copy_color_spinor_mg_dd.cu ├── copy_color_spinor_mg_ds.cu ├── copy_color_spinor_mg_hh.cu ├── copy_color_spinor_mg_hq.cu ├── copy_color_spinor_mg_hs.cu ├── copy_color_spinor_mg_qh.cu ├── copy_color_spinor_mg_qq.cu ├── copy_color_spinor_mg_qs.cu ├── copy_color_spinor_mg_sd.cu ├── copy_color_spinor_mg_sh.cu ├── copy_color_spinor_mg_sq.cu ├── copy_color_spinor_mg_ss.cu ├── copy_color_spinor_offset.cu ├── copy_color_spinor_qd.cu ├── copy_color_spinor_qh.cu ├── copy_color_spinor_qq.cu ├── copy_color_spinor_qs.cu ├── copy_color_spinor_sd.cu ├── copy_color_spinor_sh.cu ├── copy_color_spinor_sq.cu ├── copy_color_spinor_ss.cu ├── copy_field_offset.hpp ├── copy_gauge.in.cpp ├── copy_gauge_double.cu ├── copy_gauge_extended.cu ├── copy_gauge_half.cu ├── copy_gauge_helper.hpp ├── copy_gauge_inc.cu ├── copy_gauge_mg.in.cu ├── copy_gauge_offset.cu ├── copy_gauge_quarter.cu ├── copy_gauge_single.cu ├── covariant_derivative.cu ├── deflation.cpp ├── device_vector.cu ├── dirac.cpp ├── dirac_clover.cpp ├── dirac_clover_hasenbusch_twist.cpp ├── dirac_coarse.cpp ├── dirac_domain_wall.cpp ├── dirac_domain_wall_4d.cpp ├── dirac_improved_staggered.cpp ├── dirac_improved_staggered_kd.cpp ├── dirac_mobius.cpp ├── dirac_staggered.cpp ├── dirac_staggered_kd.cpp ├── dirac_twisted_clover.cpp ├── dirac_twisted_mass.cpp ├── dirac_wilson.cpp ├── dslash5_domain_wall.cu ├── dslash5_mobius_eofa.cu ├── dslash_clover_helper.cu ├── dslash_coarse.hpp ├── dslash_coarse.in.cpp ├── dslash_coarse.in.cu ├── dslash_coarse_mma.in.cu ├── dslash_coarse_mma.in.hpp ├── dslash_constant_arg.cu ├── dslash_domain_wall_4d.cpp ├── dslash_domain_wall_4d.hpp ├── dslash_domain_wall_4d.in.cu ├── dslash_domain_wall_4d_fused_m5.hpp ├── dslash_domain_wall_4d_fused_m5.in.cu ├── dslash_domain_wall_4d_m5inv.cpp ├── dslash_domain_wall_4d_m5inv.hpp ├── dslash_domain_wall_4d_m5inv.in.cu ├── dslash_domain_wall_4d_m5inv_m5inv.cpp ├── dslash_domain_wall_4d_m5inv_m5pre.cpp ├── 
dslash_domain_wall_4d_m5mob.cpp ├── dslash_domain_wall_4d_m5pre.cpp ├── dslash_domain_wall_4d_m5pre_m5inv.cpp ├── dslash_domain_wall_4d_m5pre_m5mob.cpp ├── dslash_domain_wall_5d.cpp ├── dslash_domain_wall_5d.hpp ├── dslash_domain_wall_5d.in.cu ├── dslash_gamma_helper.cu ├── dslash_improved_staggered.cpp ├── dslash_improved_staggered.hpp ├── dslash_improved_staggered.in.cu ├── dslash_index.cuh ├── dslash_mdw_fused.in.cu ├── dslash_mdw_fused.in.hpp ├── dslash_mdw_fused_impl.hpp ├── dslash_ndeg_twisted_clover.cpp ├── dslash_ndeg_twisted_clover.hpp ├── dslash_ndeg_twisted_clover.in.cu ├── dslash_ndeg_twisted_clover_preconditioned.cpp ├── dslash_ndeg_twisted_clover_preconditioned.hpp ├── dslash_ndeg_twisted_clover_preconditioned.in.cu ├── dslash_ndeg_twisted_mass.cpp ├── dslash_ndeg_twisted_mass.hpp ├── dslash_ndeg_twisted_mass.in.cu ├── dslash_ndeg_twisted_mass_preconditioned.cpp ├── dslash_ndeg_twisted_mass_preconditioned.hpp ├── dslash_ndeg_twisted_mass_preconditioned.in.cu ├── dslash_pack2.cu ├── dslash_policy.hpp ├── dslash_quda.cu ├── dslash_staggered.cpp ├── dslash_staggered.hpp ├── dslash_staggered.in.cu ├── dslash_twisted_clover.cpp ├── dslash_twisted_clover.hpp ├── dslash_twisted_clover.in.cu ├── dslash_twisted_clover_preconditioned.cpp ├── dslash_twisted_clover_preconditioned.hpp ├── dslash_twisted_clover_preconditioned.in.cu ├── dslash_twisted_mass.cpp ├── dslash_twisted_mass.hpp ├── dslash_twisted_mass.in.cu ├── dslash_twisted_mass_preconditioned.cpp ├── dslash_twisted_mass_preconditioned.hpp ├── dslash_twisted_mass_preconditioned.in.cu ├── dslash_wilson.cpp ├── dslash_wilson.hpp ├── dslash_wilson.in.cu ├── dslash_wilson_clover.cpp ├── dslash_wilson_clover.hpp ├── dslash_wilson_clover.in.cu ├── dslash_wilson_clover_distance.cpp ├── dslash_wilson_clover_distance.in.cu ├── dslash_wilson_clover_hasenbusch_twist.cpp ├── dslash_wilson_clover_hasenbusch_twist.hpp ├── dslash_wilson_clover_hasenbusch_twist.in.cu ├── 
dslash_wilson_clover_hasenbusch_twist_preconditioned.cpp ├── dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp ├── dslash_wilson_clover_hasenbusch_twist_preconditioned_clovinv.in.cu ├── dslash_wilson_clover_hasenbusch_twist_preconditioned_no_clovinv.in.cu ├── dslash_wilson_clover_preconditioned.cpp ├── dslash_wilson_clover_preconditioned.hpp ├── dslash_wilson_clover_preconditioned.in.cu ├── dslash_wilson_clover_preconditioned_distance.cpp ├── dslash_wilson_clover_preconditioned_distance.in.cu ├── dslash_wilson_distance.cpp ├── dslash_wilson_distance.in.cu ├── eig_block_trlm.cpp ├── eig_iram.cpp ├── eig_trlm.cpp ├── eig_trlm_3d.cpp ├── eigensolve_quda.cpp ├── evec_project.cu ├── extract_gauge_ghost.in.cu ├── extract_gauge_ghost_extended.cu ├── extract_gauge_ghost_helper.cuh ├── extract_gauge_ghost_mg.in.cu ├── field_cache.cpp ├── gauge_ape.cu ├── gauge_covdev.cpp ├── gauge_field.cpp ├── gauge_field_strength_tensor.cu ├── gauge_fix_fft.cu ├── gauge_fix_ovr.cu ├── gauge_force.cu ├── gauge_hyp.cu ├── gauge_laplace.cpp ├── gauge_loop_trace.cu ├── gauge_noise.in.cu ├── gauge_norm.in.cu ├── gauge_observable.cpp ├── gauge_phase.cu ├── gauge_plaq.cu ├── gauge_plaqrect.cu ├── gauge_polyakov_loop.cu ├── gauge_qcharge.cu ├── gauge_random.cu ├── gauge_stout.cu ├── gauge_update_quda.cu ├── gauge_wilson_flow.cu ├── generate │ ├── nvtx.w │ └── wrap.py ├── hisq_paths_force_quda.cu ├── instantiate.cpp ├── interface │ ├── CMakeLists.txt │ ├── blas_interface.cpp │ └── fortran_interface.cpp ├── interface_quda.cpp ├── inv_bicgstab_quda.cpp ├── inv_bicgstabl_quda.cpp ├── inv_ca_cg.cpp ├── inv_ca_gcr.cpp ├── inv_cg3_quda.cpp ├── inv_cg_quda.cpp ├── inv_cgne.cpp ├── inv_cgnr.cpp ├── inv_eigcg_quda.cpp ├── inv_gcr_quda.cpp ├── inv_gmresdr_quda.cpp ├── inv_mr_quda.cpp ├── inv_mre.cpp ├── inv_msrc_cg_quda.cpp ├── inv_multi_cg_quda.cpp ├── inv_pcg_quda.cpp ├── inv_sd_quda.cpp ├── laplace.cpp ├── laplace.hpp ├── laplace.in.cu ├── lattice_field.cpp ├── layout_hyper.cpp ├── llfat_quda.cu 
├── madwf_ml.cpp ├── madwf_tensor.cu ├── madwf_transfer.cu ├── madwf_transfer.h ├── max_clover.cu ├── milc_interface.cpp ├── milc_interface_internal.cpp ├── momentum.cu ├── monitor.cpp ├── multi_blas_quda.cu ├── multi_reduce_quda.cu ├── multigrid.cpp ├── multigrid.in.hpp ├── numa_affinity.cpp ├── nvtx_pmpi.c ├── pgauge_det_trace.cu ├── pgauge_exchange.cu ├── pgauge_heatbath.cu ├── pgauge_init.cu ├── prolongator.in.cpp ├── prolongator.in.cu ├── prolongator_mma.in.cu ├── qio_field.cpp ├── quda_arpack_interface.cpp ├── quda_fortran.F90 ├── quda_ptr.cpp ├── random.cu ├── reduce_helper.cu ├── reduce_quda.cu ├── restrictor.in.cpp ├── restrictor.in.cu ├── restrictor_mma.in.cu ├── solve.cpp ├── solver.cpp ├── solver.hpp ├── spin_duplicate.in.cu ├── spin_taste.cu ├── spinor_dilute.in.cu ├── spinor_noise.in.cu ├── spinor_reweight.cu ├── staggered_coarse_op.in.cpp ├── staggered_coarse_op.in.cu ├── staggered_kd_apply_xinv.cu ├── staggered_kd_build_xinv.cu ├── staggered_kd_reorder_xinv.cu ├── staggered_oprod.cu ├── staggered_prolong_restrict.cu ├── staggered_quark_smearing.cu ├── staggered_two_link_quda.cu ├── targets │ ├── cuda │ │ ├── CMakeLists.txt │ │ ├── blas_lapack_cublas.cpp │ │ ├── comm_target.cpp │ │ ├── device.cpp │ │ ├── jitify_helper.cpp │ │ ├── malloc.cpp │ │ ├── quda_api.cpp │ │ └── target_cuda.cmake │ ├── generic │ │ ├── CMakeLists.txt │ │ └── blas_lapack_eigen.cpp │ └── hip │ │ ├── CMakeLists.txt │ │ ├── blas_lapack_hipblas.cpp │ │ ├── comm_target.cpp │ │ ├── device.cpp │ │ ├── malloc.cpp │ │ ├── quda_api.cpp │ │ └── target_hip.cmake ├── timer.cpp ├── transfer.cpp ├── transform_reduce.cu ├── tune.cpp ├── unitarize_force_quda.cu ├── unitarize_links_quda.cu ├── util_quda.cpp ├── vector_io.cpp └── version.cpp └── tests ├── CMakeLists.txt ├── asan.h ├── blas_interface_test.cpp ├── blas_interface_test_gtest.hpp ├── blas_test.cpp ├── c_interface_test.c ├── clover_force_test.cpp ├── contract_ft_test.cpp ├── contract_ft_test_gtest.hpp ├── covdev_test.cpp ├── 
covdev_test_gtest.hpp ├── deflated_invert_test.cpp ├── dilution_test.cpp ├── dslash_ctest.cpp ├── dslash_test.cpp ├── dslash_test_utils.h ├── eigensolve_test.cpp ├── eigensolve_test_gtest.hpp ├── gauge_alg_test.cpp ├── gauge_path_test.cpp ├── googletest ├── include │ └── gtest │ │ ├── gtest-death-test.h │ │ ├── gtest-matchers.h │ │ ├── gtest-message.h │ │ ├── gtest-param-test.h │ │ ├── gtest-printers.h │ │ ├── gtest-spi.h │ │ ├── gtest-test-part.h │ │ ├── gtest-typed-test.h │ │ ├── gtest.h │ │ ├── gtest_pred_impl.h │ │ ├── gtest_prod.h │ │ └── internal │ │ ├── custom │ │ ├── README.md │ │ ├── gtest-port.h │ │ ├── gtest-printers.h │ │ └── gtest.h │ │ ├── gtest-death-test-internal.h │ │ ├── gtest-filepath.h │ │ ├── gtest-internal.h │ │ ├── gtest-param-util.h │ │ ├── gtest-port-arch.h │ │ ├── gtest-port.h │ │ ├── gtest-string.h │ │ ├── gtest-type-util.h │ │ └── gtest-type-util.h.pump └── src │ ├── gtest-all.cc │ ├── gtest-death-test.cc │ ├── gtest-filepath.cc │ ├── gtest-internal-inl.h │ ├── gtest-matchers.cc │ ├── gtest-port.cc │ ├── gtest-printers.cc │ ├── gtest-test-part.cc │ ├── gtest-typed-test.cc │ ├── gtest.cc │ └── gtest_main.cc ├── heatbath_test.cpp ├── hisq_paths_force_test.cpp ├── hisq_stencil_ctest.cpp ├── hisq_stencil_test.cpp ├── hisq_stencil_test_utils.h ├── hisq_unitarize_force_test.cpp ├── host_reference ├── CMakeLists.txt ├── README.md ├── blas_reference.cpp ├── blas_reference.h ├── clover_force_reference.cpp ├── clover_force_reference.h ├── clover_reference.cpp ├── contract_ft_reference.h ├── contract_reference.h ├── covdev_reference.cpp ├── covdev_reference.h ├── domain_wall_dslash_reference.cpp ├── domain_wall_dslash_reference.h ├── dslash_reference.cpp ├── dslash_reference.h ├── dslash_test_helpers.cpp ├── dslash_test_helpers.h ├── gamma_reference.h ├── gauge_force_reference.cpp ├── gauge_force_reference.h ├── hisq_force_reference.cpp ├── hisq_force_reference.h ├── staggered_dslash_reference.cpp ├── staggered_dslash_reference.h ├── 
wilson_dslash_reference.cpp └── wilson_dslash_reference.h ├── invert_test.cpp ├── invert_test_gtest.hpp ├── io_test.cpp ├── laph_test.cpp ├── llfat_test.cpp ├── multigrid_benchmark_test.cpp ├── multigrid_evolve_test.cpp ├── new_half.cu ├── pack_test.cpp ├── plaq_test.cpp ├── sanity_check.sh ├── scale_staggered_dslash_test.sh ├── scale_wilson_dslash_test.sh ├── sim_scale_staggered_dslash.sh ├── sim_scale_staggered_dslash_no_comms.sh ├── sim_scale_wilson_dslash.sh ├── sim_scale_wilson_dslash_no_comms.sh ├── staggered_dslash_ctest.cpp ├── staggered_dslash_test.cpp ├── staggered_dslash_test_utils.h ├── staggered_eigensolve_test.cpp ├── staggered_eigensolve_test_gtest.hpp ├── staggered_gauge_utils.cpp ├── staggered_gsmear_test.cpp ├── staggered_gsmear_test_utils.h ├── staggered_invert_test.cpp ├── staggered_invert_test_gtest.hpp ├── su3_fermion_test.cpp ├── su3_test.cpp ├── test.h ├── tune_test.cpp ├── unitarize_link_test.cpp └── utils ├── CMakeLists.txt ├── README.md ├── command_line_params.cpp ├── command_line_params.h ├── face_gauge.cpp ├── force_utils.hpp ├── gauge_utils.cpp ├── gauge_utils.h ├── host_blas.cpp ├── host_utils.cpp ├── host_utils.h ├── index_utils.cpp ├── index_utils.hpp ├── instantiate_host.hpp ├── llfat_utils.cpp ├── llfat_utils.h ├── misc.cpp ├── misc.h ├── momentum_utils.cpp ├── momentum_utils.h ├── rng_utils.hpp ├── set_params.cpp ├── short.h ├── staggered_gauge_utils.cpp ├── staggered_gauge_utils.h └── staggered_host_utils.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Webkit 3 | IndentWidth: 2 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignTrailingComments: true 7 | AllowShortBlocksOnASingleLine: true 8 | AllowShortCaseLabelsOnASingleLine : true 9 | AllowShortIfStatementsOnASingleLine: true 10 | AllowShortLoopsOnASingleLine: true 11 | BreakBeforeBraces: Linux 12 | BreakBeforeTernaryOperators: false 13 | BreakConstructorInitializers: AfterColon 
14 | ColumnLimit: 120 15 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 16 | ConstructorInitializerIndentWidth: 2 17 | ContinuationIndentWidth: 2 18 | Cpp11BracedListStyle: true 19 | FixNamespaceComments: true 20 | NamespaceIndentation: All 21 | PenaltyExcessCharacter: 10 22 | PointerAlignment: Right 23 | SortIncludes: false 24 | SpaceBeforeAssignmentOperators: true 25 | CommentPragmas: '^\\.+' 26 | UseTab: Never 27 | ... 28 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # global catch call if not 5 | * @lattice/quda_core 6 | 7 | # CMake Maintainers 8 | CMakeLists.txt @lattice/cmake_maintainers 9 | *.cmake @lattice/cmake_maintainers 10 | 11 | # CUDA Maintainers 12 | cuda/ @lattice/target_cuda_maintainers 13 | 14 | # HIP Maintainers 15 | hip/ @lattice/target_hip_maintainers 16 | 17 | # SYCL Maintainers 18 | sycl/ @lattice/target_sycl_maintainers 19 | 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.f90 3 | *.mod 4 | *.a 5 | *~ 6 | tests/*_test 7 | milc_interface/* 8 | *#* 9 | *.pyc 10 | tunecache.tsv 11 | profile.tsv 12 | config.log 13 | CMakeCache.txt 14 | CMakeFiles 15 | externals 16 | !include/externals 17 | include/quda_define.h 18 | include/jitify_options.hpp 19 | .tags* 20 | autom4te.cache/* 21 | .vscode 22 | cmake/CPM_*.cmake 23 | -------------------------------------------------------------------------------- /QUDAConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | 
include(CMakeFindDependencyMacro) 4 | 5 | set(QUDA_QMP @QUDA_QMP@) 6 | set(QUDA_MPI @QUDA_MPI@) 7 | set(QUDA_QIO @QUDA_QIO@) 8 | set(QUDA_OPENMP @QUDA_OPENMP@) 9 | set(QUDA_QDPJIT @QUDA_QDPJIT@) 10 | set(QUDA_GITVERSION @GITVERSION@) 11 | set(QUDA_PRECISION @QUDA_PRECISION@) 12 | set(QUDA_RECONSTRUCT @QUDA_RECONSTRUCT@) 13 | 14 | set(QUDA_TARGET_CUDA @QUDA_TARGET_CUDA@) 15 | set(QUDA_TARGET_HIP @QUDA_TARGET_HIP@) 16 | 17 | set(QUDA_NVSHMEM @QUDA_NVSHMEM@) 18 | 19 | if( QUDA_QMP AND QUDA_MPI ) 20 | message(FATAL_ERROR "Cannot have both QMP and MPI configured") 21 | endif() 22 | 23 | # Everyone needs this 24 | find_dependency(Threads REQUIRED) 25 | 26 | if( QUDA_QMP ) 27 | find_dependency(QMP REQUIRED) 28 | endif() 29 | 30 | if( QUDA_MPI ) 31 | find_dependency(MPI REQUIRED) 32 | endif() 33 | 34 | if( QUDA_QIO ) 35 | find_dependency(QIO REQUIRED) 36 | endif() 37 | 38 | if( QUDA_OPENMP ) 39 | find_dependency(OpenMP REQUIRED) 40 | endif() 41 | 42 | if( QUDA_TARGET_CUDA ) 43 | include(${CMAKE_CURRENT_LIST_DIR}/find_target_cuda_dependencies.cmake) 44 | elseif(QUDA_TARGET_HIP ) 45 | include(${CMAKE_CURRENT_LIST_DIR}/find_target_hip_dependencies.cmake ) 46 | endif() 47 | 48 | if( QUDA_QDPJIT ) 49 | find_dependency( QDPXX REQUIRED ) 50 | endif() 51 | 52 | include(${CMAKE_CURRENT_LIST_DIR}/QUDATargets.cmake) 53 | 54 | 55 | -------------------------------------------------------------------------------- /ci/docker/Dockerfile.build: -------------------------------------------------------------------------------- 1 | FROM docker.io/nvidia/cuda:12.6.3-devel-ubuntu24.04 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN echo "Running CSCS CI on $(nproc) processors" 6 | 7 | RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \ 8 | build-essential \ 9 | cmake \ 10 | wget \ 11 | ninja-build && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | ARG MPICH_VERSION=3.3.2 15 | ARG MPICH_PATH=/usr/local/mpich 16 | RUN wget -q 
https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \ 17 | tar -xzf mpich-${MPICH_VERSION}.tar.gz && \ 18 | cd mpich-${MPICH_VERSION} && \ 19 | ./configure \ 20 | --disable-fortran \ 21 | --prefix=$MPICH_PATH && \ 22 | make install -j$(nproc) && \ 23 | rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION} 24 | 25 | RUN echo "${MPICH_PATH}/lib" >> /etc/ld.so.conf.d/cscs.conf && ldconfig 26 | 27 | COPY . /quda/src 28 | 29 | ENV QUDA_TEST_GRID_SIZE="1 1 2 2" 30 | 31 | RUN QUDA_TEST_GRID_SIZE=$QUDA_TEST_GRID_SIZE cmake -S /quda/src \ 32 | -DCMAKE_CUDA_COMPILER=nvcc \ 33 | -DCMAKE_CXX_COMPILER=/usr/local/mpich/bin/mpicxx \ 34 | -DCMAKE_C_COMPILER=/usr/local/mpich/bin/mpicc \ 35 | -DCMAKE_BUILD_TYPE=STRICT \ 36 | -DQUDA_CTEST_LAUNCH="" \ 37 | -DQUDA_GPU_ARCH=sm_90 \ 38 | -DQUDA_MULTIGRID=ON \ 39 | -DQUDA_MULTIGRID_NVEC_LIST=6 \ 40 | -DQUDA_MDW_FUSED_LS_LIST=4 \ 41 | -DQUDA_MPI=ON \ 42 | -DQUDA_DIRAC_DISTANCE_PRECONDITIONING=ON \ 43 | -DQUDA_DIRAC_DEFAULT_OFF=ON \ 44 | -DQUDA_DIRAC_WILSON=ON \ 45 | -DQUDA_DIRAC_CLOVER=ON \ 46 | -DQUDA_DIRAC_TWISTED_CLOVER=ON \ 47 | -DQUDA_DIRAC_STAGGERED=ON \ 48 | -DQUDA_DIRAC_LAPLACE=ON \ 49 | -DQUDA_DIRAC_COVDEV=ON \ 50 | -GNinja \ 51 | -B /quda/build 52 | 53 | RUN cmake --build /quda/build -j $(nproc) 54 | 55 | RUN cmake --install /quda/build 56 | -------------------------------------------------------------------------------- /ci/pipeline.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml" 3 | 4 | stages: 5 | - build 6 | - test 7 | 8 | variables: 9 | PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/quda/public/build:$CI_COMMIT_SHORT_SHA 10 | 11 | build_job: 12 | stage: build 13 | extends: .container-builder-cscs-gh200 14 | variables: 15 | DOCKERFILE: ci/docker/Dockerfile.build 16 | 17 | test_job: 18 | stage: test 19 | extends: .container-runner-daint-gh200 
20 | image: $PERSIST_IMAGE_NAME 21 | script: 22 | - export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH 23 | - export LD_PRELOAD=/usr/lib64/libmpi_gtl_cuda.so 24 | - if [[ $SLURM_LOCALID == "0" ]]; then rm -rf /quda/build/Testing && ln -s /dev/shm /quda/build/Testing; fi 25 | - sleep 1 26 | - ctest --test-dir /quda/build/ --output-on-failure 27 | variables: 28 | CRAY_CUDA_MPS: 0 29 | NVIDIA_VISIBLE_DEVICES: all 30 | SLURM_JOB_NUM_NODES: 1 31 | SLURM_NTASKS: 4 32 | SLURM_PARTITION: normal 33 | SLURM_TIMELIMIT: "0:30:00" 34 | USE_MPI: "YES" 35 | QUDA_ENABLE_TUNING: 0 36 | QUDA_RESOURCE_PATH: . 37 | SLURM_MPI_TYPE: cray_shasta 38 | CSCS_ADDITIONAL_MOUNTS: '["/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/lib/libmpi.so:/usr/local/mpich/lib/libmpi.so.12.1.8", "/opt/cray/pe/lib64/libpmi.so.0:/usr/lib64/libpmi.so.0", "/opt/cray/pe/lib64/libpmi2.so.0:/usr/lib64/libpmi2.so.0", "/opt/cray/pals/1.4/lib/libpals.so.0:/usr/lib64/libpals.so.0", "/usr/lib64/libgfortran.so.5:/usr/lib64/libgfortran.so.5", "/opt/cray/pe/mpich/8.1.28/gtl/lib/libmpi_gtl_cuda.so:/usr/lib64/libmpi_gtl_cuda.so"]' 39 | -------------------------------------------------------------------------------- /cmake/CPM.cmake: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors 4 | 5 | set(CPM_DOWNLOAD_VERSION 0.40.2) 6 | set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") 7 | 8 | if(CPM_SOURCE_CACHE) 9 | set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 10 | elseif(DEFINED ENV{CPM_SOURCE_CACHE}) 11 | set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 12 | else() 13 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 14 | endif() 15 | 16 | # Expand relative path. 
This is important if the provided path contains a tilde (~) 17 | get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) 18 | 19 | file(DOWNLOAD 20 | https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake 21 | ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} 22 | ) 23 | 24 | include(${CPM_DOWNLOAD_LOCATION}) 25 | -------------------------------------------------------------------------------- /cmake/FindLibDL.cmake: -------------------------------------------------------------------------------- 1 | # - Find libdl 2 | # Find the native LIBDL includes and library 3 | # 4 | # LIBDL_INCLUDE_DIR - where to find dlfcn.h, etc. 5 | # LIBDL_LIBRARIES - List of libraries when using libdl. 6 | # LIBDL_FOUND - True if libdl found. 7 | 8 | 9 | IF (LIBDL_INCLUDE_DIR) 10 | # Already in cache, be silent 11 | SET(LIBDL_FIND_QUIETLY TRUE) 12 | ENDIF (LIBDL_INCLUDE_DIR) 13 | 14 | FIND_PATH(LIBDL_INCLUDE_DIR dlfcn.h) 15 | 16 | SET(LIBDL_NAMES dl libdl ltdl libltdl) 17 | FIND_LIBRARY(LIBDL_LIBRARY NAMES ${LIBDL_NAMES} ) 18 | 19 | # handle the QUIETLY and REQUIRED arguments and set LIBDL_FOUND to TRUE if 20 | # all listed variables are TRUE 21 | INCLUDE(FindPackageHandleStandardArgs) 22 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibDL DEFAULT_MSG LIBDL_LIBRARY LIBDL_INCLUDE_DIR) 23 | 24 | IF(LIBDL_FOUND) 25 | SET( LIBDL_LIBRARIES ${LIBDL_LIBRARY} ) 26 | ELSE(LIBDL_FOUND) 27 | SET( LIBDL_LIBRARIES ) 28 | ENDIF(LIBDL_FOUND) 29 | 30 | MARK_AS_ADVANCED( LIBDL_LIBRARY LIBDL_INCLUDE_DIR ) 31 | -------------------------------------------------------------------------------- /cmake/find_target_cuda_dependencies.cmake: -------------------------------------------------------------------------------- 1 | # CUDA Specific CMake 2 | 3 | enable_language(CUDA) 4 | 5 | find_dependency(CUDAToolkit REQUIRED) 6 | 7 | -------------------------------------------------------------------------------- /cmake/find_target_hip_dependencies.cmake: 
-------------------------------------------------------------------------------- 1 | # HIP Specific CMake 2 | enable_language(HIP) 3 | 4 | if (NOT DEFINED ROCM_PATH ) 5 | if (NOT DEFINED ENV{ROCM_PATH} ) 6 | set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm path") 7 | else() 8 | set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "ROCm path") 9 | endif() 10 | endif() 11 | 12 | set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake" ${CMAKE_MODULE_PATH}) 13 | find_dependency(HIP REQUIRED) 14 | find_dependency(hipfft REQUIRED) 15 | find_dependency(hiprand REQUIRED) 16 | find_dependency(rocrand REQUIRED) 17 | find_dependency(hipblas REQUIRED) 18 | find_dependency(rocblas REQUIRED) 19 | find_dependency(hipcub REQUIRED) 20 | find_dependency(rocprim REQUIRED) 21 | -------------------------------------------------------------------------------- /doc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # add doxygen add doxygen documentation note that cmake 3.9 introduced a nicer way to do this but we don't want to 2 | # require cmake 3.9 by default yet 3 | 4 | option(QUDA_GENERATE_DOXYGEN "generate doxygen documentation") 5 | 6 | if(QUDA_GENERATE_DOXYGEN) 7 | find_package(Doxygen) 8 | 9 | if(DOXYGEN_FOUND) 10 | if(DOXYGEN_DOT_FOUND) 11 | get_filename_component(DOXYGEN_DOT_PATH ${DOXYGEN_DOT_EXECUTABLE} DIRECTORY) 12 | endif() 13 | set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) 14 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${DOXYGEN_OUT} @ONLY) 15 | 16 | add_custom_target( 17 | doc 18 | COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT} 19 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 20 | COMMENT "Generating doxygen documentation" 21 | VERBATIM) 22 | endif() 23 | endif() 24 | -------------------------------------------------------------------------------- /include/domain_wall_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quda 4 | { 5 | 6 | enum class 
Dslash5Type { 7 | DSLASH5_DWF, 8 | DSLASH5_MOBIUS_PRE, 9 | DSLASH5_MOBIUS, 10 | M5_INV_DWF, 11 | M5_INV_MOBIUS, 12 | M5_INV_MOBIUS_M5_PRE, // M5inv + M5pre 13 | M5_PRE_MOBIUS_M5_INV, // M5pre + M5inv 14 | M5_INV_MOBIUS_M5_INV_DAG, // M5pre + M5inv 15 | DSLASH5_MOBIUS_PRE_M5_MOB, 16 | M5_INV_ZMOBIUS, 17 | M5_EOFA, 18 | M5INV_EOFA 19 | }; 20 | 21 | /** 22 | Applying the following five kernels in the order of 4-0-1-2-3 is equivalent to applying 23 | the full even-odd preconditioned symmetric MdagM operator: 24 | op = (1 - M5inv * D4 * D5pre * M5inv * D4 * D5pre)^dag 25 | * (1 - M5inv * D4 * D5pre * M5inv * D4 * D5pre) 26 | */ 27 | enum class MdwfFusedDslashType { 28 | D4_D5INV_D5PRE, 29 | D4_D5INV_D5INVDAG, 30 | D4DAG_D5PREDAG_D5INVDAG, 31 | D4DAG_D5PREDAG, 32 | D5PRE, 33 | }; 34 | 35 | } // namespace quda 36 | -------------------------------------------------------------------------------- /include/eigen_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef OPENBLAS_LIB 4 | #define EIGEN_USE_LAPACKE 5 | #define EIGEN_USE_BLAS 6 | #endif 7 | 8 | #include 9 | 10 | // hide annoying warning 11 | #if !defined(__clang__) && !defined(_NVHPC_CUDA) 12 | #pragma GCC diagnostic push 13 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 14 | #endif 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #if !defined(__clang__) && !defined(_NVHPC_CUDA) 21 | #pragma GCC diagnostic pop 22 | #endif 23 | 24 | using namespace Eigen; 25 | -------------------------------------------------------------------------------- /include/externals/.clang-format: -------------------------------------------------------------------------------- 1 | DisableFormat: true 2 | SortIncludes: Never 3 | 4 | -------------------------------------------------------------------------------- /include/gauge_update_quda.h: -------------------------------------------------------------------------------- 1 | #ifndef _GAUGE_UPDATE_QUDA_H_ 2 | 
#define _GAUGE_UPDATE_QUDA_H_ 3 | 4 | namespace quda { 5 | 6 | /** 7 | Evolve the gauge field by step size dt using the momentuim field 8 | @param out Updated gauge field 9 | @param dt Step size 10 | @param in Input gauge field 11 | @param mom Momentum field 12 | @param conj_mom Whether we conjugate the momentum in the exponential 13 | @param exact Calculate exact exponential or use an expansion 14 | */ 15 | void updateGaugeField(GaugeField &out, double dt, const GaugeField& in, 16 | const GaugeField& mom, bool conj_mom, bool exact); 17 | 18 | } // namespace quda 19 | 20 | #endif // _GAUGE_UPDATE_QUDA_H_ 21 | -------------------------------------------------------------------------------- /include/hw_quda.h: -------------------------------------------------------------------------------- 1 | #ifndef _HW_QUDA_H 2 | #define _HW_QUDA_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | FullHw createHwQuda(int* X, QudaPrecision precision); 12 | void loadHwToGPU(FullHw ret, void* hw, QudaPrecision cpu_prec); 13 | void freeHwQuda(FullHw hw); 14 | 15 | #ifdef __cplusplus 16 | } 17 | #endif 18 | 19 | #endif // _HW_QUDA_H 20 | -------------------------------------------------------------------------------- /include/int_factor_array.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace quda 6 | { 7 | 8 | /** 9 | * @brief compute number of factors of an integer 10 | * 11 | */ 12 | template constexpr unsigned int numFactors() noexcept 13 | { 14 | unsigned int i = 0; 15 | for (unsigned int j = 1u; j <= Int; j++) { 16 | if (Int % j == 0) { i++; } 17 | } 18 | return i; 19 | } 20 | 21 | /** 22 | * @brief A struct containing a compile time generated array 23 | * containing factors of an integer. 
24 | */ 25 | template struct IntFactorArray { 26 | 27 | array()> data_; 28 | 29 | constexpr IntFactorArray() : data_() 30 | { 31 | static_assert(Int > 0, "Int has to be > 0"); 32 | for (unsigned int i = 0, j = 1; j <= Int; j++) { 33 | if (Int % j == 0) { 34 | data_[i] = j; 35 | i++; 36 | } 37 | } 38 | } 39 | 40 | /** 41 | * @brief returns the size of the array 42 | */ 43 | constexpr unsigned int size() const noexcept { return numFactors(); } 44 | 45 | /** 46 | * @brief read only constant index operator[] 47 | * @param i the index to look up 48 | */ 49 | constexpr unsigned int operator[](int i) const noexcept { return Multiple * data_[i]; } 50 | 51 | constexpr unsigned int get_index(unsigned int value) const noexcept 52 | { 53 | unsigned int i = 0; 54 | for (; i < numFactors(); i++) { 55 | if (Multiple * data_[i] == static_cast(value)) { return i; } 56 | } 57 | return i; 58 | } 59 | 60 | }; // end struct 61 | 62 | } // namespace quda 63 | -------------------------------------------------------------------------------- /include/int_list.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quda 4 | { 5 | 6 | /** 7 | @brief This is a dummy struct that wraps around a list of integers 8 | */ 9 | template struct IntList { 10 | }; 11 | 12 | } // namespace quda 13 | -------------------------------------------------------------------------------- /include/json_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "externals/json.hpp" 4 | using json = nlohmann::json; 5 | 6 | void to_json(json &j, const dim3 &p) { j = json {{"x", p.x}, {"y", p.y}, {"z", p.z}}; } 7 | 8 | void from_json(const json &j, dim3 &p) 9 | { 10 | j.at("x").get_to(p.x); 11 | j.at("y").get_to(p.y); 12 | j.at("z").get_to(p.z); 13 | } 14 | 15 | void to_json(json &j, const int4 &p) { j = json {{"x", p.x}, {"y", p.y}, {"z", p.z}, {"w", p.w}}; } 16 | 17 | void 
from_json(const json &j, int4 &p) 18 | { 19 | j.at("x").get_to(p.x); 20 | j.at("y").get_to(p.y); 21 | j.at("z").get_to(p.z); 22 | j.at("w").get_to(p.w); 23 | } 24 | -------------------------------------------------------------------------------- /include/kernels/copy_color_spinor_mg.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace quda { 7 | 8 | using namespace colorspinor; 9 | 10 | template 11 | struct CopyArg : kernel_param<> { 12 | static constexpr int nSpin = nSpin_; 13 | static constexpr int nColor = nColor_; 14 | OutOrder out; 15 | const InOrder in; 16 | 17 | template 18 | CopyArg(ColorSpinorField &out, const ColorSpinorField &in, T1 *Out, T2 *In) : 19 | kernel_param(dim3(in.VolumeCB(), nSpin, nColor)), out(out, 1, Out), in(in, 1, In) 20 | {} 21 | }; 22 | 23 | template struct CopySpinor_ { 24 | const Arg &arg; 25 | constexpr CopySpinor_(const Arg &arg) : arg(arg) {} 26 | static constexpr const char *filename() { return KERNEL_FILE; } 27 | 28 | __device__ __host__ inline void operator()(int x_cb, int s, int c) 29 | { 30 | arg.out(0, x_cb, s, c) = arg.in(0, x_cb, s, c); 31 | } 32 | }; 33 | 34 | } 35 | -------------------------------------------------------------------------------- /include/kernels/device_vector_axpby.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | /** 4 | @file The following contains the argument and kernel for applying axpby to device vectors. 
5 | */ 6 | 7 | namespace quda 8 | { 9 | 10 | template struct AxpbyArg : kernel_param<> { 11 | T *out; 12 | T a; 13 | const T *x; 14 | T b; 15 | const T *y; 16 | 17 | AxpbyArg(T *out, T a, const T *x, T b, const T *y, int size) : kernel_param(size), out(out), a(a), x(x), b(b), y(y) 18 | { 19 | } 20 | }; 21 | 22 | template struct Axpby { 23 | const Arg &arg; 24 | constexpr Axpby(const Arg &arg) : arg(arg) { } 25 | static constexpr const char *filename() { return KERNEL_FILE; } 26 | 27 | __device__ __host__ inline void operator()(int thread_idx) 28 | { 29 | arg.out[thread_idx] += arg.a * arg.x[thread_idx] + arg.b * arg.y[thread_idx]; 30 | } 31 | }; 32 | 33 | } // namespace quda 34 | -------------------------------------------------------------------------------- /include/kernels/random_init.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace quda { 10 | 11 | struct rngArg : kernel_param<> { 12 | int commCoord[QUDA_MAX_DIM]; 13 | int X[QUDA_MAX_DIM]; 14 | uint64_t X_global[QUDA_MAX_DIM]; 15 | RNGState *state; 16 | unsigned long long seed; 17 | rngArg(RNGState *state, unsigned long long seed, const LatticeField &meta) : 18 | kernel_param(dim3(meta.LocalVolumeCB(), meta.SiteSubset(), 1)), 19 | state(state), 20 | seed(seed) 21 | { 22 | for (int i=0; i<4; i++) { 23 | commCoord[i] = comm_coord(i); 24 | X[i] = meta.LocalX()[i]; 25 | X_global[i] = X[i] * comm_dim(i); 26 | } 27 | } 28 | }; 29 | 30 | /** 31 | @brief functor to initialize the RNG states 32 | @param state RNG state array 33 | @param seed initial seed for RNG 34 | @param size size of the RNG state array 35 | @param arg Metadata needed for computing multi-gpu offsets 36 | */ 37 | template 38 | struct init_random { 39 | const Arg &arg; 40 | __device__ constexpr init_random(const Arg &arg) : arg(arg) {} 41 | static constexpr const char *filename() { return KERNEL_FILE; } 42 | 43 | 
__device__ inline void operator()(int id, int parity) 44 | { 45 | // Each thread gets same seed, a different sequence number, no offset 46 | int x[4]; 47 | getCoords(x, id, arg.X, parity); 48 | for (int i = 0; i < 4; i++) x[i] += arg.commCoord[i] * arg.X[i]; 49 | auto idd = (((x[3] * arg.X_global[2] + x[2]) * arg.X_global[1]) + x[1]) * arg.X_global[0] + x[0]; 50 | random_init(arg.seed, idd, 0, arg.state[parity * arg.threads.x + id]); 51 | } 52 | }; 53 | 54 | } 55 | -------------------------------------------------------------------------------- /include/kernels/reduce_init.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace quda { 5 | 6 | namespace reducer { 7 | 8 | template struct init_arg : kernel_param<> { 9 | using T = T_; 10 | T *count; 11 | init_arg(T *count, int n_reduce) : 12 | kernel_param(dim3(n_reduce, 1, 1)), 13 | count(count) { } 14 | }; 15 | 16 | template struct init_count { 17 | const Arg &arg; 18 | static constexpr const char *filename() { return KERNEL_FILE; } 19 | constexpr init_count(const Arg &arg) : arg(arg) {} 20 | __device__ void operator()(int i) { new (arg.count + i) typename Arg::T {0}; } 21 | }; 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /include/kernels/spin_duplicate.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace quda { 5 | 6 | using namespace colorspinor; 7 | 8 | template 9 | struct SpinorDuplicateArg : kernel_param<> { 10 | using real = typename mapper::type; 11 | static constexpr int nSpin = nSpin_; 12 | static constexpr int nColor = nColor_; 13 | using V = typename colorspinor_mapper::type; 14 | V v[nSpin]; 15 | V src; 16 | 17 | /** 18 | @brief Constructor for the duplication arg 19 | @param v The spin duplicated set 20 | @param src The source vector we are duplicating 21 | */ 22 | SpinorDuplicateArg(cvector_ref 
&v, const ColorSpinorField &src) : 23 | kernel_param(dim3(src.VolumeCB(), src.SiteSubset(), 1)), 24 | src(src) 25 | { 26 | for (auto i = 0u; i < v.size(); i++) this->v[i] = V(v[i]); 27 | } 28 | }; 29 | 30 | /** 31 | Functor for spin duplicating the src vector 32 | */ 33 | template struct DuplicateSpinor { 34 | const Arg &arg; 35 | constexpr DuplicateSpinor(const Arg &arg) : arg(arg) {} 36 | static constexpr const char* filename() { return KERNEL_FILE; } 37 | 38 | __device__ __host__ void operator()(int x_cb, int parity) 39 | { 40 | using vector = ColorSpinor; 41 | vector src = arg.src(x_cb, parity); 42 | 43 | for (int i = 0; i < Arg::nSpin; i++) { 44 | vector v; 45 | 46 | for (int s = 0; s < Arg::nSpin; s++) { 47 | for (int c = 0; c < Arg::nColor; c++) { 48 | v(s, c) = src(i, c); 49 | } 50 | } 51 | 52 | arg.v[i](x_cb, parity) = v; 53 | } 54 | } 55 | 56 | }; 57 | 58 | } 59 | -------------------------------------------------------------------------------- /include/kernels/spinor_reweight.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace quda 7 | { 8 | 9 | using namespace colorspinor; 10 | 11 | template struct SpinorDistanceReweightArg : kernel_param<> { 12 | using real = typename mapper::type; 13 | static constexpr int nSpin = nSpin_; 14 | static constexpr int nColor = nColor_; 15 | using V = typename colorspinor_mapper::type; 16 | 17 | int X[4]; 18 | V v; 19 | real alpha0; 20 | int t0; 21 | SpinorDistanceReweightArg(ColorSpinorField &v, real alpha0, int t0) : 22 | kernel_param(dim3(v.VolumeCB(), v.SiteSubset(), 1)), v(v), alpha0(alpha0), t0(t0) 23 | { 24 | for (int dir = 0; dir < 4; dir++) X[dir] = v.X()[dir]; 25 | X[0] *= (v.SiteSubset() == 1) ? 
2 : 1; // need full lattice dims 26 | } 27 | }; 28 | 29 | template __device__ __host__ inline auto distanceWeight(const Arg &arg, int t, int nt) 30 | { 31 | using real = typename Arg::real; 32 | if (arg.alpha0 > 0) { 33 | return cosh(arg.alpha0 * real((t - arg.t0 + nt) % nt - nt / 2)); 34 | } else { 35 | return 1 / cosh(arg.alpha0 * real((t - arg.t0 + nt) % nt - nt / 2)); 36 | } 37 | } 38 | 39 | template struct DistanceReweightSpinor { 40 | const Arg &arg; 41 | constexpr DistanceReweightSpinor(const Arg &arg) : arg(arg) { } 42 | static constexpr const char *filename() { return KERNEL_FILE; } 43 | 44 | __device__ __host__ void operator()(int x_cb, int parity) 45 | { 46 | using Vector = ColorSpinor; 47 | int x[4]; 48 | getCoords(x, x_cb, arg.X, parity); 49 | Vector tmp = arg.v(x_cb, parity); 50 | tmp *= distanceWeight(arg, arg.comms_coord[3] * arg.X[3] + x[3], arg.comms_dim[3] * arg.X[3]); 51 | arg.v(x_cb, parity) = tmp; 52 | } 53 | }; 54 | 55 | } // namespace quda 56 | -------------------------------------------------------------------------------- /include/kernels/transform_reduce.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace quda { 8 | 9 | template 10 | struct TransformReduceArg : public ReduceArg { 11 | using reducer = reducer_; 12 | using reduce_t = typename reducer::reduce_t; 13 | static constexpr int n_batch_max = 8; 14 | const T *v[n_batch_max]; 15 | count_t n_items; 16 | int n_batch; 17 | transformer h; 18 | mapper m; 19 | 20 | TransformReduceArg(const std::vector &v, count_t n_items, transformer h, mapper m) : 21 | ReduceArg(dim3(n_items, 1, v.size()), v.size()), 22 | n_items(n_items), 23 | n_batch(v.size()), 24 | h(h), 25 | m(m) 26 | { 27 | if (n_batch > n_batch_max) errorQuda("Requested batch %d greater than max supported %d", n_batch, n_batch_max); 28 | if (n_items > std::numeric_limits::max()) 29 | errorQuda("Requested size %lu greater 
than max supported %lu", 30 | (uint64_t)n_items, (uint64_t)std::numeric_limits::max()); 31 | std::copy(v.begin(), v.end(), this->v); 32 | } 33 | }; 34 | 35 | template struct transform_reducer : Arg::reducer { 36 | using reduce_t = typename Arg::reduce_t; 37 | using Arg::reducer::operator(); 38 | static constexpr int reduce_block_dim = 1; 39 | using count_t = decltype(Arg::n_items); 40 | 41 | const Arg &arg; 42 | static constexpr const char *filename() { return KERNEL_FILE; } 43 | constexpr transform_reducer(const Arg &arg) : arg(arg) {} 44 | 45 | __device__ __host__ inline reduce_t operator()(reduce_t &value, count_t i, int, int j) 46 | { 47 | auto k = arg.m(i); 48 | auto v = arg.v[j]; 49 | auto t = arg.h(v[k]); 50 | return operator()(t, value); 51 | } 52 | }; 53 | 54 | } 55 | -------------------------------------------------------------------------------- /include/ks_force_quda.h: -------------------------------------------------------------------------------- 1 | #ifndef __KS_FORCE_QUDA_H__ 2 | #define __KS_FORCE_QUDA_H__ 3 | 4 | #include 5 | 6 | 7 | namespace quda { 8 | 9 | void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops = NULL); 10 | 11 | } // namespace quda 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /include/ks_qsmear.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace quda { 7 | 8 | /** 9 | @brief Compute the 2-link field for the smearing operation 10 | @param[out] newTwoLink The computed 2-link output 11 | @param[in] link Thin-link gauge field 12 | */ 13 | void computeTwoLink(GaugeField &newTwoLink, const GaugeField &link); 14 | 15 | 16 | } // namespace quda 17 | -------------------------------------------------------------------------------- /include/layout_hyper.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // for QIO_HAS_EXTENDED_LAYOUT, QIO_Index 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | /* These routines get a quda_* prefix to avoid 10 | potential linker conflicts, with MILC */ 11 | int quda_setup_layout(int len[], int nd, int numnodes, int single_parity); 12 | extern int quda_this_node; 13 | 14 | #ifdef QIO_HAS_EXTENDED_LAYOUT 15 | int quda_node_number_ext(const int x[], void *arg); 16 | QIO_Index quda_node_index_ext(const int x[], void *arg); 17 | void quda_get_coords_ext(int x[], int node, QIO_Index index, void *arg); 18 | QIO_Index quda_num_sites_ext(int node, void *arg); 19 | #else 20 | int quda_node_number(const int x[]); 21 | int quda_node_index(const int x[]); 22 | void quda_get_coords(int x[], int node, int index); 23 | int quda_num_sites(int node); 24 | #endif 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | -------------------------------------------------------------------------------- /include/llfat_quda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "quda.h" 4 | #include "quda_internal.h" 5 | 6 | namespace quda { 7 | 8 | /** 9 | @brief Compute the fat links for an improved staggered (Kogut-Susskind) fermions. 10 | @param fat[out] The computed fat link 11 | @param u[in] The input gauge field 12 | @param coeff[in] Array of path coefficients 13 | */ 14 | void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff); 15 | 16 | /** 17 | @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions. 
18 | @param lng[out] The computed long link (only computed if lng!=0) 19 | @param u[in] The input gauge field 20 | @param coeff[in] Array of path coefficients 21 | */ 22 | void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff); 23 | 24 | } // namespace quda 25 | -------------------------------------------------------------------------------- /include/madwf_param.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quda 4 | { 5 | /** 6 | @brief Parameter structure for holding the various MADWF parameters. 7 | */ 8 | struct MadwfParam { 9 | 10 | /** The diagonal constant to suppress the low modes when performing 5D transfer */ 11 | double madwf_diagonal_suppressor; 12 | 13 | /** The target MADWF Ls to be used in the accelerator */ 14 | int madwf_ls; 15 | 16 | /** The minimum number of iterations after which to generate the null vectors for MADWF */ 17 | int madwf_null_miniter; 18 | 19 | /** The maximum tolerance after which to generate the null vectors for MADWF */ 20 | double madwf_null_tol; 21 | 22 | /** The maximum number of iterations for the training iterations */ 23 | int madwf_train_maxiter; 24 | 25 | /** Whether to load the MADWF parameters from the file system */ 26 | bool madwf_param_load; 27 | 28 | /** Whether to save the MADWF parameters to the file system */ 29 | bool madwf_param_save; 30 | 31 | /** Path to load from the file system */ 32 | std::string madwf_param_infile; 33 | 34 | /** Path to save to the file system */ 35 | std::string madwf_param_outfile; 36 | }; 37 | 38 | } // namespace quda 39 | -------------------------------------------------------------------------------- /include/matrix_field.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /** 4 | * @field matrix_accessor.h 5 | * @brief Simple accessor used for matrix fields, e.g., each lattice 6 | * site consists of an n x n matrix 7 | */ 8 | 9 | 
#include 10 | #include 11 | 12 | namespace quda 13 | { 14 | 15 | template struct matrix_field { 16 | T *field; 17 | int volume_cb; 18 | 19 | matrix_field(T *field, int volume_cb) : field(field), volume_cb(volume_cb) {} 20 | 21 | __device__ __host__ inline void load(Matrix &A, int x_cb, int parity) const 22 | { 23 | int idx = parity * volume_cb + x_cb; 24 | block_load(A, reinterpret_cast *>(field) + idx); 25 | } 26 | 27 | __device__ __host__ inline void save(const Matrix &A, int x_cb, int parity) const 28 | { 29 | int idx = parity * volume_cb + x_cb; 30 | block_store(reinterpret_cast *>(field) + idx, A); 31 | } 32 | }; 33 | 34 | } // namespace quda 35 | -------------------------------------------------------------------------------- /include/momentum.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace quda { 5 | 6 | /** 7 | @brief Compute and return global the momentum action 1/2 mom^2 8 | @param mom Momentum field 9 | @return Momentum action contribution 10 | */ 11 | double computeMomAction(const GaugeField &mom); 12 | 13 | /** 14 | Update the momentum field from the force field 15 | 16 | mom = mom - coeff * [force]_TA 17 | 18 | where [A]_TA means the traceless anti-hermitian projection of A 19 | 20 | @param mom Momentum field 21 | @param coeff Integration stepsize 22 | @param force Force field 23 | @param func The function calling this (fname will be printed if force monitoring is enabled) 24 | */ 25 | void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname); 26 | 27 | /** 28 | Left multiply the force field by the gauge field 29 | 30 | force = U * force 31 | 32 | @param force Force field 33 | @param U Gauge field 34 | */ 35 | void applyU(GaugeField &force, GaugeField &U); 36 | 37 | /** 38 | @brief Whether we are monitoring the force or not 39 | @return Boolean whether we are monitoring the force 40 | */ 41 | bool forceMonitor(); 42 | 43 | /** 44 | @brief 
Flush any outstanding force monitoring information 45 | */ 46 | void flushForceMonitor(); 47 | 48 | } // namespace quda 49 | -------------------------------------------------------------------------------- /include/monitor.h: -------------------------------------------------------------------------------- 1 | #include "device.h" 2 | 3 | namespace quda 4 | { 5 | 6 | namespace monitor 7 | { 8 | 9 | /** 10 | @brief Initialize device monitoring if supported. On CUDA this 11 | uses NVML-based monitoring. 12 | */ 13 | void init(); 14 | 15 | /** 16 | @brief Tear down any state associated with device monitoring 17 | */ 18 | void destroy(); 19 | 20 | /** 21 | @brief Serlialize the monitor state history to disk. If 22 | QUDA_RESOURCE_PATH is not defined then no action is taken 23 | */ 24 | void serialize(); 25 | 26 | /** 27 | @brief Get the current size of the monitor state. Used for 28 | bookending a period for later analysis. 29 | */ 30 | size_t size(); 31 | 32 | struct state_t { 33 | double energy = 0.0; 34 | double power = 0.0; 35 | double temp = 0.0; 36 | double clock = 0.0; 37 | }; 38 | 39 | /** 40 | @brief Get the mean state observables between start and end, where 41 | start and end are two intervals of history in the state. 
42 | */ 43 | state_t mean(size_t start, size_t end); 44 | 45 | } // namespace monitor 46 | 47 | } // namespace quda 48 | -------------------------------------------------------------------------------- /include/mpi_comm_handle.h: -------------------------------------------------------------------------------- 1 | #ifndef _COMM_HANDLE_H 2 | #define _COMM_HANDLE_H 3 | 4 | #if defined(QMP_COMMS) || defined(MPI_COMMS) 5 | #include 6 | namespace quda { 7 | MPI_Comm get_mpi_handle(); 8 | } 9 | #endif 10 | 11 | #ifdef QMP_COMMS 12 | #include 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | QMP_status_t QMP_get_mpi_comm(QMP_comm_t comm, void **mpicomm); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | #endif /* _COMM_HANDLE_H */ 27 | -------------------------------------------------------------------------------- /include/multigrid_helper.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quda { 4 | 5 | /** 6 | Helper struct for dealing with spin coarsening. This helper 7 | should work with all types of fermions. 8 | */ 9 | template 10 | struct spin_mapper { 11 | // fineSpin == 1, coarseSpin == 2 identifies staggered fine -> coarse w/ spin. 12 | static constexpr int spin_block_size = (fineSpin == 1 && coarseSpin == 2) ? 0 : fineSpin / coarseSpin; 13 | 14 | static constexpr int get_spin_block_factor() { return (spin_block_size == 0) ? 1 : spin_block_size; } 15 | 16 | /** 17 | Return the coarse spin coordinate from the fine spin coordinate 18 | @param s Fine spin coordinate 19 | @param parity fine parity, for staggered 20 | @return Coarse spin coordinate 21 | */ 22 | constexpr int operator()(int s, int parity) const 23 | { 24 | return (spin_block_size == 0) ? 
parity : s / spin_block_size; 25 | } 26 | }; 27 | 28 | } 29 | -------------------------------------------------------------------------------- /include/numa_affinity.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | /** 5 | * sets the cpu affinity of the calling process to the affinity mask reported by nvidia-smi topo 6 | * Note that older driver versions might pin all mpi ranks to the same single conre instead of a range 7 | * @param deviceid gpu to determine affinity for 8 | * @return 0 if numa affinity was set 9 | */ 10 | int setNumaAffinityNVML(int deviceid); 11 | -------------------------------------------------------------------------------- /include/object.h: -------------------------------------------------------------------------------- 1 | /** 2 | @file object.h 3 | 4 | @section DESCRIPTION 5 | 6 | Abstract parent class for all classes in QUDA. This parent class 7 | defines the new/delete methods to use QUDA's memory allocators. 8 | This gives us memory leak checking on these object instances. 
9 | */ 10 | 11 | #pragma once 12 | 13 | #include 14 | 15 | namespace quda { 16 | 17 | struct Object { 18 | 19 | Object() { } 20 | virtual ~Object() { } 21 | 22 | void *operator new(std::size_t size) { return safe_malloc(size); } 23 | 24 | void operator delete(void *p) { host_free(p); } 25 | 26 | void *operator new[](std::size_t size) { return safe_malloc(size); } 27 | 28 | void operator delete[](void *p) { host_free(p); } 29 | }; 30 | 31 | } // namespace quda 32 | -------------------------------------------------------------------------------- /include/power_of_two_array.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace quda 6 | { 7 | 8 | /** 9 | * @brief compute number of elements of an array containing powers 10 | * of 2 starting at a minumum up to and including a maximum 11 | * 12 | */ 13 | template constexpr unsigned int numElements() noexcept 14 | { 15 | unsigned int i = 0; 16 | for (auto j = Min; j <= Max; j *= 2) i++; 17 | return i; 18 | } 19 | 20 | /** 21 | * @brief A struct containing a compile time generated array 22 | * containing powers of 2 starting at Min up to and includeing Max 23 | * with thanks to StackOverflow: 24 | * https://stackoverflow.com/questions/19019252/create-n-element-constexpr-array-in-c11 25 | */ 26 | template struct PowerOfTwoArray { 27 | 28 | array()> data_; 29 | 30 | constexpr PowerOfTwoArray() : data_() 31 | { 32 | static_assert(Min <= Max, "Min has to be <= Max"); 33 | for (unsigned int i = 0, j = Min; j <= Max; j *= 2, i++) data_[i] = j; 34 | } 35 | 36 | /** 37 | * @brief returns the size of the array 38 | */ 39 | constexpr unsigned int size() const noexcept { return numElements(); } 40 | 41 | /** 42 | * @brief read only constant index operator[] 43 | * @param i the index to look up 44 | */ 45 | constexpr unsigned int operator[](int i) const noexcept { return data_[i]; } 46 | 47 | }; // end struct 48 | 49 | } // namespace quda 50 | 
-------------------------------------------------------------------------------- /include/qio_field.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef HAVE_QIO 4 | void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, 5 | int argc, char *argv[]); 6 | void write_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[]); 7 | void read_spinor_field(const char *filename, void *V[], QudaPrecision precision, const int *X, QudaSiteSubset subset, 8 | QudaParity parity, int nColor, int nSpin, int Nvec, int argc, char *argv[]); 9 | void write_spinor_field(const char *filename, const void *V[], QudaPrecision precision, const int *X, 10 | QudaSiteSubset subset, QudaParity parity, int nColor, int nSpin, int Nvec, int argc, 11 | char *argv[], bool partfile = false); 12 | #else 13 | inline void read_gauge_field(const char *, void *[], QudaPrecision, const int *, int, char *[]) 14 | { 15 | printf("QIO support has not been enabled\n"); 16 | exit(-1); 17 | } 18 | inline void write_gauge_field(const char *, void *[], QudaPrecision, const int *, int, char *[]) 19 | { 20 | printf("QIO support has not been enabled\n"); 21 | exit(-1); 22 | } 23 | inline void read_spinor_field(const char *, void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity, int, int, 24 | int, int, char *[]) 25 | { 26 | printf("QIO support has not been enabled\n"); 27 | exit(-1); 28 | } 29 | inline void write_spinor_field(const char *, const void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity, 30 | int, int, int, int, char *[], bool) 31 | { 32 | printf("QIO support has not been enabled\n"); 33 | exit(-1); 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/quda_arch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 
3 | 4 | #if defined(QUDA_TARGET_CUDA) 5 | #include 6 | #include 7 | 8 | #if (__COMPUTE_CAPABILITY__ >= 700) && defined(QUDA_ENABLE_MMA) 9 | #define QUDA_MMA_AVAILABLE 10 | #endif 11 | 12 | #elif defined(QUDA_TARGET_HIP) 13 | #include 14 | 15 | #elif defined(QUDA_TARGET_SYCL) 16 | #include 17 | #endif 18 | 19 | #ifdef QUDA_OPENMP 20 | #include 21 | #endif 22 | -------------------------------------------------------------------------------- /include/quda_constants.h: -------------------------------------------------------------------------------- 1 | #define QUDA_VERSION_MAJOR 1 2 | #define QUDA_VERSION_MINOR 1 3 | #define QUDA_VERSION_SUBMINOR 0 4 | 5 | /** 6 | * @def QUDA_VERSION 7 | * @brief This macro is deprecated. Use QUDA_VERSION_MAJOR, etc., instead. 8 | */ 9 | #define QUDA_VERSION ((QUDA_VERSION_MAJOR<<16) | (QUDA_VERSION_MINOR<<8) | QUDA_VERSION_SUBMINOR) 10 | 11 | 12 | /** 13 | * @def QUDA_MAX_DIM 14 | * @brief Maximum number of dimensions supported by QUDA. In practice, no 15 | * routines make use of more than 5. 16 | */ 17 | #define QUDA_MAX_DIM 6 18 | 19 | /** 20 | * @def QUDA_MAX_GEOMETRY 21 | * @brief Maximum geometry supported by a field. This essentially is 22 | * the maximum number of dimensions supported per lattice site. 23 | */ 24 | #define QUDA_MAX_GEOMETRY 8 25 | 26 | /** 27 | * @def QUDA_MAX_MULTI_SHIFT 28 | * @brief Maximum number of shifts supported by the multi-shift solver. 29 | * This number may be changed if need be. 30 | */ 31 | #define QUDA_MAX_MULTI_SHIFT 32 32 | 33 | /** 34 | * @def QUDA_MAX_BLOCK_SRC 35 | * @brief Maximum number of sources that can be supported by the multi-src solver 36 | */ 37 | #define QUDA_MAX_MULTI_SRC 128 38 | 39 | /** 40 | * @def QUDA_MAX_DWF_LS 41 | * @brief Maximum length of the Ls dimension for domain-wall fermions 42 | */ 43 | #define QUDA_MAX_DWF_LS 32 44 | 45 | /** 46 | * @def QUDA_MAX_MG_LEVEL 47 | * @brief Maximum number of multi-grid levels. This number may be 48 | * increased if needed. 
49 | */ 50 | #define QUDA_MAX_MG_LEVEL 5 51 | -------------------------------------------------------------------------------- /include/random_quda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace quda { 8 | 9 | // The nature of the state is defined in the target-specific implementation 10 | struct RNGState; 11 | 12 | /** 13 | @brief Class declaration to initialize and hold RNG states 14 | */ 15 | class RNG 16 | { 17 | 18 | bool is_initialized = false; /*! @brief whether or not the RNG is initialized */ 19 | size_t size; /*! @brief number of rand states */ 20 | std::shared_ptr state; /*! array with current rand rng state */ 21 | RNGState *backup_state; /*! array for backup of current rand rng state */ 22 | unsigned long long seed; /*! initial rng seed */ 23 | 24 | public: 25 | /*! @brief Default constructor */ 26 | RNG() = default; 27 | 28 | /** 29 | @brief Allocate and initialize RNG states. Constructor that 30 | takes its metadata from pre-existing field 31 | @param[in] meta The field whose data we use 32 | @param[in] seed Seed to initialize the RNG 33 | */ 34 | RNG(const LatticeField &meta, unsigned long long seedin); 35 | 36 | unsigned long long Seed() { return seed; }; 37 | 38 | /*! @brief Check if the RNG is initialized */ 39 | bool isInitialized() { return is_initialized; }; 40 | 41 | /*! @brief Restore rng array states initialization */ 42 | void restore(); 43 | 44 | /*! @brief Backup rng array states initialization */ 45 | void backup(); 46 | 47 | /*! 
@brief Get pointer to RNGState */ 48 | RNGState *State(); 49 | }; 50 | } 51 | -------------------------------------------------------------------------------- /include/shmem_helper.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /** 4 | @file shmem_helper.cuh 5 | 6 | @section Description 7 | Include this file as opposed to nvshmem headers directly to ensure 8 | correct compilation with NVSHMEM 9 | */ 10 | 11 | #if defined(NVSHMEM_COMMS) 12 | #include 13 | #include 14 | #include 15 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) || (defined(__clang__) && defined(__CUDA__)) 16 | // only include if using a CUDA compiler 17 | #include 18 | #endif 19 | #endif 20 | -------------------------------------------------------------------------------- /include/spin_taste.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace quda 5 | { 6 | 7 | /**
      @brief Apply a spin-taste gamma structure to a staggered quark field.

      NOTE(review): the doxygen previously attached to this declaration was
      copied verbatim from staggered_oprod.h and documented parameters
      (coeff, nFace) that this function does not take; rewritten to match
      the actual signature.

      @param[out] out Resulting quark field
      @param[in] in Input quark field
      @param[in] gamma The spin-taste gamma structure to apply (presumably
      selects which gamma matrix / taste phase is used — confirm against the
      implementation)
   */ 23 | void applySpinTaste(ColorSpinorField &out, const ColorSpinorField &in, QudaSpinTasteGamma gamma); 24 | 25 | } // namespace quda 26 | -------------------------------------------------------------------------------- /include/staggered_oprod.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace quda { 6 | 7 | /** 8 | @brief Compute the outer-product field between the staggered quark 9 | field's one and (for HISQ and ASQTAD) three hop sites. E.g., 10 | 11 | out[0][d](x) = (in(x+1_d) x conj(in(x))) 12 | out[1][d](x) = (in(x+3_d) x conj(in(x))) 13 | 14 | where 1_d and 3_d represent a relative shift of magnitude 1 and 3 in dimension d, respectively 15 | 16 | Note out[1] is only computed if nFace=3 17 | 18 | @param[out] out Array of nFace outer-product matrix fields 19 | @param[in] in Input quark field 20 | @param[in] coeff Coefficient 21 | @param[in] nFace Number of faces (1 or 3) 22 | */ 23 | void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace); 24 | 25 | } // namespace quda 26 | -------------------------------------------------------------------------------- /include/targets/cuda/constant_kernel_arg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | /** 8 | @file constant_kernel_arg.h 9 | 10 | This file should be included in the kernel files for which we wish 11 | to utilize 
__constant__ memory for the kernel parameter struct. 12 | This needs to be included before the definition of the kernel, 13 | e.g., kernel.h in order for the compiler to do the kernel 14 | instantiation correctly. 15 | */ 16 | 17 | #ifndef QUDA_LARGE_KERNEL_ARG 18 | 19 | // set a preprocessor flag that we have included constant_kernel_arg.h 20 | #define QUDA_USE_CONSTANT_MEMORY 21 | 22 | namespace quda 23 | { 24 | 25 | namespace device 26 | { 27 | 28 | /** 29 | @brief The __constant__ buffer used for kernel parameters 30 | NOTE(review): CUDA limits __constant__ memory to 64 KiB per module;
      presumably max_constant_size() respects this — confirm in the target
      header that defines it. 31 | #if defined(__CUDACC_RDC__) && !defined(QUDA_CONSTANT_DEFINE) 32 | // rdc is enabled when NVSHMEM is enabled, so we need to make the 33 | // buffer as extern and define it in one place only 34 | extern __constant__ char buffer[max_constant_size()]; 35 | #else 36 | __constant__ char buffer[max_constant_size()]; 37 | #endif 38 | 39 | /** 40 | @brief Helper function that returns kernel argument from 41 | __constant__ memory. Reinterprets the raw __constant__ byte buffer as
      the kernel's argument struct; enabled only when the struct fits in the
      buffer (see the enable_if guard). 42 | */ 43 | template constexpr std::enable_if_t(), Arg &> get_arg() 44 | { 45 | return reinterpret_cast(buffer) /* NOTE(review): template parameter lists were stripped by the extraction; the original casts buffer to Arg& */; 46 | } 47 | 48 | /** 49 | @brief Helper function that returns a pointer to the 50 | __constant__ memory buffer. 
51 | */ 52 | template constexpr std::enable_if_t(), void *> get_constant_buffer() 53 | { 54 | return qudaGetSymbolAddress(buffer); 55 | } 56 | 57 | } // namespace device 58 | 59 | } // namespace quda 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /include/targets/cuda/externals/generics/ldg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace detail { 5 | 6 | template::type, 8 | int r = aliased_size::value> 9 | struct load_storage { 10 | typedef array result_type; 11 | static const int idx = aliased_size::value - r; 12 | __device__ __forceinline__ 13 | static result_type impl(const T* ptr) { 14 | return result_type(__ldg(((const U*)ptr) + idx), 15 | load_storage::impl(ptr)); 16 | } 17 | }; 18 | 19 | template 20 | struct load_storage { 21 | typedef array result_type; 22 | static const int idx = aliased_size::value - 1; 23 | __device__ __forceinline__ 24 | static result_type impl(const T* ptr) { 25 | return result_type(__ldg(((const U*)ptr) + idx)); 26 | } 27 | }; 28 | 29 | } 30 | 31 | 32 | #if __CUDA_ARCH__ >= 350 33 | // Device has ldg 34 | template 35 | __device__ __forceinline__ T __ldg(const T* ptr) { 36 | typedef typename detail::working_array::type aliased; 37 | aliased storage = detail::load_storage::impl(ptr); 38 | return detail::fuse(storage); 39 | } 40 | 41 | #else 42 | //Device does not, fall back. 43 | template 44 | __device__ __forceinline__ T __ldg(const T* ptr) { 45 | return *ptr; 46 | } 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /include/targets/cuda/externals/trove/warp.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2013, NVIDIA Corporation 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #pragma once 29 | 30 | namespace trove { 31 | 32 | #define WARP_CONVERGED 0xffffffff 33 | 34 | __device__ 35 | inline bool warp_converged() { 36 | #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) 37 | return (__activemask() == WARP_CONVERGED); /* NOTE(review): under Volta+ independent thread scheduling, __activemask() only reports the lanes converged at this instant, which may under-report the intended participant set */ 38 | #else 39 | return (__ballot(true) == WARP_CONVERGED); 40 | #endif 41 | } 42 | 43 | #undef WARP_CONVERGED 44 | 45 | #define WARP_SIZE 32 46 | #define WARP_MASK 0x1f 47 | #define LOG_WARP_SIZE 5 48 | 49 | } 50 | -------------------------------------------------------------------------------- /include/targets/cuda/fast_intdiv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // declaration of class we wish to specialize 4 | template struct mul_hi; 5 | 6 | /* int specialization: returns the high 32 bits of the signed 32x32-bit product n*m via PTX mul.hi.s32 */ 7 | template <> struct mul_hi { 8 | __device__ __forceinline__ int operator()(const int n, const int m) 9 | { 10 | int q; 11 | asm("mul.hi.s32 %0, %1, %2;" : "=r"(q) : "r"(m), "r"(n)); 12 | return q; 13 | } 14 | }; 15 | 16 | #include "../generic/fast_intdiv.h" 17 | -------------------------------------------------------------------------------- /include/targets/cuda/jitify_options.hpp.in: -------------------------------------------------------------------------------- 1 | #define JITIFY_OPTIONS -I${CMAKE_BINARY_DIR}/lib \ 2 | -I${CMAKE_BINARY_DIR}/include \ 3 | -I${CMAKE_BINARY_DIR}/include/externals \ 4 | -I${CMAKE_BINARY_DIR}/include/targets/cuda \ 5 | -I${CMAKE_BINARY_DIR}/include/targets/cuda/externals \ 6 | -I${CUDAToolkit_INCLUDE_DIRS} 7 | -------------------------------------------------------------------------------- /include/targets/cuda/math_helper.h: -------------------------------------------------------------------------------- 1 | #if defined(__CUDACC__) 2 | 3 | #include 4 | 5 | #else 6 | 7 | #include "../generic/math_helper.h" 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /include/targets/cuda/pipeline.cuh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /** @file The wrapper here abstracts the cuda::pipeline, but _only_ when 4 | * we believe it gives the better performance. 5 | */ 6 | 7 | /* cuda::pipeline requires SM80+ and a sufficiently new toolkit, hence the guard */ 8 | #if (__COMPUTE_CAPABILITY__ >= 800) && (CUDA_VERSION >= 11080) 9 | #define QUDA_USE_CUDA_PIPELINE 10 | #include 11 | #endif 12 | 13 | namespace quda 14 | { 15 | 16 | #ifdef QUDA_USE_CUDA_PIPELINE 17 | /* thin wrapper over a thread-scoped cuda::pipeline */ 18 | struct pipeline_t { 19 | cuda::pipeline pipe; 20 | 21 | __device__ inline void producer_acquire() { pipe.producer_acquire(); } 22 | 23 | __device__ inline void producer_commit() { pipe.producer_commit(); } 24 | 25 | __device__ inline void consumer_wait() { pipe.consumer_wait(); } 26 | 27 | __device__ inline void consumer_release() { pipe.consumer_release(); } 28 | }; 29 | 30 | __device__ inline pipeline_t make_pipeline() 31 | { 32 | pipeline_t p = {cuda::make_pipeline()}; 33 | return p; 34 | } 35 | #else 36 | /* no-op stand-in so callers can use the same API unconditionally */ 37 | struct pipeline_t { 38 | __device__ inline void producer_acquire() { } 39 | 40 | __device__ inline void producer_commit() { } 41 | 42 | __device__ inline void consumer_wait() { } 43 | 44 | __device__ inline void consumer_release() { } 45 | }; 46 | 47 | __device__ inline pipeline_t make_pipeline() 48 | { 49 | pipeline_t p; 50 | return p; 51 | } 52 | #endif 53 | 54 | /* NOTE(review): the fallback path copies a single T and ignores `size`,
   while the pipeline path copies `size` bytes; this is only equivalent when
   callers pass size == sizeof(T) — confirm at the call sites */ 55 | template __device__ inline void memcpy_async(T *destination, T *source, size_t size, pipeline_t &pipe) 56 | { 57 | #ifdef QUDA_USE_CUDA_PIPELINE 58 | cuda::memcpy_async(destination, source, size, pipe.pipe); 59 | #else 60 | *destination = *source; 61 | #endif 62 | } 63 | 64 | } // namespace quda 65 | -------------------------------------------------------------------------------- /include/targets/cuda/quda_cuda_api.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | /** 7 | @file quda_cuda_api.h 8 | @brief Header file that declares some functions that will be called from within 
the CUDA target 9 | */ 10 | 11 | namespace quda 12 | { 13 | 14 | namespace target 15 | { 16 | 17 | namespace cuda 18 | { 19 | 20 | /** 21 | @brief Return CUDA stream from QUDA stream. This is only for 22 | use inside target/cuda. 23 | @param stream QUDA stream we wish to convert to CUDA stream 24 | @return CUDA stream 25 | */ 26 | cudaStream_t get_stream(const qudaStream_t &stream); 27 | 28 | /* record a CUDA runtime-API error for later reporting; allow_error suppresses the fatal path */ 29 | void set_runtime_error(cudaError_t error, const char *api_func, const char *func, const char *file, 30 | const char *line, bool allow_error = false); 31 | 32 | // defined in quda_api.cpp 33 | void set_driver_error(CUresult error, const char *api_func, const char *func, const char *file, const char *line, 34 | bool allow_error = false); 35 | 36 | } // namespace cuda 37 | } // namespace target 38 | 39 | } // namespace quda 40 | -------------------------------------------------------------------------------- /include/targets/cuda/quda_fp16.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace quda 6 | { 7 | 8 | /* element-wise absolute value of both packed fp16 halves; clang CUDA lacks
   __habs2, so there we clear the sign bit of each half by masking with
   0x7fff7fff (IEEE fp16 sign bit is the top bit of each 16-bit lane) */ 9 | __device__ inline half2 habs2(half2 input) { 10 | #if !(defined(__clang__) && defined(__CUDA__)) 11 | return __habs2(input); 12 | #else 13 | static constexpr uint32_t maximum_mask = 0x7fff7fffu; // 0111 1111 1111 1111 0111 1111 1111 1111 14 | 15 | uint32_t input_masked = *reinterpret_cast(&input) & maximum_mask; 16 | return *reinterpret_cast(&input_masked); 17 | #endif 18 | } 19 | 20 | } // namespace quda 21 | -------------------------------------------------------------------------------- /include/targets/cuda/shared_memory_cache_helper.h: -------------------------------------------------------------------------------- 1 | #include "../generic/shared_memory_cache_helper.h" 2 | -------------------------------------------------------------------------------- /include/targets/cuda/thread_local_cache.h: -------------------------------------------------------------------------------- 1 | 
#include "../generic/thread_local_cache.h" 2 | -------------------------------------------------------------------------------- /include/targets/cuda/warp_collective.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace quda 6 | { 7 | 8 | /* generic (host / no-op) case: warp_combine is the identity */ 9 | template struct warp_combine_impl { 10 | template T operator()(T &x, int) { return x; } 11 | }; 12 | 13 | /* device case: shuffle-based tree reduction that sums the contributions of
   the column-split thread groups down into the first group */ 14 | template <> struct warp_combine_impl { 15 | template __device__ inline T operator()(T &x, int warp_split) 16 | { 17 | constexpr int warp_size = device::warp_size(); 18 | if (warp_split > 1) { 19 | #pragma unroll 20 | for (int i = 0; i < x.size(); i++) { 21 | // reduce down to the first group of column-split threads 22 | #pragma unroll 23 | for (int offset = warp_size / 2; offset >= warp_size / warp_split; offset /= 2) { 24 | // TODO - add support for non-converged warps 25 | /* NOTE(review): mask comes from device::warp_converged_mask(), i.e. this assumes a fully converged warp — see TODO above */ 26 | x[i].real(x[i].real() + __shfl_down_sync(device::warp_converged_mask(), x[i].real(), offset)); 27 | x[i].imag(x[i].imag() + __shfl_down_sync(device::warp_converged_mask(), x[i].imag(), offset)); 28 | } 29 | } 30 | } 31 | return x; 32 | } 33 | }; 34 | 35 | /* public entry point: dispatches to the host or device implementation */ 36 | template __device__ __host__ inline T warp_combine(T &x) 37 | { 38 | return target::dispatch(x, warp_split); 39 | } 40 | 41 | } // namespace quda 42 | -------------------------------------------------------------------------------- /include/targets/generic/FFT_Plans.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | // Dummy implementation that does nothing 6 | 7 | #define FFT_FORWARD 0 8 | #define FFT_INVERSE 1 9 | 10 | namespace quda 11 | { 12 | 13 | typedef struct { 14 | bool isDouble; 15 | } FFTPlanHandle; 16 | 17 | inline static constexpr bool HaveFFT() { return false; } 18 | 19 | inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int) { errorQuda("FFTs are disabled"); } 20 | 21 | inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, 
int) { errorQuda("FFTs are disabled"); } 22 | 23 | inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision) { errorQuda("FFTs are disabled"); } 24 | 25 | inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision) { errorQuda("FFTs are disabled"); } 26 | 27 | inline void FFTDestroyPlan(FFTPlanHandle &) { errorQuda("FFTs are disabled"); } 28 | 29 | } // namespace quda 30 | -------------------------------------------------------------------------------- /include/targets/generic/aos.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quda 4 | { 5 | 6 | /** 7 | @brief Load n-length block of memory of type T and return in local array 8 | @tparam T Array element type 9 | @tparam n Number of elements in the structure 10 | @param[out] out Output array 11 | @param[in] in Input memory pointer we are block loading from 12 | */ 13 | template __host__ __device__ void block_load(T out[n], const T *in) 14 | { 15 | #pragma unroll 16 | for (int i = 0; i < n; i++) out[i] = in[i]; 17 | } 18 | 19 | /** 20 | @brief Store n-length array of type T in block of memory 21 | @tparam T Array element type 22 | @tparam n Number of elements in the array 23 | @param[out] out Output memory pointer we are block storing to 24 | @param[in] in Input array 25 | */ 26 | template __host__ __device__ void block_store(T *out, const T in[n]) 27 | { 28 | #pragma unroll 29 | for (int i = 0; i < n; i++) out[i] = in[i]; 30 | } 31 | 32 | /** 33 | @brief Load type T from contiguous memory 34 | @tparam T Element type 35 | @param[out] out Output value 36 | @param[in] in Input memory pointer we are loading from 37 | */ 38 | template __host__ __device__ void block_load(T &out, const T *in) { out = *in; } 39 | 40 | /** 41 | @brief Store type T in contiguous memory 42 | @tparam T Element type 43 | @param[out] out Output memory pointer we are storing to 44 | @param[in] in Input value 45 | */ 46 | template __host__ __device__ void 
block_store(T *out, const T &in) { *out = in; } 47 | 48 | } // namespace quda 49 | -------------------------------------------------------------------------------- /include/targets/generic/block_reduction_kernel_host.h: -------------------------------------------------------------------------------- 1 | namespace quda 2 | { 3 | 4 | template