├── .clang-format
├── .cmake-format.py
├── .github
    ├── CODEOWNERS
    └── workflows
    │   ├── cuda_githubactions_build.yml
    │   ├── cuda_githubactions_build_beta.yml
    │   └── rocm_githubactions_build.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── NEWS
├── QUDAConfig.cmake.in
├── README.md
├── ci
    ├── docker
    │   └── Dockerfile.build
    └── pipeline.yml
├── cmake
    ├── CPM.cmake
    ├── FindEigen.cmake
    ├── FindLibDL.cmake
    ├── find_target_cuda_dependencies.cmake
    └── find_target_hip_dependencies.cmake
├── doc
    ├── CMakeLists.txt
    └── Doxyfile.in
├── include
    ├── accelerator.h
    ├── array.h
    ├── blas_3d.h
    ├── blas_helper.cuh
    ├── blas_lapack.h
    ├── blas_quda.h
    ├── clover_backup.h
    ├── clover_field.h
    ├── clover_field_order.h
    ├── color_spinor.h
    ├── color_spinor_field.h
    ├── color_spinor_field_order.h
    ├── comm_key.h
    ├── comm_quda.h
    ├── communicator_quda.h
    ├── complex_quda.h
    ├── contract_quda.h
    ├── convert.h
    ├── dbldbl.h
    ├── declare_enum.h
    ├── deflation.h
    ├── device.h
    ├── device_vector.h
    ├── dirac_quda.h
    ├── domain_decomposition.h
    ├── domain_decomposition_helper.cuh
    ├── domain_wall_helper.h
    ├── double_single.h
    ├── dslash.h
    ├── dslash_helper.cuh
    ├── dslash_quda.h
    ├── dslash_shmem.h
    ├── eigen_helper.h
    ├── eigensolve_quda.h
    ├── enum_quda.h
    ├── enum_quda_fortran.h
    ├── expand_list.hpp
    ├── externals
    │   ├── .clang-format
    │   ├── CLI11.hpp
    │   └── json.hpp
    ├── field_cache.h
    ├── float_vector.h
    ├── gamma.cuh
    ├── gauge_backup.h
    ├── gauge_field.h
    ├── gauge_field_order.h
    ├── gauge_fix_ovr_hit_devf.cuh
    ├── gauge_path_helper.cuh
    ├── gauge_path_quda.h
    ├── gauge_tools.h
    ├── gauge_update_quda.h
    ├── hw_quda.h
    ├── index_helper.cuh
    ├── inline_ptx.h
    ├── instantiate.h
    ├── instantiate_dslash.h
    ├── int_factor_array.hpp
    ├── int_list.hpp
    ├── invert_quda.h
    ├── invert_x_update.h
    ├── json_helper.h
    ├── kernel_helper.h
    ├── kernels
    │   ├── blas_3d.cuh
    │   ├── blas_core.cuh
    │   ├── block_orthogonalize.cuh
    │   ├── block_transpose.cuh
    │   ├── clover_compute.cuh
    │   ├── clover_deriv.cuh
    │   ├── clover_invert.cuh
    │   ├── clover_outer_product.cuh
    │   ├── clover_sigma_outer_product.cuh
    │   ├── clover_trace.cuh
    │   ├── coarse_op_kernel.cuh
    │   ├── coarse_op_kernel_mma.cuh
    │   ├── coarse_op_preconditioned.cuh
    │   ├── coarse_op_preconditioned_mma.cuh
    │   ├── color_spinor_pack.cuh
    │   ├── color_spinor_project_domain_decomp.cuh
    │   ├── contraction.cuh
    │   ├── copy_clover.cuh
    │   ├── copy_color_spinor.cuh
    │   ├── copy_color_spinor_mg.cuh
    │   ├── copy_field_offset.cuh
    │   ├── copy_gauge.cuh
    │   ├── copy_gauge_extended.cuh
    │   ├── covariant_derivative.cuh
    │   ├── device_vector_axpby.cuh
    │   ├── dslash_clover_helper.cuh
    │   ├── dslash_coarse.cuh
    │   ├── dslash_coarse_mma.cuh
    │   ├── dslash_domain_wall_4d.cuh
    │   ├── dslash_domain_wall_4d_fused_m5.cuh
    │   ├── dslash_domain_wall_5d.cuh
    │   ├── dslash_domain_wall_m5.cuh
    │   ├── dslash_gamma_helper.cuh
    │   ├── dslash_mdw_fused.cuh
    │   ├── dslash_mobius_eofa.cuh
    │   ├── dslash_ndeg_twisted_clover.cuh
    │   ├── dslash_ndeg_twisted_clover_preconditioned.cuh
    │   ├── dslash_ndeg_twisted_mass.cuh
    │   ├── dslash_ndeg_twisted_mass_preconditioned.cuh
    │   ├── dslash_pack.cuh
    │   ├── dslash_shmem_helper.cuh
    │   ├── dslash_staggered.cuh
    │   ├── dslash_twisted_clover_preconditioned.cuh
    │   ├── dslash_twisted_mass.cuh
    │   ├── dslash_twisted_mass_preconditioned.cuh
    │   ├── dslash_wilson.cuh
    │   ├── dslash_wilson_clover.cuh
    │   ├── dslash_wilson_clover_hasenbusch_twist.cuh
    │   ├── dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
    │   ├── dslash_wilson_clover_preconditioned.cuh
    │   ├── evec_project.cuh
    │   ├── extract_gauge_ghost.cuh
    │   ├── extract_gauge_ghost_extended.cuh
    │   ├── field_strength_tensor.cuh
    │   ├── gauge_ape.cuh
    │   ├── gauge_det_trace.cuh
    │   ├── gauge_fix_fft.cuh
    │   ├── gauge_fix_ovr.cuh
    │   ├── gauge_force.cuh
    │   ├── gauge_heatbath.cuh
    │   ├── gauge_hyp.cuh
    │   ├── gauge_loop_trace.cuh
    │   ├── gauge_noise.cuh
    │   ├── gauge_phase.cuh
    │   ├── gauge_plaq.cuh
    │   ├── gauge_plaqrect.cuh
    │   ├── gauge_polyakov_loop.cuh
    │   ├── gauge_qcharge.cuh
    │   ├── gauge_random.cuh
    │   ├── gauge_stout.cuh
    │   ├── gauge_update.cuh
    │   ├── gauge_utils.cuh
    │   ├── gauge_wilson_flow.cuh
    │   ├── hisq_paths_force.cuh
    │   ├── laplace.cuh
    │   ├── llfat.cuh
    │   ├── madwf_tensor.cuh
    │   ├── madwf_transfer.cuh
    │   ├── momentum.cuh
    │   ├── multi_blas_core.cuh
    │   ├── multi_reduce_core.cuh
    │   ├── pgauge_exchange.cuh
    │   ├── pgauge_init.cuh
    │   ├── prolongator.cuh
    │   ├── prolongator_mma.cuh
    │   ├── random_init.cuh
    │   ├── reduce_core.cuh
    │   ├── reduce_init.cuh
    │   ├── restrictor.cuh
    │   ├── restrictor_mma.cuh
    │   ├── spin_duplicate.cuh
    │   ├── spin_taste.cuh
    │   ├── spinor_dilute.cuh
    │   ├── spinor_noise.cuh
    │   ├── spinor_reweight.cuh
    │   ├── staggered_coarse_op_kernel.cuh
    │   ├── staggered_kd_apply_xinv_kernel.cuh
    │   ├── staggered_kd_reorder_xinv_kernel.cuh
    │   ├── staggered_outer_product.cuh
    │   ├── staggered_prolong_restrict.cuh
    │   ├── staggered_quark_smearing.cuh
    │   ├── staggered_two_link.cuh
    │   ├── transform_reduce.cuh
    │   ├── unitarize_force.cuh
    │   └── unitarize_links.cuh
    ├── ks_force_quda.h
    ├── ks_improved_force.h
    ├── ks_qsmear.h
    ├── lattice_field.h
    ├── layout_hyper.h
    ├── linalg.cuh
    ├── llfat_quda.h
    ├── madwf_ml.h
    ├── madwf_param.h
    ├── malloc_quda.h
    ├── matrix_field.h
    ├── matrix_tile.cuh
    ├── milc_interface_internal.hpp
    ├── momentum.h
    ├── monitor.h
    ├── mpi_comm_handle.h
    ├── multi_blas_helper.cuh
    ├── multigrid.h
    ├── multigrid_helper.cuh
    ├── numa_affinity.h
    ├── object.h
    ├── pgauge_monte.h
    ├── polynomial.h
    ├── power_of_two_array.h
    ├── qio_field.h
    ├── quda.h
    ├── quda_api.h
    ├── quda_arch.h
    ├── quda_arpack_interface.h
    ├── quda_constants.h
    ├── quda_define.h.in
    ├── quda_fortran.h
    ├── quda_internal.h
    ├── quda_matrix.h
    ├── quda_milc_interface.h
    ├── quda_ptr.h
    ├── random_quda.h
    ├── reducer.h
    ├── reference_wrapper_helper.h
    ├── register_traits.h
    ├── reliable_updates.h
    ├── shmem_helper.cuh
    ├── shmem_pack_helper.cuh
    ├── spin_taste.h
    ├── split_grid.h
    ├── staggered_kd_build_xinv.h
    ├── staggered_oprod.h
    ├── su3_project.cuh
    ├── svd_quda.h
    ├── targets
    │   ├── cuda
    │   │   ├── FFT_Plans.h
    │   │   ├── aos.h
    │   │   ├── atomic_helper.h
    │   │   ├── atomic_helper_cas.h
    │   │   ├── block_reduce_helper.h
    │   │   ├── block_reduction_kernel.h
    │   │   ├── constant_kernel_arg.h
    │   │   ├── device.in.hpp
    │   │   ├── externals
    │   │   │   ├── generics
    │   │   │   │   ├── detail
    │   │   │   │   │   ├── alias.h
    │   │   │   │   │   └── array.h
    │   │   │   │   ├── ldg.h
    │   │   │   │   └── shfl.h
    │   │   │   ├── jitify.hpp
    │   │   │   └── trove
    │   │   │   │   ├── aos.h
    │   │   │   │   ├── array.h
    │   │   │   │   ├── block.h
    │   │   │   │   ├── detail
    │   │   │   │       ├── dismember.h
    │   │   │   │       └── fallback.h
    │   │   │   │   ├── memory.h
    │   │   │   │   ├── print_array.h
    │   │   │   │   ├── ptr.h
    │   │   │   │   ├── rotate.h
    │   │   │   │   ├── shfl.h
    │   │   │   │   ├── static_gcd.h
    │   │   │   │   ├── static_mod_inverse.h
    │   │   │   │   ├── transpose.h
    │   │   │   │   ├── utility.h
    │   │   │   │   └── warp.h
    │   │   ├── fast_intdiv.h
    │   │   ├── jitify_helper.h
    │   │   ├── jitify_options.hpp.in
    │   │   ├── kernel.h
    │   │   ├── load_store.h
    │   │   ├── math_helper.cuh
    │   │   ├── math_helper.h
    │   │   ├── mdw_dslash5_tensor_core.cuh
    │   │   ├── mma_tensor_op
    │   │   │   ├── gemm.cuh
    │   │   │   ├── gmem_loader.cuh
    │   │   │   ├── hmma_m16n16k4_sm70.cuh
    │   │   │   ├── hmma_m16n8k8_sm70.cuh
    │   │   │   ├── hmma_m16n8k8_sm80.cuh
    │   │   │   ├── hmma_tfloat32_sm80.cuh
    │   │   │   ├── mma_dispatch.cuh
    │   │   │   ├── mma_instruction.cuh
    │   │   │   ├── shared_memory_pattern.cuh
    │   │   │   ├── simt.cuh
    │   │   │   ├── simt_half.cuh
    │   │   │   ├── smma_m16n16k4_sm70.cuh
    │   │   │   ├── smma_m16n8_sm80.cuh
    │   │   │   └── smma_m16n8k8_sm70.cuh
    │   │   ├── pipeline.cuh
    │   │   ├── quda_cuda_api.h
    │   │   ├── quda_fp16.cuh
    │   │   ├── random_helper.h
    │   │   ├── reduce_helper.h
    │   │   ├── reduction_kernel.h
    │   │   ├── shared_memory_cache_helper.h
    │   │   ├── shared_memory_helper.h
    │   │   ├── target_device.h
    │   │   ├── thread_local_cache.h
    │   │   ├── tma_helper.hpp
    │   │   ├── tunable_kernel.h
    │   │   └── warp_collective.h
    │   ├── generic
    │   │   ├── FFT_Plans.h
    │   │   ├── aos.h
    │   │   ├── block_reduce_helper.h
    │   │   ├── block_reduction_kernel_host.h
    │   │   ├── fast_intdiv.h
    │   │   ├── kernel_host.h
    │   │   ├── kernel_ops.h
    │   │   ├── kernel_ops_target.h
    │   │   ├── load_store.h
    │   │   ├── math_helper.h
    │   │   ├── mrg32k3a.h
    │   │   ├── random_helper.h
    │   │   ├── reduce_helper.h
    │   │   ├── reduction_kernel_host.h
    │   │   ├── shared_memory_cache_helper.h
    │   │   ├── thread_array.h
    │   │   └── thread_local_cache.h
    │   └── hip
    │   │   ├── FFT_Plans.h
    │   │   ├── atomic_helper.h
    │   │   ├── block_reduce_helper.h
    │   │   ├── block_reduction_kernel.h
    │   │   ├── constant_kernel_arg.h
    │   │   ├── kernel.h
    │   │   ├── load_store.h
    │   │   ├── math_helper.cuh
    │   │   ├── math_helper.h
    │   │   ├── quda_hip_api.h
    │   │   ├── random_helper.h
    │   │   ├── reduce_helper.h
    │   │   ├── reduction_kernel.h
    │   │   ├── shared_memory_cache_helper.h
    │   │   ├── shared_memory_helper.h
    │   │   ├── target_device.h
    │   │   ├── tunable_kernel.h
    │   │   └── warp_collective.h
    ├── timer.h
    ├── transfer.h
    ├── transform_reduce.h
    ├── tunable_block_reduction.h
    ├── tunable_nd.h
    ├── tunable_reduction.h
    ├── tune_key.h
    ├── tune_quda.h
    ├── uint_to_char.h
    ├── unitarization_links.h
    ├── util_quda.h
    ├── vector_io.h
    └── worker.h
├── jenkins
    ├── bqcd.config.cmake
    ├── milc.config.cmake
    └── twistedmass.config.cmake
├── lib
    ├── .directory
    ├── CMakeLists.txt
    ├── blas_3d.cu
    ├── blas_quda.cu
    ├── block_orthogonalize.in.cpp
    ├── block_orthogonalize.in.cu
    ├── block_transpose.in.cu
    ├── check_params.h
    ├── checksum.cu
    ├── clover_deriv_quda.cu
    ├── clover_field.cpp
    ├── clover_force.cpp
    ├── clover_invert.cu
    ├── clover_outer_product.cu
    ├── clover_quda.cu
    ├── clover_sigma_outer_product.cu
    ├── clover_trace_quda.cu
    ├── coarse_op.cuh
    ├── coarse_op.in.cpp
    ├── coarse_op.in.cu
    ├── coarse_op_mma_launch.h
    ├── coarse_op_preconditioned.in.cpp
    ├── coarse_op_preconditioned.in.cu
    ├── coarse_op_preconditioned_mma_launch.h
    ├── coarsecoarse_op.hpp
    ├── coarsecoarse_op.in.cpp
    ├── coarsecoarse_op.in.cu
    ├── coarsecoarse_op_mma.in.cu
    ├── color_spinor_field.cpp
    ├── color_spinor_pack.in.cu
    ├── color_spinor_project_domain_decomp.cu
    ├── color_spinor_util.in.cu
    ├── comm_common.cpp
    ├── communicator_mpi.cpp
    ├── communicator_qmp.cpp
    ├── communicator_single.cpp
    ├── communicator_stack.cpp
    ├── contract.cu
    ├── copy_clover.cu
    ├── copy_clover_offset.cu
    ├── copy_color_spinor.cpp
    ├── copy_color_spinor.cuh
    ├── copy_color_spinor_dd.cu
    ├── copy_color_spinor_dh.cu
    ├── copy_color_spinor_dq.cu
    ├── copy_color_spinor_ds.cu
    ├── copy_color_spinor_hd.cu
    ├── copy_color_spinor_hh.cu
    ├── copy_color_spinor_hq.cu
    ├── copy_color_spinor_hs.cu
    ├── copy_color_spinor_mg.in.hpp
    ├── copy_color_spinor_mg_dd.cu
    ├── copy_color_spinor_mg_ds.cu
    ├── copy_color_spinor_mg_hh.cu
    ├── copy_color_spinor_mg_hq.cu
    ├── copy_color_spinor_mg_hs.cu
    ├── copy_color_spinor_mg_qh.cu
    ├── copy_color_spinor_mg_qq.cu
    ├── copy_color_spinor_mg_qs.cu
    ├── copy_color_spinor_mg_sd.cu
    ├── copy_color_spinor_mg_sh.cu
    ├── copy_color_spinor_mg_sq.cu
    ├── copy_color_spinor_mg_ss.cu
    ├── copy_color_spinor_offset.cu
    ├── copy_color_spinor_qd.cu
    ├── copy_color_spinor_qh.cu
    ├── copy_color_spinor_qq.cu
    ├── copy_color_spinor_qs.cu
    ├── copy_color_spinor_sd.cu
    ├── copy_color_spinor_sh.cu
    ├── copy_color_spinor_sq.cu
    ├── copy_color_spinor_ss.cu
    ├── copy_field_offset.hpp
    ├── copy_gauge.in.cpp
    ├── copy_gauge_double.cu
    ├── copy_gauge_extended.cu
    ├── copy_gauge_half.cu
    ├── copy_gauge_helper.hpp
    ├── copy_gauge_inc.cu
    ├── copy_gauge_mg.in.cu
    ├── copy_gauge_offset.cu
    ├── copy_gauge_quarter.cu
    ├── copy_gauge_single.cu
    ├── covariant_derivative.cu
    ├── deflation.cpp
    ├── device_vector.cu
    ├── dirac.cpp
    ├── dirac_clover.cpp
    ├── dirac_clover_hasenbusch_twist.cpp
    ├── dirac_coarse.cpp
    ├── dirac_domain_wall.cpp
    ├── dirac_domain_wall_4d.cpp
    ├── dirac_improved_staggered.cpp
    ├── dirac_improved_staggered_kd.cpp
    ├── dirac_mobius.cpp
    ├── dirac_staggered.cpp
    ├── dirac_staggered_kd.cpp
    ├── dirac_twisted_clover.cpp
    ├── dirac_twisted_mass.cpp
    ├── dirac_wilson.cpp
    ├── dslash5_domain_wall.cu
    ├── dslash5_mobius_eofa.cu
    ├── dslash_clover_helper.cu
    ├── dslash_coarse.hpp
    ├── dslash_coarse.in.cpp
    ├── dslash_coarse.in.cu
    ├── dslash_coarse_mma.in.cu
    ├── dslash_coarse_mma.in.hpp
    ├── dslash_constant_arg.cu
    ├── dslash_domain_wall_4d.cpp
    ├── dslash_domain_wall_4d.hpp
    ├── dslash_domain_wall_4d.in.cu
    ├── dslash_domain_wall_4d_fused_m5.hpp
    ├── dslash_domain_wall_4d_fused_m5.in.cu
    ├── dslash_domain_wall_4d_m5inv.cpp
    ├── dslash_domain_wall_4d_m5inv.hpp
    ├── dslash_domain_wall_4d_m5inv.in.cu
    ├── dslash_domain_wall_4d_m5inv_m5inv.cpp
    ├── dslash_domain_wall_4d_m5inv_m5pre.cpp
    ├── dslash_domain_wall_4d_m5mob.cpp
    ├── dslash_domain_wall_4d_m5pre.cpp
    ├── dslash_domain_wall_4d_m5pre_m5inv.cpp
    ├── dslash_domain_wall_4d_m5pre_m5mob.cpp
    ├── dslash_domain_wall_5d.cpp
    ├── dslash_domain_wall_5d.hpp
    ├── dslash_domain_wall_5d.in.cu
    ├── dslash_gamma_helper.cu
    ├── dslash_improved_staggered.cpp
    ├── dslash_improved_staggered.hpp
    ├── dslash_improved_staggered.in.cu
    ├── dslash_index.cuh
    ├── dslash_mdw_fused.in.cu
    ├── dslash_mdw_fused.in.hpp
    ├── dslash_mdw_fused_impl.hpp
    ├── dslash_ndeg_twisted_clover.cpp
    ├── dslash_ndeg_twisted_clover.hpp
    ├── dslash_ndeg_twisted_clover.in.cu
    ├── dslash_ndeg_twisted_clover_preconditioned.cpp
    ├── dslash_ndeg_twisted_clover_preconditioned.hpp
    ├── dslash_ndeg_twisted_clover_preconditioned.in.cu
    ├── dslash_ndeg_twisted_mass.cpp
    ├── dslash_ndeg_twisted_mass.hpp
    ├── dslash_ndeg_twisted_mass.in.cu
    ├── dslash_ndeg_twisted_mass_preconditioned.cpp
    ├── dslash_ndeg_twisted_mass_preconditioned.hpp
    ├── dslash_ndeg_twisted_mass_preconditioned.in.cu
    ├── dslash_pack2.cu
    ├── dslash_policy.hpp
    ├── dslash_quda.cu
    ├── dslash_staggered.cpp
    ├── dslash_staggered.hpp
    ├── dslash_staggered.in.cu
    ├── dslash_twisted_clover.cpp
    ├── dslash_twisted_clover.hpp
    ├── dslash_twisted_clover.in.cu
    ├── dslash_twisted_clover_preconditioned.cpp
    ├── dslash_twisted_clover_preconditioned.hpp
    ├── dslash_twisted_clover_preconditioned.in.cu
    ├── dslash_twisted_mass.cpp
    ├── dslash_twisted_mass.hpp
    ├── dslash_twisted_mass.in.cu
    ├── dslash_twisted_mass_preconditioned.cpp
    ├── dslash_twisted_mass_preconditioned.hpp
    ├── dslash_twisted_mass_preconditioned.in.cu
    ├── dslash_wilson.cpp
    ├── dslash_wilson.hpp
    ├── dslash_wilson.in.cu
    ├── dslash_wilson_clover.cpp
    ├── dslash_wilson_clover.hpp
    ├── dslash_wilson_clover.in.cu
    ├── dslash_wilson_clover_distance.cpp
    ├── dslash_wilson_clover_distance.in.cu
    ├── dslash_wilson_clover_hasenbusch_twist.cpp
    ├── dslash_wilson_clover_hasenbusch_twist.hpp
    ├── dslash_wilson_clover_hasenbusch_twist.in.cu
    ├── dslash_wilson_clover_hasenbusch_twist_preconditioned.cpp
    ├── dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
    ├── dslash_wilson_clover_hasenbusch_twist_preconditioned_clovinv.in.cu
    ├── dslash_wilson_clover_hasenbusch_twist_preconditioned_no_clovinv.in.cu
    ├── dslash_wilson_clover_preconditioned.cpp
    ├── dslash_wilson_clover_preconditioned.hpp
    ├── dslash_wilson_clover_preconditioned.in.cu
    ├── dslash_wilson_clover_preconditioned_distance.cpp
    ├── dslash_wilson_clover_preconditioned_distance.in.cu
    ├── dslash_wilson_distance.cpp
    ├── dslash_wilson_distance.in.cu
    ├── eig_block_trlm.cpp
    ├── eig_iram.cpp
    ├── eig_trlm.cpp
    ├── eig_trlm_3d.cpp
    ├── eigensolve_quda.cpp
    ├── evec_project.cu
    ├── extract_gauge_ghost.in.cu
    ├── extract_gauge_ghost_extended.cu
    ├── extract_gauge_ghost_helper.cuh
    ├── extract_gauge_ghost_mg.in.cu
    ├── field_cache.cpp
    ├── gauge_ape.cu
    ├── gauge_covdev.cpp
    ├── gauge_field.cpp
    ├── gauge_field_strength_tensor.cu
    ├── gauge_fix_fft.cu
    ├── gauge_fix_ovr.cu
    ├── gauge_force.cu
    ├── gauge_hyp.cu
    ├── gauge_laplace.cpp
    ├── gauge_loop_trace.cu
    ├── gauge_noise.in.cu
    ├── gauge_norm.in.cu
    ├── gauge_observable.cpp
    ├── gauge_phase.cu
    ├── gauge_plaq.cu
    ├── gauge_plaqrect.cu
    ├── gauge_polyakov_loop.cu
    ├── gauge_qcharge.cu
    ├── gauge_random.cu
    ├── gauge_stout.cu
    ├── gauge_update_quda.cu
    ├── gauge_wilson_flow.cu
    ├── generate
    │   ├── nvtx.w
    │   └── wrap.py
    ├── hisq_paths_force_quda.cu
    ├── instantiate.cpp
    ├── interface
    │   ├── CMakeLists.txt
    │   ├── blas_interface.cpp
    │   └── fortran_interface.cpp
    ├── interface_quda.cpp
    ├── inv_bicgstab_quda.cpp
    ├── inv_bicgstabl_quda.cpp
    ├── inv_ca_cg.cpp
    ├── inv_ca_gcr.cpp
    ├── inv_cg3_quda.cpp
    ├── inv_cg_quda.cpp
    ├── inv_cgne.cpp
    ├── inv_cgnr.cpp
    ├── inv_eigcg_quda.cpp
    ├── inv_gcr_quda.cpp
    ├── inv_gmresdr_quda.cpp
    ├── inv_mr_quda.cpp
    ├── inv_mre.cpp
    ├── inv_msrc_cg_quda.cpp
    ├── inv_multi_cg_quda.cpp
    ├── inv_pcg_quda.cpp
    ├── inv_sd_quda.cpp
    ├── laplace.cpp
    ├── laplace.hpp
    ├── laplace.in.cu
    ├── lattice_field.cpp
    ├── layout_hyper.cpp
    ├── llfat_quda.cu
    ├── madwf_ml.cpp
    ├── madwf_tensor.cu
    ├── madwf_transfer.cu
    ├── madwf_transfer.h
    ├── max_clover.cu
    ├── milc_interface.cpp
    ├── milc_interface_internal.cpp
    ├── momentum.cu
    ├── monitor.cpp
    ├── multi_blas_quda.cu
    ├── multi_reduce_quda.cu
    ├── multigrid.cpp
    ├── multigrid.in.hpp
    ├── numa_affinity.cpp
    ├── nvtx_pmpi.c
    ├── pgauge_det_trace.cu
    ├── pgauge_exchange.cu
    ├── pgauge_heatbath.cu
    ├── pgauge_init.cu
    ├── prolongator.in.cpp
    ├── prolongator.in.cu
    ├── prolongator_mma.in.cu
    ├── qio_field.cpp
    ├── quda_arpack_interface.cpp
    ├── quda_fortran.F90
    ├── quda_ptr.cpp
    ├── random.cu
    ├── reduce_helper.cu
    ├── reduce_quda.cu
    ├── restrictor.in.cpp
    ├── restrictor.in.cu
    ├── restrictor_mma.in.cu
    ├── solve.cpp
    ├── solver.cpp
    ├── solver.hpp
    ├── spin_duplicate.in.cu
    ├── spin_taste.cu
    ├── spinor_dilute.in.cu
    ├── spinor_noise.in.cu
    ├── spinor_reweight.cu
    ├── staggered_coarse_op.in.cpp
    ├── staggered_coarse_op.in.cu
    ├── staggered_kd_apply_xinv.cu
    ├── staggered_kd_build_xinv.cu
    ├── staggered_kd_reorder_xinv.cu
    ├── staggered_oprod.cu
    ├── staggered_prolong_restrict.cu
    ├── staggered_quark_smearing.cu
    ├── staggered_two_link_quda.cu
    ├── targets
    │   ├── cuda
    │   │   ├── CMakeLists.txt
    │   │   ├── blas_lapack_cublas.cpp
    │   │   ├── comm_target.cpp
    │   │   ├── device.cpp
    │   │   ├── jitify_helper.cpp
    │   │   ├── malloc.cpp
    │   │   ├── quda_api.cpp
    │   │   └── target_cuda.cmake
    │   ├── generic
    │   │   ├── CMakeLists.txt
    │   │   └── blas_lapack_eigen.cpp
    │   └── hip
    │   │   ├── CMakeLists.txt
    │   │   ├── blas_lapack_hipblas.cpp
    │   │   ├── comm_target.cpp
    │   │   ├── device.cpp
    │   │   ├── malloc.cpp
    │   │   ├── quda_api.cpp
    │   │   └── target_hip.cmake
    ├── timer.cpp
    ├── transfer.cpp
    ├── transform_reduce.cu
    ├── tune.cpp
    ├── unitarize_force_quda.cu
    ├── unitarize_links_quda.cu
    ├── util_quda.cpp
    ├── vector_io.cpp
    └── version.cpp
└── tests
    ├── CMakeLists.txt
    ├── asan.h
    ├── blas_interface_test.cpp
    ├── blas_interface_test_gtest.hpp
    ├── blas_test.cpp
    ├── c_interface_test.c
    ├── clover_force_test.cpp
    ├── contract_ft_test.cpp
    ├── contract_ft_test_gtest.hpp
    ├── covdev_test.cpp
    ├── covdev_test_gtest.hpp
    ├── deflated_invert_test.cpp
    ├── dilution_test.cpp
    ├── dslash_ctest.cpp
    ├── dslash_test.cpp
    ├── dslash_test_utils.h
    ├── eigensolve_test.cpp
    ├── eigensolve_test_gtest.hpp
    ├── gauge_alg_test.cpp
    ├── gauge_path_test.cpp
    ├── googletest
        ├── include
        │   └── gtest
        │   │   ├── gtest-death-test.h
        │   │   ├── gtest-matchers.h
        │   │   ├── gtest-message.h
        │   │   ├── gtest-param-test.h
        │   │   ├── gtest-printers.h
        │   │   ├── gtest-spi.h
        │   │   ├── gtest-test-part.h
        │   │   ├── gtest-typed-test.h
        │   │   ├── gtest.h
        │   │   ├── gtest_pred_impl.h
        │   │   ├── gtest_prod.h
        │   │   └── internal
        │   │       ├── custom
        │   │           ├── README.md
        │   │           ├── gtest-port.h
        │   │           ├── gtest-printers.h
        │   │           └── gtest.h
        │   │       ├── gtest-death-test-internal.h
        │   │       ├── gtest-filepath.h
        │   │       ├── gtest-internal.h
        │   │       ├── gtest-param-util.h
        │   │       ├── gtest-port-arch.h
        │   │       ├── gtest-port.h
        │   │       ├── gtest-string.h
        │   │       ├── gtest-type-util.h
        │   │       └── gtest-type-util.h.pump
        └── src
        │   ├── gtest-all.cc
        │   ├── gtest-death-test.cc
        │   ├── gtest-filepath.cc
        │   ├── gtest-internal-inl.h
        │   ├── gtest-matchers.cc
        │   ├── gtest-port.cc
        │   ├── gtest-printers.cc
        │   ├── gtest-test-part.cc
        │   ├── gtest-typed-test.cc
        │   ├── gtest.cc
        │   └── gtest_main.cc
    ├── heatbath_test.cpp
    ├── hisq_paths_force_test.cpp
    ├── hisq_stencil_ctest.cpp
    ├── hisq_stencil_test.cpp
    ├── hisq_stencil_test_utils.h
    ├── hisq_unitarize_force_test.cpp
    ├── host_reference
        ├── CMakeLists.txt
        ├── README.md
        ├── blas_reference.cpp
        ├── blas_reference.h
        ├── clover_force_reference.cpp
        ├── clover_force_reference.h
        ├── clover_reference.cpp
        ├── contract_ft_reference.h
        ├── contract_reference.h
        ├── covdev_reference.cpp
        ├── covdev_reference.h
        ├── domain_wall_dslash_reference.cpp
        ├── domain_wall_dslash_reference.h
        ├── dslash_reference.cpp
        ├── dslash_reference.h
        ├── dslash_test_helpers.cpp
        ├── dslash_test_helpers.h
        ├── gamma_reference.h
        ├── gauge_force_reference.cpp
        ├── gauge_force_reference.h
        ├── hisq_force_reference.cpp
        ├── hisq_force_reference.h
        ├── staggered_dslash_reference.cpp
        ├── staggered_dslash_reference.h
        ├── wilson_dslash_reference.cpp
        └── wilson_dslash_reference.h
    ├── invert_test.cpp
    ├── invert_test_gtest.hpp
    ├── io_test.cpp
    ├── laph_test.cpp
    ├── llfat_test.cpp
    ├── multigrid_benchmark_test.cpp
    ├── multigrid_evolve_test.cpp
    ├── new_half.cu
    ├── pack_test.cpp
    ├── plaq_test.cpp
    ├── sanity_check.sh
    ├── scale_staggered_dslash_test.sh
    ├── scale_wilson_dslash_test.sh
    ├── sim_scale_staggered_dslash.sh
    ├── sim_scale_staggered_dslash_no_comms.sh
    ├── sim_scale_wilson_dslash.sh
    ├── sim_scale_wilson_dslash_no_comms.sh
    ├── staggered_dslash_ctest.cpp
    ├── staggered_dslash_test.cpp
    ├── staggered_dslash_test_utils.h
    ├── staggered_eigensolve_test.cpp
    ├── staggered_eigensolve_test_gtest.hpp
    ├── staggered_gauge_utils.cpp
    ├── staggered_gsmear_test.cpp
    ├── staggered_gsmear_test_utils.h
    ├── staggered_invert_test.cpp
    ├── staggered_invert_test_gtest.hpp
    ├── su3_fermion_test.cpp
    ├── su3_test.cpp
    ├── test.h
    ├── tune_test.cpp
    ├── unitarize_link_test.cpp
    └── utils
        ├── CMakeLists.txt
        ├── README.md
        ├── command_line_params.cpp
        ├── command_line_params.h
        ├── face_gauge.cpp
        ├── force_utils.hpp
        ├── gauge_utils.cpp
        ├── gauge_utils.h
        ├── host_blas.cpp
        ├── host_utils.cpp
        ├── host_utils.h
        ├── index_utils.cpp
        ├── index_utils.hpp
        ├── instantiate_host.hpp
        ├── llfat_utils.cpp
        ├── llfat_utils.h
        ├── misc.cpp
        ├── misc.h
        ├── momentum_utils.cpp
        ├── momentum_utils.h
        ├── rng_utils.hpp
        ├── set_params.cpp
        ├── short.h
        ├── staggered_gauge_utils.cpp
        ├── staggered_gauge_utils.h
        └── staggered_host_utils.cpp


/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | BasedOnStyle: Webkit
 3 | IndentWidth: 2
 4 | AccessModifierOffset: -2
 5 | AlignAfterOpenBracket: Align
 6 | AlignTrailingComments: true
 7 | AllowShortBlocksOnASingleLine: true
 8 | AllowShortCaseLabelsOnASingleLine : true
 9 | AllowShortIfStatementsOnASingleLine: true
10 | AllowShortLoopsOnASingleLine: true
11 | BreakBeforeBraces: Linux
12 | BreakBeforeTernaryOperators: false
13 | BreakConstructorInitializers: AfterColon
14 | ColumnLimit: 120
15 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
16 | ConstructorInitializerIndentWidth: 2
17 | ContinuationIndentWidth: 2
18 | Cpp11BracedListStyle: true
19 | FixNamespaceComments: true
20 | NamespaceIndentation: All
21 | PenaltyExcessCharacter: 10
22 | PointerAlignment: Right
23 | SortIncludes: false
24 | SpaceBeforeAssignmentOperators: true
25 | CommentPragmas:  '^\\.+'
26 | UseTab: Never
27 | ...
28 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
 2 | # Each line is a file pattern followed by one or more owners.
 3 | 
 4 | # global catch call if not 
 5 | *       @lattice/quda_core
 6 | 
 7 | # CMake Maintainers
 8 | CMakeLists.txt @lattice/cmake_maintainers
 9 | *.cmake @lattice/cmake_maintainers
10 | 
11 | # CUDA Maintainers
12 | cuda/   @lattice/target_cuda_maintainers
13 | 
14 | # HIP Maintainers
15 | hip/    @lattice/target_hip_maintainers
16 | 
17 | # SYCL Maintainers
18 | sycl/   @lattice/target_sycl_maintainers
19 | 
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.f90
 3 | *.mod
 4 | *.a
 5 | *~
 6 | tests/*_test
 7 | milc_interface/*
 8 | *#*
 9 | *.pyc
10 | tunecache.tsv
11 | profile.tsv
12 | config.log
13 | CMakeCache.txt
14 | CMakeFiles
15 | externals
16 | !include/externals
17 | include/quda_define.h
18 | include/jitify_options.hpp
19 | .tags*
20 | autom4te.cache/*
21 | .vscode
22 | cmake/CPM_*.cmake
23 | 


--------------------------------------------------------------------------------
/QUDAConfig.cmake.in:
--------------------------------------------------------------------------------
 1 | @PACKAGE_INIT@
 2 | 
 3 | include(CMakeFindDependencyMacro)
 4 | 
 5 | set(QUDA_QMP @QUDA_QMP@)
 6 | set(QUDA_MPI @QUDA_MPI@)
 7 | set(QUDA_QIO @QUDA_QIO@)
 8 | set(QUDA_OPENMP @QUDA_OPENMP@)
 9 | set(QUDA_QDPJIT @QUDA_QDPJIT@)
10 | set(QUDA_GITVERSION @GITVERSION@)
11 | set(QUDA_PRECISION @QUDA_PRECISION@)
12 | set(QUDA_RECONSTRUCT @QUDA_RECONSTRUCT@)
13 | 
14 | set(QUDA_TARGET_CUDA @QUDA_TARGET_CUDA@)
15 | set(QUDA_TARGET_HIP  @QUDA_TARGET_HIP@)
16 | 
17 | set(QUDA_NVSHMEM  @QUDA_NVSHMEM@)
18 | 
19 | if( QUDA_QMP AND QUDA_MPI )
20 |   message(FATAL_ERROR "Cannot have both QMP and MPI configured")  
21 | endif()
22 | 
23 | # Everyone needs this
24 | find_dependency(Threads REQUIRED)
25 | 
26 | if( QUDA_QMP )
27 |   find_dependency(QMP REQUIRED)
28 | endif()
29 | 
30 | if( QUDA_MPI )
31 |   find_dependency(MPI REQUIRED)
32 | endif()
33 | 
34 | if( QUDA_QIO )
35 |   find_dependency(QIO REQUIRED)
36 | endif()
37 | 
38 | if( QUDA_OPENMP )
39 |   find_dependency(OpenMP REQUIRED)
40 | endif()
41 | 
42 | if( QUDA_TARGET_CUDA )
43 |   include(${CMAKE_CURRENT_LIST_DIR}/find_target_cuda_dependencies.cmake)
44 | elseif(QUDA_TARGET_HIP )
45 |   include(${CMAKE_CURRENT_LIST_DIR}/find_target_hip_dependencies.cmake )
46 | endif()
47 | 
48 | if( QUDA_QDPJIT )
49 |   find_dependency( QDPXX REQUIRED )
50 | endif()
51 | 
52 | include(${CMAKE_CURRENT_LIST_DIR}/QUDATargets.cmake)
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/ci/docker/Dockerfile.build:
--------------------------------------------------------------------------------
 1 | FROM docker.io/nvidia/cuda:12.6.3-devel-ubuntu24.04
 2 | 
 3 | ARG DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | RUN echo "Running CSCS CI on $(nproc) processors"
 6 | 
 7 | RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
 8 |     build-essential \
 9 |     cmake \
10 |     wget \
11 |     ninja-build && \
12 |     rm -rf /var/lib/apt/lists/*
13 | 
14 | ARG MPICH_VERSION=3.3.2
15 | ARG MPICH_PATH=/usr/local/mpich
16 | RUN wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \
17 |     tar -xzf mpich-${MPICH_VERSION}.tar.gz && \
18 |     cd mpich-${MPICH_VERSION} && \
19 |     ./configure \
20 |     --disable-fortran \
21 |     --prefix=$MPICH_PATH && \
22 |     make install -j$(nproc) && \
23 |     rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION}
24 | 
25 | RUN echo "${MPICH_PATH}/lib" >> /etc/ld.so.conf.d/cscs.conf && ldconfig
26 | 
27 | COPY . /quda/src
28 | 
29 | ENV QUDA_TEST_GRID_SIZE="1 1 2 2"
30 | 
31 | RUN  QUDA_TEST_GRID_SIZE=$QUDA_TEST_GRID_SIZE cmake -S /quda/src \
32 |     -DCMAKE_CUDA_COMPILER=nvcc \
33 |     -DCMAKE_CXX_COMPILER=/usr/local/mpich/bin/mpicxx \
34 |     -DCMAKE_C_COMPILER=/usr/local/mpich/bin/mpicc \
35 |     -DCMAKE_BUILD_TYPE=STRICT \
36 |     -DQUDA_CTEST_LAUNCH="" \
37 |     -DQUDA_GPU_ARCH=sm_90 \
38 |     -DQUDA_MULTIGRID=ON \
39 |     -DQUDA_MULTIGRID_NVEC_LIST=6 \
40 |     -DQUDA_MDW_FUSED_LS_LIST=4 \
41 |     -DQUDA_MPI=ON  \
42 |     -DQUDA_DIRAC_DISTANCE_PRECONDITIONING=ON \
43 |     -DQUDA_DIRAC_DEFAULT_OFF=ON \
44 |     -DQUDA_DIRAC_WILSON=ON \
45 |     -DQUDA_DIRAC_CLOVER=ON \
46 |     -DQUDA_DIRAC_TWISTED_CLOVER=ON \
47 |     -DQUDA_DIRAC_STAGGERED=ON \
48 |     -DQUDA_DIRAC_LAPLACE=ON \
49 |     -DQUDA_DIRAC_COVDEV=ON \
50 |     -GNinja \
51 |     -B /quda/build 
52 | 
53 | RUN cmake --build /quda/build -j $(nproc)
54 | 
55 | RUN cmake --install /quda/build
56 | 


--------------------------------------------------------------------------------
/ci/pipeline.yml:
--------------------------------------------------------------------------------
 1 | include:
 2 |   - remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml"
 3 | 
 4 | stages:
 5 |   - build
 6 |   - test
 7 | 
 8 | variables:
 9 |   PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/quda/public/build:$CI_COMMIT_SHORT_SHA
10 | 
11 | build_job:
12 |   stage: build
13 |   extends: .container-builder-cscs-gh200
14 |   variables:
15 |     DOCKERFILE: ci/docker/Dockerfile.build
16 | 
17 | test_job:
18 |   stage: test
19 |   extends: .container-runner-daint-gh200
20 |   image: $PERSIST_IMAGE_NAME
21 |   script:
22 |     - export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
23 |     - export LD_PRELOAD=/usr/lib64/libmpi_gtl_cuda.so
24 |     - if [[ $SLURM_LOCALID == "0" ]]; then rm -rf /quda/build/Testing && ln -s /dev/shm /quda/build/Testing; fi
25 |     - sleep 1
26 |     - ctest --test-dir /quda/build/ --output-on-failure
27 |   variables:
28 |     CRAY_CUDA_MPS: 0
29 |     NVIDIA_VISIBLE_DEVICES: all
30 |     SLURM_JOB_NUM_NODES: 1
31 |     SLURM_NTASKS: 4
32 |     SLURM_PARTITION: normal
33 |     SLURM_TIMELIMIT: "0:30:00"
34 |     USE_MPI: "YES"
35 |     QUDA_ENABLE_TUNING: 0
36 |     QUDA_RESOURCE_PATH: .
37 |     SLURM_MPI_TYPE: cray_shasta
38 |     CSCS_ADDITIONAL_MOUNTS: '["/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/lib/libmpi.so:/usr/local/mpich/lib/libmpi.so.12.1.8", "/opt/cray/pe/lib64/libpmi.so.0:/usr/lib64/libpmi.so.0", "/opt/cray/pe/lib64/libpmi2.so.0:/usr/lib64/libpmi2.so.0",  "/opt/cray/pals/1.4/lib/libpals.so.0:/usr/lib64/libpals.so.0",  "/usr/lib64/libgfortran.so.5:/usr/lib64/libgfortran.so.5", "/opt/cray/pe/mpich/8.1.28/gtl/lib/libmpi_gtl_cuda.so:/usr/lib64/libmpi_gtl_cuda.so"]'
39 | 


--------------------------------------------------------------------------------
/cmake/CPM.cmake:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | #
 3 | # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
 4 | 
 5 | set(CPM_DOWNLOAD_VERSION 0.40.2)
 6 | set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d")
 7 | 
 8 | if(CPM_SOURCE_CACHE)
 9 |   set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
10 | elseif(DEFINED ENV{CPM_SOURCE_CACHE})
11 |   set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
12 | else()
13 |   set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
14 | endif()
15 | 
16 | # Expand relative path. This is important if the provided path contains a tilde (~)
17 | get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
18 | 
19 | file(DOWNLOAD
20 |      https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
21 |      ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
22 | )
23 | 
24 | include(${CPM_DOWNLOAD_LOCATION})
25 | 


--------------------------------------------------------------------------------
/cmake/FindLibDL.cmake:
--------------------------------------------------------------------------------
 1 | # - Find libdl
 2 | # Find the native LIBDL includes and library
 3 | #
 4 | #  LIBDL_INCLUDE_DIR - where to find dlfcn.h, etc.
 5 | #  LIBDL_LIBRARIES   - List of libraries when using libdl.
 6 | #  LIBDL_FOUND       - True if libdl found.
 7 | 
 8 | 
 9 | IF (LIBDL_INCLUDE_DIR)
10 |   # Already in cache, be silent
11 |   SET(LIBDL_FIND_QUIETLY TRUE)
12 | ENDIF (LIBDL_INCLUDE_DIR)
13 | 
14 | FIND_PATH(LIBDL_INCLUDE_DIR dlfcn.h)
15 | 
16 | SET(LIBDL_NAMES dl libdl ltdl libltdl)
17 | FIND_LIBRARY(LIBDL_LIBRARY NAMES ${LIBDL_NAMES} )
18 | 
19 | # handle the QUIETLY and REQUIRED arguments and set LIBDL_FOUND to TRUE if 
20 | # all listed variables are TRUE
21 | INCLUDE(FindPackageHandleStandardArgs)
22 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibDL DEFAULT_MSG LIBDL_LIBRARY LIBDL_INCLUDE_DIR)
23 | 
24 | IF(LIBDL_FOUND)
25 |   SET( LIBDL_LIBRARIES ${LIBDL_LIBRARY} )
26 | ELSE(LIBDL_FOUND)
27 |   SET( LIBDL_LIBRARIES )
28 | ENDIF(LIBDL_FOUND)
29 | 
30 | MARK_AS_ADVANCED( LIBDL_LIBRARY LIBDL_INCLUDE_DIR )
31 | 


--------------------------------------------------------------------------------
/cmake/find_target_cuda_dependencies.cmake:
--------------------------------------------------------------------------------
1 | # CUDA Specific CMake
2 | 
3 | enable_language(CUDA)
4 | 
5 | find_dependency(CUDAToolkit REQUIRED)
6 | 
7 | 


--------------------------------------------------------------------------------
/cmake/find_target_hip_dependencies.cmake:
--------------------------------------------------------------------------------
 1 | # HIP Specific CMake
 2 | enable_language(HIP)
 3 | 
 4 | if (NOT DEFINED ROCM_PATH )
 5 |   if (NOT DEFINED ENV{ROCM_PATH} )
 6 |     set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm path")
 7 |   else()
 8 |     set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "ROCm path")
 9 |   endif()
10 | endif()
11 | 
12 | set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake" ${CMAKE_MODULE_PATH})
13 | find_dependency(HIP REQUIRED)
14 | find_dependency(hipfft REQUIRED)
15 | find_dependency(hiprand REQUIRED)
16 | find_dependency(rocrand REQUIRED)
17 | find_dependency(hipblas REQUIRED)
18 | find_dependency(rocblas REQUIRED)
19 | find_dependency(hipcub REQUIRED)
20 | find_dependency(rocprim REQUIRED)
21 | 


--------------------------------------------------------------------------------
/doc/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # add doxygen add doxygen documentation note that cmake 3.9 introduced a nicer way to do this but we don't want to
 2 | # require cmake 3.9 by default yet
 3 | 
 4 | option(QUDA_GENERATE_DOXYGEN "generate doxygen documentation")
 5 | 
 6 | if(QUDA_GENERATE_DOXYGEN)
 7 |   find_package(Doxygen)
 8 | 
 9 |   if(DOXYGEN_FOUND)
10 |     if(DOXYGEN_DOT_FOUND)
11 |       get_filename_component(DOXYGEN_DOT_PATH ${DOXYGEN_DOT_EXECUTABLE} DIRECTORY)
12 |     endif()
13 |     set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile)
14 |     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${DOXYGEN_OUT} @ONLY)
15 | 
16 |     add_custom_target(
17 |       doc
18 |       COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}
19 |       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
20 |       COMMENT "Generating doxygen documentation"
21 |       VERBATIM)
22 |   endif()
23 | endif()
24 | 


--------------------------------------------------------------------------------
/include/domain_wall_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   enum class Dslash5Type {
 7 |     DSLASH5_DWF,
 8 |     DSLASH5_MOBIUS_PRE,
 9 |     DSLASH5_MOBIUS,
10 |     M5_INV_DWF,
11 |     M5_INV_MOBIUS,
12 |     M5_INV_MOBIUS_M5_PRE,     // M5inv + M5pre
13 |     M5_PRE_MOBIUS_M5_INV,     // M5pre + M5inv
14 |     M5_INV_MOBIUS_M5_INV_DAG, // M5pre + M5inv
15 |     DSLASH5_MOBIUS_PRE_M5_MOB,
16 |     M5_INV_ZMOBIUS,
17 |     M5_EOFA,
18 |     M5INV_EOFA
19 |   };
20 | 
21 |   /**
22 |     Applying the following five kernels in the order of 4-0-1-2-3 is equivalent to applying
23 |     the full even-odd preconditioned symmetric MdagM operator:
24 |     op = (1 - M5inv * D4 * D5pre * M5inv * D4 * D5pre)^dag
25 |         * (1 - M5inv * D4 * D5pre * M5inv * D4 * D5pre)
26 |   */
27 |   enum class MdwfFusedDslashType {
28 |     D4_D5INV_D5PRE,
29 |     D4_D5INV_D5INVDAG,
30 |     D4DAG_D5PREDAG_D5INVDAG,
31 |     D4DAG_D5PREDAG,
32 |     D5PRE,
33 |   };
34 | 
35 | } // namespace quda
36 | 


--------------------------------------------------------------------------------
/include/eigen_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef OPENBLAS_LIB
 4 | #define EIGEN_USE_LAPACKE
 5 | #define EIGEN_USE_BLAS
 6 | #endif
 7 | 
 8 | #include <math.h>
 9 | 
10 | // hide annoying warning
11 | #if !defined(__clang__) && !defined(_NVHPC_CUDA)
12 | #pragma GCC diagnostic push
13 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
14 | #endif
15 | 
16 | #include <Eigen/Eigenvalues>
17 | #include <Eigen/Dense>
18 | #include <Eigen/LU>
19 | 
20 | #if !defined(__clang__) && !defined(_NVHPC_CUDA)
21 | #pragma GCC diagnostic pop
22 | #endif
23 | 
24 | using namespace Eigen;
25 | 


--------------------------------------------------------------------------------
/include/externals/.clang-format:
--------------------------------------------------------------------------------
1 | DisableFormat: true
2 | SortIncludes: Never
3 | 
4 | 


--------------------------------------------------------------------------------
/include/gauge_update_quda.h:
--------------------------------------------------------------------------------
 1 | #ifndef _GAUGE_UPDATE_QUDA_H_
 2 | #define _GAUGE_UPDATE_QUDA_H_
 3 | 
 4 | namespace quda {
 5 | 
 6 |   /**
 7 |      Evolve the gauge field by step size dt using the momentuim field
 8 |      @param out Updated gauge field
 9 |      @param dt Step size 
10 |      @param in Input gauge field
11 |      @param mom Momentum field
12 |      @param conj_mom Whether we conjugate the momentum in the exponential
13 |      @param exact Calculate exact exponential or use an expansion
14 |    */
15 |   void updateGaugeField(GaugeField &out, double dt, const GaugeField& in, 
16 | 			const GaugeField& mom, bool conj_mom, bool exact);
17 | 
18 | } // namespace quda
19 | 
20 | #endif // _GAUGE_UPDATE_QUDA_H_
21 | 


--------------------------------------------------------------------------------
/include/hw_quda.h:
--------------------------------------------------------------------------------
 1 | #ifndef _HW_QUDA_H
 2 | #define _HW_QUDA_H
 3 | 
 4 | #include <enum_quda.h>
 5 | #include <quda_internal.h>
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 |   FullHw createHwQuda(int* X, QudaPrecision precision);
12 |   void loadHwToGPU(FullHw ret, void* hw, QudaPrecision cpu_prec);  
13 |   void freeHwQuda(FullHw hw);
14 |     
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | 
19 | #endif // _HW_QUDA_H
20 | 


--------------------------------------------------------------------------------
/include/int_factor_array.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <array.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   /**
 9 |    * @brief compute number of factors of an integer
10 |    *
11 |    */
12 |   template <unsigned int Int> constexpr unsigned int numFactors() noexcept
13 |   {
14 |     unsigned int i = 0;
15 |     for (unsigned int j = 1u; j <= Int; j++) {
16 |       if (Int % j == 0) { i++; }
17 |     }
18 |     return i;
19 |   }
20 | 
21 |   /**
22 |    * @brief A struct containing a compile time generated array
23 |    * containing factors of an integer.
24 |    */
25 |   template <unsigned int Int, unsigned int Multiple> struct IntFactorArray {
26 | 
27 |     array<unsigned int, numFactors<Int>()> data_;
28 | 
29 |     constexpr IntFactorArray() : data_()
30 |     {
31 |       static_assert(Int > 0, "Int has to be > 0");
32 |       for (unsigned int i = 0, j = 1; j <= Int; j++) {
33 |         if (Int % j == 0) {
34 |           data_[i] = j;
35 |           i++;
36 |         }
37 |       }
38 |     }
39 | 
40 |     /**
41 |      * @brief returns the size of the array
42 |      */
43 |     constexpr unsigned int size() const noexcept { return numFactors<Int>(); }
44 | 
45 |     /**
46 |      * @brief read only constant index operator[]
47 |      * @param i the index to look up
48 |      */
49 |     constexpr unsigned int operator[](int i) const noexcept { return Multiple * data_[i]; }
50 | 
51 |     constexpr unsigned int get_index(unsigned int value) const noexcept
52 |     {
53 |       unsigned int i = 0;
54 |       for (; i < numFactors<Int>(); i++) {
55 |         if (Multiple * data_[i] == static_cast<unsigned int>(value)) { return i; }
56 |       }
57 |       return i;
58 |     }
59 | 
60 |   }; // end struct
61 | 
62 | } // namespace quda
63 | 


--------------------------------------------------------------------------------
/include/int_list.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   /**
 7 |     @brief This is a dummy struct that wraps around a list of integers
 8 |    */
 9 |   template <int... Ints> struct IntList {
10 |   };
11 | 
12 | } // namespace quda
13 | 


--------------------------------------------------------------------------------
/include/json_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "externals/json.hpp"
 4 | using json = nlohmann::json;
 5 | 
 6 | void to_json(json &j, const dim3 &p) { j = json {{"x", p.x}, {"y", p.y}, {"z", p.z}}; }
 7 | 
 8 | void from_json(const json &j, dim3 &p)
 9 | {
10 |   j.at("x").get_to(p.x);
11 |   j.at("y").get_to(p.y);
12 |   j.at("z").get_to(p.z);
13 | }
14 | 
15 | void to_json(json &j, const int4 &p) { j = json {{"x", p.x}, {"y", p.y}, {"z", p.z}, {"w", p.w}}; }
16 | 
17 | void from_json(const json &j, int4 &p)
18 | {
19 |   j.at("x").get_to(p.x);
20 |   j.at("y").get_to(p.y);
21 |   j.at("z").get_to(p.z);
22 |   j.at("w").get_to(p.w);
23 | }
24 | 


--------------------------------------------------------------------------------
/include/kernels/copy_color_spinor_mg.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <color_spinor_field_order.h>
 4 | #include <kernel.h>
 5 | 
 6 | namespace quda {
 7 | 
 8 |   using namespace colorspinor;
 9 | 
10 |   template <int nSpin_, int nColor_, typename OutOrder, typename InOrder>
11 |   struct CopyArg : kernel_param<> {
12 |     static constexpr int nSpin = nSpin_;
13 |     static constexpr int nColor = nColor_;
14 |     OutOrder out;
15 |     const InOrder in;
16 | 
17 |     template <typename T1, typename T2>
18 |     CopyArg(ColorSpinorField &out, const ColorSpinorField &in, T1 *Out, T2 *In) :
19 |       kernel_param(dim3(in.VolumeCB(), nSpin, nColor)), out(out, 1, Out), in(in, 1, In)
20 |     {}
21 |   };
22 | 
23 |   template <typename Arg> struct CopySpinor_ {
24 |     const Arg &arg;
25 |     constexpr CopySpinor_(const Arg &arg) : arg(arg) {}
26 |     static constexpr const char *filename() { return KERNEL_FILE; }
27 | 
28 |     __device__ __host__ inline void operator()(int x_cb, int s, int c)
29 |     {
30 |       arg.out(0, x_cb, s, c) = arg.in(0, x_cb, s, c);
31 |     }
32 |   };
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/include/kernels/device_vector_axpby.cuh:
--------------------------------------------------------------------------------
 1 | #include <tunable_nd.h>
 2 | 
 3 | /**
 4 |   @file The following contains the argument and kernel for applying axpby to device vectors.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <class T> struct AxpbyArg : kernel_param<> {
11 |     T *out;
12 |     T a;
13 |     const T *x;
14 |     T b;
15 |     const T *y;
16 | 
17 |     AxpbyArg(T *out, T a, const T *x, T b, const T *y, int size) : kernel_param(size), out(out), a(a), x(x), b(b), y(y)
18 |     {
19 |     }
20 |   };
21 | 
22 |   template <class Arg> struct Axpby {
23 |     const Arg &arg;
24 |     constexpr Axpby(const Arg &arg) : arg(arg) { }
25 |     static constexpr const char *filename() { return KERNEL_FILE; }
26 | 
27 |     __device__ __host__ inline void operator()(int thread_idx)
28 |     {
29 |       arg.out[thread_idx] += arg.a * arg.x[thread_idx] + arg.b * arg.y[thread_idx];
30 |     }
31 |   };
32 | 
33 | } // namespace quda
34 | 


--------------------------------------------------------------------------------
/include/kernels/random_init.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <random_helper.h>
 4 | #include <lattice_field.h>
 5 | #include <index_helper.cuh>
 6 | #include <comm_quda.h>
 7 | #include <kernel.h>
 8 | 
 9 | namespace quda {
10 | 
11 |   struct rngArg : kernel_param<> {
12 |     int commCoord[QUDA_MAX_DIM];
13 |     int X[QUDA_MAX_DIM];
14 |     uint64_t X_global[QUDA_MAX_DIM];
15 |     RNGState *state;
16 |     unsigned long long seed;
17 |     rngArg(RNGState *state, unsigned long long seed, const LatticeField &meta) :
18 |       kernel_param(dim3(meta.LocalVolumeCB(), meta.SiteSubset(), 1)),
19 |       state(state),
20 |       seed(seed)
21 |     {
22 |       for (int i=0; i<4; i++) {
23 |         commCoord[i] = comm_coord(i);
24 |         X[i] = meta.LocalX()[i];
25 |         X_global[i] = X[i] * comm_dim(i);
26 |       }
27 |     }
28 |   };
29 | 
30 |   /**
31 |      @brief functor to initialize the RNG states
32 |      @param state RNG state array
33 |      @param seed initial seed for RNG
34 |      @param size size of the RNG state array
35 |      @param arg Metadata needed for computing multi-gpu offsets
36 |   */
37 |   template <typename Arg>
38 |   struct init_random {
39 |     const Arg &arg;
40 |     __device__ constexpr init_random(const Arg &arg) : arg(arg) {}
41 |     static constexpr const char *filename() { return KERNEL_FILE; }
42 | 
43 |     __device__ inline void operator()(int id, int parity)
44 |     {
45 |       // Each thread gets same seed, a different sequence number, no offset
46 |       int x[4];
47 |       getCoords(x, id, arg.X, parity);
48 |       for (int i = 0; i < 4; i++) x[i] += arg.commCoord[i] * arg.X[i];
49 |       auto idd = (((x[3] * arg.X_global[2] + x[2]) * arg.X_global[1]) + x[1]) * arg.X_global[0] + x[0];
50 |       random_init(arg.seed, idd, 0, arg.state[parity * arg.threads.x + id]);
51 |     }
52 |   };
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/include/kernels/reduce_init.cuh:
--------------------------------------------------------------------------------
 1 | #include <reduce_helper.h>
 2 | #include <kernel.h>
 3 | 
 4 | namespace quda {
 5 | 
 6 |   namespace reducer {
 7 | 
 8 |     template <typename T_> struct init_arg : kernel_param<> {
 9 |       using T = T_;
10 |       T *count;
11 |       init_arg(T *count, int n_reduce) :
12 |         kernel_param(dim3(n_reduce, 1, 1)),
13 |         count(count) { }
14 |     };
15 | 
16 |     template <typename Arg> struct init_count {
17 |       const Arg &arg;
18 |       static constexpr const char *filename() { return KERNEL_FILE; }
19 |       constexpr init_count(const Arg &arg) : arg(arg) {}
20 |       __device__ void operator()(int i) { new (arg.count + i) typename Arg::T {0}; }
21 |     };
22 | 
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/include/kernels/spin_duplicate.cuh:
--------------------------------------------------------------------------------
 1 | #include <color_spinor_field_order.h>
 2 | #include <kernel.h>
 3 | 
 4 | namespace quda {
 5 | 
 6 |   using namespace colorspinor;
 7 | 
 8 |   template <typename store_t, int nSpin_, int nColor_>
 9 |   struct SpinorDuplicateArg : kernel_param<> {
10 |     using real = typename mapper<store_t>::type;
11 |     static constexpr int nSpin = nSpin_;
12 |     static constexpr int nColor = nColor_;
13 |     using V = typename colorspinor_mapper<store_t, nSpin, nColor, false, false, true>::type;
14 |     V v[nSpin];
15 |     V src;
16 | 
17 |     /**
18 |        @brief Constructor for the duplication arg
19 |        @param v The spin duplicated set
20 |        @param src The source vector we are duplicating
21 |      */
22 |     SpinorDuplicateArg(cvector_ref<ColorSpinorField> &v, const ColorSpinorField &src) :
23 |       kernel_param(dim3(src.VolumeCB(), src.SiteSubset(), 1)),
24 |       src(src)
25 |     {
26 |       for (auto i = 0u; i < v.size(); i++) this->v[i] = V(v[i]);
27 |     }
28 |   };
29 | 
30 |   /**
31 |      Functor for spin duplicating the src vector
32 |    */
33 |   template <typename Arg> struct DuplicateSpinor {
34 |     const Arg &arg;
35 |     constexpr DuplicateSpinor(const Arg &arg) : arg(arg) {}
36 |     static constexpr const char* filename() { return KERNEL_FILE; }
37 | 
38 |     __device__ __host__ void operator()(int x_cb, int parity)
39 |     {
40 |       using vector = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin>;
41 |       vector src = arg.src(x_cb, parity);
42 | 
43 |       for (int i = 0; i < Arg::nSpin; i++) {
44 |         vector v;
45 | 
46 |         for (int s = 0; s < Arg::nSpin; s++) {
47 |           for (int c = 0; c < Arg::nColor; c++) {
48 |             v(s, c) = src(i, c);
49 |           }
50 |         }
51 | 
52 |         arg.v[i](x_cb, parity) = v;
53 |       }
54 |     }
55 | 
56 |   };
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/include/kernels/spinor_reweight.cuh:
--------------------------------------------------------------------------------
 1 | #include <math_helper.cuh>
 2 | #include <color_spinor_field_order.h>
 3 | #include <index_helper.cuh>
 4 | #include <kernel.h>
 5 | 
 6 | namespace quda
 7 | {
 8 | 
 9 |   using namespace colorspinor;
10 | 
11 |   template <typename store_t, int nSpin_, int nColor_> struct SpinorDistanceReweightArg : kernel_param<> {
12 |     using real = typename mapper<store_t>::type;
13 |     static constexpr int nSpin = nSpin_;
14 |     static constexpr int nColor = nColor_;
15 |     using V = typename colorspinor_mapper<store_t, nSpin, nColor>::type;
16 | 
17 |     int X[4];
18 |     V v;
19 |     real alpha0;
20 |     int t0;
21 |     SpinorDistanceReweightArg(ColorSpinorField &v, real alpha0, int t0) :
22 |       kernel_param(dim3(v.VolumeCB(), v.SiteSubset(), 1)), v(v), alpha0(alpha0), t0(t0)
23 |     {
24 |       for (int dir = 0; dir < 4; dir++) X[dir] = v.X()[dir];
25 |       X[0] *= (v.SiteSubset() == 1) ? 2 : 1; // need full lattice dims
26 |     }
27 |   };
28 | 
29 |   template <typename Arg> __device__ __host__ inline auto distanceWeight(const Arg &arg, int t, int nt)
30 |   {
31 |     using real = typename Arg::real;
32 |     if (arg.alpha0 > 0) {
33 |       return cosh(arg.alpha0 * real((t - arg.t0 + nt) % nt - nt / 2));
34 |     } else {
35 |       return 1 / cosh(arg.alpha0 * real((t - arg.t0 + nt) % nt - nt / 2));
36 |     }
37 |   }
38 | 
39 |   template <typename Arg> struct DistanceReweightSpinor {
40 |     const Arg &arg;
41 |     constexpr DistanceReweightSpinor(const Arg &arg) : arg(arg) { }
42 |     static constexpr const char *filename() { return KERNEL_FILE; }
43 | 
44 |     __device__ __host__ void operator()(int x_cb, int parity)
45 |     {
46 |       using Vector = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin>;
47 |       int x[4];
48 |       getCoords(x, x_cb, arg.X, parity);
49 |       Vector tmp = arg.v(x_cb, parity);
50 |       tmp *= distanceWeight(arg, arg.comms_coord[3] * arg.X[3] + x[3], arg.comms_dim[3] * arg.X[3]);
51 |       arg.v(x_cb, parity) = tmp;
52 |     }
53 |   };
54 | 
55 | } // namespace quda
56 | 


--------------------------------------------------------------------------------
/include/kernels/transform_reduce.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <limits>
 4 | #include <algorithm>
 5 | #include <reduction_kernel.h>
 6 | 
 7 | namespace quda {
 8 | 
 9 |   template <typename reducer_, typename T, typename count_t, typename transformer, typename mapper>
10 |   struct TransformReduceArg : public ReduceArg<typename reducer_::reduce_t> {
11 |     using reducer = reducer_;
12 |     using reduce_t = typename reducer::reduce_t;
13 |     static constexpr int n_batch_max = 8;
14 |     const T *v[n_batch_max];
15 |     count_t n_items;
16 |     int n_batch;
17 |     transformer h;
18 |     mapper m;
19 | 
20 |     TransformReduceArg(const std::vector<T *> &v, count_t n_items, transformer h, mapper m) :
21 |       ReduceArg<reduce_t>(dim3(n_items, 1, v.size()), v.size()),
22 |       n_items(n_items),
23 |       n_batch(v.size()),
24 |       h(h),
25 |       m(m)
26 |     {
27 |       if (n_batch > n_batch_max) errorQuda("Requested batch %d greater than max supported %d", n_batch, n_batch_max);
28 |       if (n_items > std::numeric_limits<count_t>::max())
29 |         errorQuda("Requested size %lu greater than max supported %lu",
30 |                   (uint64_t)n_items, (uint64_t)std::numeric_limits<count_t>::max());
31 |       std::copy(v.begin(), v.end(), this->v);
32 |     }
33 |   };
34 | 
35 |   template <typename Arg> struct transform_reducer : Arg::reducer {
36 |     using reduce_t = typename Arg::reduce_t;
37 |     using Arg::reducer::operator();
38 |     static constexpr int reduce_block_dim = 1;
39 |     using count_t = decltype(Arg::n_items);
40 | 
41 |     const Arg &arg;
42 |     static constexpr const char *filename() { return KERNEL_FILE; }
43 |     constexpr transform_reducer(const Arg &arg) : arg(arg) {}
44 | 
45 |     __device__ __host__ inline reduce_t operator()(reduce_t &value, count_t i, int, int j)
46 |     {
47 |       auto k = arg.m(i);
48 |       auto v = arg.v[j];
49 |       auto t = arg.h(v[k]);
50 |       return operator()(t, value);
51 |     }
52 |   };
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/include/ks_force_quda.h:
--------------------------------------------------------------------------------
 1 | #ifndef __KS_FORCE_QUDA_H__
 2 | #define __KS_FORCE_QUDA_H__
 3 | 
 4 | #include <gauge_field.h>
 5 | 
 6 | 
 7 | namespace quda {
 8 | 
 9 | void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops = NULL);
10 | 
11 | } // namespace quda
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/include/ks_qsmear.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <quda_internal.h>
 4 | #include <gauge_field.h>
 5 | 
 6 | namespace quda {
 7 | 
 8 |   /**
 9 |      @brief Compute the 2-link field for the smearing operation
10 |      @param[out] newTwoLink The computed 2-link output
11 |      @param[in] link Thin-link gauge field
12 |   */
13 |   void computeTwoLink(GaugeField &newTwoLink, const GaugeField &link);
14 | 
15 | 
16 | }  // namespace quda
17 | 


--------------------------------------------------------------------------------
/include/layout_hyper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // for QIO_HAS_EXTENDED_LAYOUT, QIO_Index
 4 | #include <qio.h>
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | /* These routines get a quda_* prefix to avoid
10 |    potential linker conflicts, with MILC */
11 | int quda_setup_layout(int len[], int nd, int numnodes, int single_parity);
12 | extern int quda_this_node;
13 | 
14 | #ifdef QIO_HAS_EXTENDED_LAYOUT
15 | int quda_node_number_ext(const int x[], void *arg);
16 | QIO_Index quda_node_index_ext(const int x[], void *arg);
17 | void quda_get_coords_ext(int x[], int node, QIO_Index index, void *arg);
18 | QIO_Index quda_num_sites_ext(int node, void *arg);
19 | #else
20 | int quda_node_number(const int x[]);
21 | int quda_node_index(const int x[]);
22 | void quda_get_coords(int x[], int node, int index);
23 | int quda_num_sites(int node);
24 | #endif
25 | 
26 | #ifdef __cplusplus
27 | }
28 | #endif
29 | 


--------------------------------------------------------------------------------
/include/llfat_quda.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "quda.h"
 4 | #include "quda_internal.h"
 5 | 
 6 | namespace quda {
 7 | 
 8 |   /**
 9 |      @brief Compute the fat links for an improved staggered (Kogut-Susskind) fermions.
10 |      @param fat[out] The computed fat link
11 |      @param u[in] The input gauge field
12 |      @param coeff[in] Array of path coefficients
13 |   */
14 |   void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff);
15 | 
16 |   /**
17 |      @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions.
18 |      @param lng[out] The computed long link (only computed if lng!=0)
19 |      @param u[in] The input gauge field
20 |      @param coeff[in] Array of path coefficients
21 |   */
22 |   void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff);
23 | 
24 | } // namespace quda
25 | 


--------------------------------------------------------------------------------
/include/madwf_param.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda
 4 | {
 5 |   /**
 6 |     @brief Parameter structure for holding the various MADWF parameters.
 7 |   */
 8 |   struct MadwfParam {
 9 | 
10 |     /** The diagonal constant to suppress the low modes when performing 5D transfer */
11 |     double madwf_diagonal_suppressor;
12 | 
13 |     /** The target MADWF Ls to be used in the accelerator */
14 |     int madwf_ls;
15 | 
16 |     /** The minimum number of iterations after which to generate the null vectors for MADWF */
17 |     int madwf_null_miniter;
18 | 
19 |     /** The maximum tolerance after which to generate the null vectors for MADWF */
20 |     double madwf_null_tol;
21 | 
22 |     /** The maximum number of iterations for the training iterations */
23 |     int madwf_train_maxiter;
24 | 
25 |     /** Whether to load the MADWF parameters from the file system */
26 |     bool madwf_param_load;
27 | 
28 |     /** Whether to save the MADWF parameters to the file system */
29 |     bool madwf_param_save;
30 | 
31 |     /** Path to load from the file system */
32 |     std::string madwf_param_infile;
33 | 
34 |     /** Path to save to the file system */
35 |     std::string madwf_param_outfile;
36 |   };
37 | 
38 | } // namespace quda
39 | 


--------------------------------------------------------------------------------
/include/matrix_field.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | /**
 4 |  * @field matrix_accessor.h
 5 |  * @brief Simple accessor used for matrix fields, e.g., each lattice
 6 |  * site consists of an n x n matrix
 7 |  */
 8 | 
 9 | #include <aos.h>
10 | #include <quda_matrix.h>
11 | 
12 | namespace quda
13 | {
14 | 
15 |   template <typename T, int n> struct matrix_field {
16 |     T *field;
17 |     int volume_cb;
18 | 
19 |     matrix_field(T *field, int volume_cb) : field(field), volume_cb(volume_cb) {}
20 | 
21 |     __device__ __host__ inline void load(Matrix<T, n> &A, int x_cb, int parity) const
22 |     {
23 |       int idx = parity * volume_cb + x_cb;
24 |       block_load(A, reinterpret_cast<const Matrix<T, n> *>(field) + idx);
25 |     }
26 | 
27 |     __device__ __host__ inline void save(const Matrix<T, n> &A, int x_cb, int parity) const
28 |     {
29 |       int idx = parity * volume_cb + x_cb;
30 |       block_store(reinterpret_cast<Matrix<T, n> *>(field) + idx, A);
31 |     }
32 |   };
33 | 
34 | } // namespace quda
35 | 


--------------------------------------------------------------------------------
/include/momentum.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <gauge_field.h>
 3 | 
 4 | namespace quda {
 5 | 
 6 |   /**
 7 |      @brief Compute and return global the momentum action 1/2 mom^2
 8 |      @param mom Momentum field
 9 |      @return Momentum action contribution
10 |    */
11 |   double computeMomAction(const GaugeField &mom);
12 | 
13 |   /**
14 |      Update the momentum field from the force field
15 | 
16 |      mom = mom - coeff * [force]_TA
17 | 
18 |      where [A]_TA means the traceless anti-hermitian projection of A
19 | 
20 |      @param mom Momentum field
21 |      @param coeff Integration stepsize
22 |      @param force Force field
23 |      @param func The function calling this (fname will be printed if force monitoring is enabled)
24 |    */
25 |   void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname);
26 | 
27 |   /**
28 |      Left multiply the force field by the gauge field
29 | 
30 |      force = U * force
31 | 
32 |      @param force Force field
33 |      @param U Gauge field
34 |    */
35 |   void applyU(GaugeField &force, GaugeField &U);
36 | 
37 |   /**
38 |      @brief Whether we are monitoring the force or not
39 |      @return Boolean whether we are monitoring the force
40 |   */
41 |   bool forceMonitor();
42 | 
43 |   /**
44 |      @brief Flush any outstanding force monitoring information
45 |   */
46 |   void flushForceMonitor();
47 | 
48 | } // namespace quda
49 | 


--------------------------------------------------------------------------------
/include/monitor.h:
--------------------------------------------------------------------------------
 1 | #include "device.h"
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   namespace monitor
 7 |   {
 8 | 
 9 |     /**
10 |        @brief Initialize device monitoring if supported.  On CUDA this
11 |        uses NVML-based monitoring.
12 |     */
13 |     void init();
14 | 
15 |     /**
16 |        @brief Tear down any state associated with device monitoring
17 |     */
18 |     void destroy();
19 | 
20 |     /**
21 |        @brief Serlialize the monitor state history to disk.  If
22 |        QUDA_RESOURCE_PATH is not defined then no action is taken
23 |     */
24 |     void serialize();
25 | 
26 |     /**
27 |        @brief Get the current size of the monitor state.  Used for
28 |        bookending a period for later analysis.
29 |     */
30 |     size_t size();
31 | 
32 |     struct state_t {
33 |       double energy = 0.0;
34 |       double power = 0.0;
35 |       double temp = 0.0;
36 |       double clock = 0.0;
37 |     };
38 | 
39 |     /**
40 |        @brief Get the mean state observables between start and end, where
41 |        start and end are two intervals of history in the state.
42 |     */
43 |     state_t mean(size_t start, size_t end);
44 | 
45 |   } // namespace monitor
46 | 
47 | } // namespace quda
48 | 


--------------------------------------------------------------------------------
/include/mpi_comm_handle.h:
--------------------------------------------------------------------------------
 1 | #ifndef _COMM_HANDLE_H
 2 | #define _COMM_HANDLE_H
 3 | 
 4 | #if defined(QMP_COMMS) || defined(MPI_COMMS)
 5 | #include <mpi.h>
 6 | namespace quda {
 7 |   MPI_Comm get_mpi_handle();
 8 | }
 9 | #endif
10 | 
11 | #ifdef QMP_COMMS
12 | #include <qmp.h>
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif
17 | 
18 | QMP_status_t QMP_get_mpi_comm(QMP_comm_t comm, void **mpicomm);
19 | 
20 | #ifdef __cplusplus
21 | }
22 | #endif
23 | 
24 | #endif
25 | 
26 | #endif /* _COMM_HANDLE_H */
27 | 


--------------------------------------------------------------------------------
/include/multigrid_helper.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda {
 4 | 
 5 |   /**
 6 |      Helper struct for dealing with spin coarsening.  This helper
 7 |      should work with all types of fermions.
 8 |    */
 9 |   template <int fineSpin, int coarseSpin>
10 |   struct spin_mapper {
11 |     // fineSpin == 1, coarseSpin == 2 identifies staggered fine -> coarse w/ spin.
12 |     static constexpr int spin_block_size = (fineSpin == 1 && coarseSpin == 2) ? 0 : fineSpin / coarseSpin;
13 | 
14 |     static constexpr int get_spin_block_factor() { return (spin_block_size == 0) ? 1 : spin_block_size; }
15 | 
16 |     /**
17 |        Return the coarse spin coordinate from the fine spin coordinate
18 |        @param s Fine spin coordinate
19 |        @param parity fine parity, for staggered
20 |        @return Coarse spin coordinate
21 |      */
22 |     constexpr int operator()(int s, int parity) const
23 |     {
24 |       return (spin_block_size == 0) ? parity : s / spin_block_size;
25 |     }
26 |   };
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/include/numa_affinity.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | 
 4 | /**
 5 |  * sets the cpu affinity of the calling process to the affinity mask reported by nvidia-smi topo
 6 |  * Note that older driver versions might pin all mpi ranks to the same single conre instead of a range
 7 |  * @param  deviceid gpu to determine affinity for
 8 |  * @return          0 if numa affinity was set
 9 |  */
10 | int setNumaAffinityNVML(int deviceid);
11 | 


--------------------------------------------------------------------------------
/include/object.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |    @file object.h
 3 | 
 4 |    @section DESCRIPTION
 5 | 
 6 |    Abstract parent class for all classes in QUDA.  This parent class
 7 |    defines the new/delete methods to use QUDA's memory allocators.
 8 |    This gives us memory leak checking on these object instances.
 9 | */
10 | 
11 | #pragma once
12 | 
13 | #include <malloc_quda.h>
14 | 
15 | namespace quda {
16 | 
17 |   struct Object {
18 | 
19 |     Object() { }
20 |     virtual ~Object() { }
21 | 
22 |     void *operator new(std::size_t size) { return safe_malloc(size); }
23 | 
24 |     void operator delete(void *p) { host_free(p); }
25 | 
26 |     void *operator new[](std::size_t size) { return safe_malloc(size); }
27 | 
28 |     void operator delete[](void *p) { host_free(p); }
29 |   };
30 | 
31 | } // namespace quda
32 | 


--------------------------------------------------------------------------------
/include/power_of_two_array.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <array.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   /**
 9 |    * @brief compute number of elements of an array containing powers
10 |    * of 2 starting at a minumum up to and including a maximum
11 |    *
12 |    */
13 |   template <unsigned int Min, unsigned int Max> constexpr unsigned int numElements() noexcept
14 |   {
15 |     unsigned int i = 0;
16 |     for (auto j = Min; j <= Max; j *= 2) i++;
17 |     return i;
18 |   }
19 | 
20 |   /**
21 |    * @brief A struct containing a compile time generated array
22 |    * containing powers of 2 starting at Min up to and includeing Max
23 |    * with thanks to StackOverflow:
24 |    *   https://stackoverflow.com/questions/19019252/create-n-element-constexpr-array-in-c11
25 |    */
26 |   template <unsigned int Min, unsigned int Max> struct PowerOfTwoArray {
27 | 
28 |     array<unsigned int, numElements<Min, Max>()> data_;
29 | 
30 |     constexpr PowerOfTwoArray() : data_()
31 |     {
32 |       static_assert(Min <= Max, "Min has to be <= Max");
33 |       for (unsigned int i = 0, j = Min; j <= Max; j *= 2, i++) data_[i] = j;
34 |     }
35 | 
36 |     /**
37 |      * @brief returns the size of the array
38 |      */
39 |     constexpr unsigned int size() const noexcept { return numElements<Min, Max>(); }
40 | 
41 |     /**
42 |      * @brief read only constant index operator[]
43 |      * @param i the index to look up
44 |      */
45 |     constexpr unsigned int operator[](int i) const noexcept { return data_[i]; }
46 | 
47 |   }; // end struct
48 | 
49 | } // namespace quda
50 | 


--------------------------------------------------------------------------------
/include/qio_field.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef HAVE_QIO
 4 | void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X,
 5 | 		      int argc, char *argv[]);
 6 | void write_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[]);
 7 | void read_spinor_field(const char *filename, void *V[], QudaPrecision precision, const int *X, QudaSiteSubset subset,
 8 |                        QudaParity parity, int nColor, int nSpin, int Nvec, int argc, char *argv[]);
 9 | void write_spinor_field(const char *filename, const void *V[], QudaPrecision precision, const int *X,
10 |                         QudaSiteSubset subset, QudaParity parity, int nColor, int nSpin, int Nvec, int argc,
11 |                         char *argv[], bool partfile = false);
12 | #else
13 | inline void read_gauge_field(const char *, void *[], QudaPrecision, const int *, int, char *[])
14 | {
15 |   printf("QIO support has not been enabled\n");
16 |   exit(-1);
17 | }
18 | inline void write_gauge_field(const char *, void *[], QudaPrecision, const int *, int, char *[])
19 | {
20 |   printf("QIO support has not been enabled\n");
21 |   exit(-1);
22 | }
23 | inline void read_spinor_field(const char *, void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity, int, int,
24 |                               int, int, char *[])
25 | {
26 |   printf("QIO support has not been enabled\n");
27 |   exit(-1);
28 | }
29 | inline void write_spinor_field(const char *, const void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity,
30 |                                int, int, int, int, char *[], bool)
31 | {
32 |   printf("QIO support has not been enabled\n");
33 |   exit(-1);
34 | }
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/include/quda_arch.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <quda_define.h>
 3 | 
 4 | #if defined(QUDA_TARGET_CUDA)
 5 | #include <cuda.h>
 6 | #include <cuda_runtime.h>
 7 | 
 8 | #if (__COMPUTE_CAPABILITY__ >= 700) && defined(QUDA_ENABLE_MMA)
 9 | #define QUDA_MMA_AVAILABLE
10 | #endif
11 | 
12 | #elif defined(QUDA_TARGET_HIP)
13 | #include <hip/hip_runtime.h>
14 | 
15 | #elif defined(QUDA_TARGET_SYCL)
16 | #include <targets/sycl/quda_sycl.h>
17 | #endif
18 | 
19 | #ifdef QUDA_OPENMP
20 | #include <omp.h>
21 | #endif
22 | 


--------------------------------------------------------------------------------
/include/quda_constants.h:
--------------------------------------------------------------------------------
 1 | #define QUDA_VERSION_MAJOR     1
 2 | #define QUDA_VERSION_MINOR     1
 3 | #define QUDA_VERSION_SUBMINOR  0
 4 | 
 5 | /**
 6 |  * @def   QUDA_VERSION
 7 |  * @brief This macro is deprecated.  Use QUDA_VERSION_MAJOR, etc., instead.
 8 |  */
 9 | #define QUDA_VERSION ((QUDA_VERSION_MAJOR<<16) | (QUDA_VERSION_MINOR<<8) | QUDA_VERSION_SUBMINOR)
10 | 
11 | 
12 | /**
13 |  * @def   QUDA_MAX_DIM
14 |  * @brief Maximum number of dimensions supported by QUDA.  In practice, no
15 |  *        routines make use of more than 5.
16 |  */
17 | #define QUDA_MAX_DIM 6
18 | 
19 | /**
20 |  * @def   QUDA_MAX_GEOMETRY
21 |  * @brief Maximum geometry supported by a field.  This essentially is
22 |  * the maximum number of dimensions supported per lattice site.
23 |  */
24 | #define QUDA_MAX_GEOMETRY 8
25 | 
26 | /**
27 |  * @def QUDA_MAX_MULTI_SHIFT
28 |  * @brief Maximum number of shifts supported by the multi-shift solver.
29 |  *        This number may be changed if need be.
30 |  */
31 | #define QUDA_MAX_MULTI_SHIFT 32
32 | 
33 | /**
34 |  * @def QUDA_MAX_BLOCK_SRC
35 |  * @brief Maximum number of sources that can be supported by the multi-src solver
36 |  */
37 | #define QUDA_MAX_MULTI_SRC 128
38 | 
39 | /**
40 |  * @def   QUDA_MAX_DWF_LS
41 |  * @brief Maximum length of the Ls dimension for domain-wall fermions
42 |  */
43 | #define QUDA_MAX_DWF_LS 32
44 | 
45 | /**
46 |  * @def QUDA_MAX_MG_LEVEL
47 |  * @brief Maximum number of multi-grid levels.  This number may be
48 |  * increased if needed.
49 |  */
50 | #define QUDA_MAX_MG_LEVEL 5
51 | 


--------------------------------------------------------------------------------
/include/random_quda.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <memory>
 4 | #include <quda_define.h>
 5 | #include <lattice_field.h>
 6 | 
 7 | namespace quda {
 8 | 
 9 |   // The nature of the state is defined in the target-specific implementation
10 |   struct RNGState;
11 | 
12 |   /**
13 |      @brief Class declaration to initialize and hold RNG states
14 |   */
15 |   class RNG
16 |   {
17 | 
18 |     bool is_initialized = false;     /*! @brief whether or not the RNG is initialized */
19 |     size_t size;                     /*! @brief number of rand states */
20 |     std::shared_ptr<RNGState> state; /*! array with current rand rng state */
21 |     RNGState *backup_state;          /*! array for backup of current rand rng state */
22 |     unsigned long long seed;         /*! initial rng seed */
23 | 
24 |   public:
25 |     /*! @brief Default constructor */
26 |     RNG() = default;
27 | 
28 |     /**
29 |        @brief Allocate and initialize RNG states.  Constructor that
30 |        takes its metadata from pre-existing field
31 |        @param[in] meta The field whose data we use
32 |        @param[in] seed Seed to initialize the RNG
33 |     */
34 |     RNG(const LatticeField &meta, unsigned long long seedin);
35 | 
36 |     unsigned long long Seed() { return seed; };
37 | 
38 |     /*! @brief Check if the RNG is initialized */
39 |     bool isInitialized() { return is_initialized; };
40 | 
41 |     /*! @brief Restore rng array states initialization */
42 |     void restore();
43 | 
44 |     /*! @brief Backup rng array states initialization */
45 |     void backup();
46 | 
47 |     /*! @brief Get pointer to RNGState */
48 |     RNGState *State();
49 |   };
50 | }
51 | 


--------------------------------------------------------------------------------
/include/shmem_helper.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | /**
 4 |    @file shmem_helper.cuh
 5 | 
 6 |    @section Description
 7 |    Include this file as opposed to nvshmem headers directly to ensure
 8 |    correct compilation with NVSHMEM
 9 |  */
10 | 
11 | #if defined(NVSHMEM_COMMS)
12 | #include <mpi.h>
13 | #include <nvshmem.h>
14 | #include <nvshmemx.h>
15 | #if defined(__CUDACC__) ||  defined(_NVHPC_CUDA) || (defined(__clang__) && defined(__CUDA__))
16 | // only include if using a CUDA compiler
17 | #include <cuda/atomic>
18 | #endif
19 | #endif
20 | 


--------------------------------------------------------------------------------
/include/spin_taste.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <color_spinor_field.h>
 3 | 
 4 | namespace quda
 5 | {
 6 | 
 7 |   /**
 8 |      @brief Compute the outer-product field between the staggered quark
 9 |      field's one and (for HISQ and ASQTAD) three hop sites.  E.g.,
10 | 
11 |      out[0][d](x) = (in(x+1_d) x conj(in(x)))
12 |      out[1][d](x) = (in(x+3_d) x conj(in(x)))
13 | 
14 |      where 1_d and 3_d represent a relative shift of magnitude 1 and 3 in dimension d, respectively
15 | 
16 |      Note out[1] is only computed if nFace=3
17 | 
18 |      @param[out] out Array of nFace outer-product matrix fields
19 |      @param[in] in Input quark field
20 |      @param[in] coeff Coefficient
21 |      @param[in] nFace Number of faces (1 or 3)
22 |   */
23 |   void applySpinTaste(ColorSpinorField &out, const ColorSpinorField &in, QudaSpinTasteGamma gamma);
24 | 
25 | } // namespace quda
26 | 


--------------------------------------------------------------------------------
/include/staggered_oprod.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <gauge_field.h>
 3 | #include <color_spinor_field.h>
 4 | 
 5 | namespace quda {
 6 | 
 7 |   /**
 8 |      @brief Compute the outer-product field between the staggered quark
 9 |      field's one and (for HISQ and ASQTAD) three hop sites.  E.g.,
10 | 
11 |      out[0][d](x) = (in(x+1_d) x conj(in(x)))
12 |      out[1][d](x) = (in(x+3_d) x conj(in(x)))
13 | 
14 |      where 1_d and 3_d represent a relative shift of magnitude 1 and 3 in dimension d, respectively
15 | 
16 |      Note out[1] is only computed if nFace=3
17 | 
18 |      @param[out] out Array of nFace outer-product matrix fields
19 |      @param[in] in Input quark field
20 |      @param[in] coeff Coefficient
21 |      @param[in] nFace Number of faces (1 or 3)
22 |   */
23 |   void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace);
24 | 
25 | } // namespace quda
26 | 


--------------------------------------------------------------------------------
/include/targets/cuda/constant_kernel_arg.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <util_quda.h>
 4 | #include <quda_api.h>
 5 | #include <target_device.h>
 6 | 
 7 | /**
 8 |    @file constant_kernel_arg.h
 9 | 
10 |    This file should be included in the kernel files for which we wish
11 |    to utilize __constant__ memory for the kernel parameter struct.
12 |    This needs to be included before the definition of the kernel,
13 |    e.g., kernel.h in order for the compiler to do the kernel
14 |    instantiation correctly.
15 |  */
16 | 
17 | #ifndef QUDA_LARGE_KERNEL_ARG
18 | 
19 | // set a preprocessor flag that we have included constant_kernel_arg.h
20 | #define QUDA_USE_CONSTANT_MEMORY
21 | 
22 | namespace quda
23 | {
24 | 
25 |   namespace device
26 |   {
27 | 
28 |     /**
29 |        @brief The __constant__ buffer used for kernel parameters
30 |     */
31 | #if defined(__CUDACC_RDC__) && !defined(QUDA_CONSTANT_DEFINE)
32 |     // rdc is enabled when NVSHMEM is enabled, so we need to make the
33 |     // buffer as extern and define it in one place only
34 |     extern __constant__ char buffer[max_constant_size()];
35 | #else
36 |     __constant__ char buffer[max_constant_size()];
37 | #endif
38 | 
39 |     /**
40 |        @brief Helper function that returns kernel argument from
41 |        __constant__ memory.
42 |      */
43 |     template <typename Arg> constexpr std::enable_if_t<!use_kernel_arg<Arg>(), Arg &> get_arg()
44 |     {
45 |       return reinterpret_cast<Arg &>(buffer);
46 |     }
47 | 
48 |     /**
49 |        @brief Helper function that returns a pointer to the
50 |        __constant__ memory buffer.
51 |      */
52 |     template <typename Arg> constexpr std::enable_if_t<!use_kernel_arg<Arg>(), void *> get_constant_buffer()
53 |     {
54 |       return qudaGetSymbolAddress(buffer);
55 |     }
56 | 
57 |   } // namespace device
58 | 
59 | } // namespace quda
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/include/targets/cuda/externals/generics/ldg.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <generics/detail/alias.h>
 3 | 
 4 | namespace detail {
 5 | 
 6 | template<typename T,
 7 |          typename U=typename working_type<T>::type,
 8 |          int r = aliased_size<T, U>::value>
 9 | struct load_storage {
10 |     typedef array<U, r> result_type;
11 |     static const int idx = aliased_size<T, U>::value - r;
12 |     __device__ __forceinline__
13 |     static result_type impl(const T* ptr) {
14 |         return result_type(__ldg(((const U*)ptr) + idx),
15 |                            load_storage<T, U, r-1>::impl(ptr));
16 |     }
17 | };
18 | 
19 | template<typename T, typename U>
20 | struct load_storage<T, U, 1> {
21 |     typedef array<U, 1> result_type;
22 |     static const int idx = aliased_size<T, U>::value - 1;
23 |     __device__ __forceinline__
24 |     static result_type impl(const T* ptr) {
25 |         return result_type(__ldg(((const U*)ptr) + idx));
26 |     }
27 | };
28 | 
29 | }
30 | 
31 | 
32 | #if __CUDA_ARCH__ >= 350
33 | // Device has ldg
34 | template<typename T>
35 | __device__ __forceinline__ T __ldg(const T* ptr) {
36 |     typedef typename detail::working_array<T>::type aliased;
37 |     aliased storage = detail::load_storage<T>::impl(ptr);
38 |     return detail::fuse<T>(storage);
39 | }
40 | 
41 | #else
42 | //Device does not, fall back.
43 | template<typename T>
44 | __device__ __forceinline__ T __ldg(const T* ptr) {
45 |     return *ptr;
46 | }
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/include/targets/cuda/externals/trove/warp.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2013, NVIDIA Corporation
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 |     * Redistributions of source code must retain the above copyright
 8 |       notice, this list of conditions and the following disclaimer.
 9 |     * Redistributions in binary form must reproduce the above copyright
10 |       notice, this list of conditions and the following disclaimer in the
11 |       documentation and/or other materials provided with the distribution.
12 |     * Neither the name of the <organization> nor the
13 |       names of its contributors may be used to endorse or promote products
14 |       derived from this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 | 
28 | #pragma once
29 | 
30 | namespace trove {
31 | 
32 | #define WARP_CONVERGED 0xffffffff
33 | 
34 | __device__
35 | inline bool warp_converged() {
36 | #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
37 |     return (__activemask() == WARP_CONVERGED);
38 | #else
39 |     return (__ballot(true) == WARP_CONVERGED);
40 | #endif
41 | }
42 | 
43 | #undef WARP_CONVERGED
44 | 
45 | #define WARP_SIZE 32
46 | #define WARP_MASK 0x1f
47 | #define LOG_WARP_SIZE 5
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/include/targets/cuda/fast_intdiv.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // declaration of class we wish to specialize
 4 | template <bool> struct mul_hi;
 5 | 
 6 | template <> struct mul_hi<true> {
 7 |   __device__ __forceinline__ int operator()(const int n, const int m)
 8 |   {
 9 |     int q;
10 |     asm("mul.hi.s32 %0, %1, %2;" : "=r"(q) : "r"(m), "r"(n));
11 |     return q;
12 |   }
13 | };
14 | 
15 | #include "../generic/fast_intdiv.h"
16 | 


--------------------------------------------------------------------------------
/include/targets/cuda/jitify_options.hpp.in:
--------------------------------------------------------------------------------
1 | #define JITIFY_OPTIONS -I${CMAKE_BINARY_DIR}/lib        \
2 |   -I${CMAKE_BINARY_DIR}/include                         \
3 |   -I${CMAKE_BINARY_DIR}/include/externals               \
4 |   -I${CMAKE_BINARY_DIR}/include/targets/cuda            \
5 |   -I${CMAKE_BINARY_DIR}/include/targets/cuda/externals  \
6 |   -I${CUDAToolkit_INCLUDE_DIRS}
7 | 


--------------------------------------------------------------------------------
/include/targets/cuda/math_helper.h:
--------------------------------------------------------------------------------
 1 | #if defined(__CUDACC__)
 2 | 
 3 | #include <math_helper.cuh>
 4 | 
 5 | #else
 6 | 
 7 | #include "../generic/math_helper.h"
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/include/targets/cuda/pipeline.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | /** @file The wrapper here abstract the cuda::pipeline, but _only_ when
 4 |  *     we believe it gives the better performance.
 5 |  */
 6 | 
 7 | #if (__COMPUTE_CAPABILITY__ >= 800) && (CUDA_VERSION >= 11080)
 8 | #define QUDA_USE_CUDA_PIPELINE
 9 | #include <cuda/pipeline>
10 | #endif
11 | 
12 | namespace quda
13 | {
14 | 
15 | #ifdef QUDA_USE_CUDA_PIPELINE
16 |   struct pipeline_t {
17 |     cuda::pipeline<cuda::thread_scope_thread> pipe;
18 | 
19 |     __device__ inline void producer_acquire() { pipe.producer_acquire(); }
20 | 
21 |     __device__ inline void producer_commit() { pipe.producer_commit(); }
22 | 
23 |     __device__ inline void consumer_wait() { pipe.consumer_wait(); }
24 | 
25 |     __device__ inline void consumer_release() { pipe.consumer_release(); }
26 |   };
27 | 
28 |   __device__ inline pipeline_t make_pipeline()
29 |   {
30 |     pipeline_t p = {cuda::make_pipeline()};
31 |     return p;
32 |   }
33 | #else
34 |   struct pipeline_t {
35 |     __device__ inline void producer_acquire() { }
36 | 
37 |     __device__ inline void producer_commit() { }
38 | 
39 |     __device__ inline void consumer_wait() { }
40 | 
41 |     __device__ inline void consumer_release() { }
42 |   };
43 | 
44 |   __device__ inline pipeline_t make_pipeline()
45 |   {
46 |     pipeline_t p;
47 |     return p;
48 |   }
49 | #endif
50 | 
51 |   template <class T> __device__ inline void memcpy_async(T *destination, T *source, size_t size, pipeline_t &pipe)
52 |   {
53 | #ifdef QUDA_USE_CUDA_PIPELINE
54 |     cuda::memcpy_async(destination, source, size, pipe.pipe);
55 | #else
56 |     *destination = *source;
57 | #endif
58 |   }
59 | 
60 | } // namespace quda
61 | 


--------------------------------------------------------------------------------
/include/targets/cuda/quda_cuda_api.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | /**
 7 |    @file quda_cuda_api.h
 8 |    @brief Header file that declares some functions that will be called from within the CUDA target
 9 | */
10 | 
11 | namespace quda
12 | {
13 | 
14 |   namespace target
15 |   {
16 | 
17 |     namespace cuda
18 |     {
19 | 
20 |       /**
21 |          @brief Return CUDA stream from QUDA stream.  This is only for
22 |          use inside target/cuda.
23 |          @param stream QUDA stream we which to convert to CUDA stream
24 |          @return CUDA stream
25 |       */
26 |       cudaStream_t get_stream(const qudaStream_t &stream);
27 | 
28 |       void set_runtime_error(cudaError_t error, const char *api_func, const char *func, const char *file,
29 |                              const char *line, bool allow_error = false);
30 | 
31 |       // defined in quda_api.cpp
32 |       void set_driver_error(CUresult error, const char *api_func, const char *func, const char *file, const char *line,
33 |                             bool allow_error = false);
34 | 
35 |     } // namespace cuda
36 |   }   // namespace target
37 | 
38 | } // namespace quda
39 | 


--------------------------------------------------------------------------------
/include/targets/cuda/quda_fp16.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_fp16.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   __device__ inline half2 habs2(half2 input) {
 9 | #if !(defined(__clang__) && defined(__CUDA__))
10 |     return __habs2(input);
11 | #else
12 |     static constexpr uint32_t maximum_mask = 0x7fff7fffu; // 0111 1111 1111 1111 0111 1111 1111 1111
13 | 
14 |     uint32_t input_masked = *reinterpret_cast<const uint32_t *>(&input) & maximum_mask;
15 |     return *reinterpret_cast<half2 *>(&input_masked);
16 | #endif
17 |   }
18 | 
19 | } // namespace quda
20 | 


--------------------------------------------------------------------------------
/include/targets/cuda/shared_memory_cache_helper.h:
--------------------------------------------------------------------------------
1 | #include "../generic/shared_memory_cache_helper.h"
2 | 


--------------------------------------------------------------------------------
/include/targets/cuda/thread_local_cache.h:
--------------------------------------------------------------------------------
1 | #include "../generic/thread_local_cache.h"
2 | 


--------------------------------------------------------------------------------
/include/targets/cuda/warp_collective.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <target_device.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   template <bool is_device> struct warp_combine_impl {
 9 |     template <typename T> T operator()(T &x, int) { return x; }
10 |   };
11 | 
12 |   template <> struct warp_combine_impl<true> {
13 |     template <typename T> __device__ inline T operator()(T &x, int warp_split)
14 |     {
15 |       constexpr int warp_size = device::warp_size();
16 |       if (warp_split > 1) {
17 | #pragma unroll
18 |         for (int i = 0; i < x.size(); i++) {
19 |           // reduce down to the first group of column-split threads
20 | #pragma unroll
21 |           for (int offset = warp_size / 2; offset >= warp_size / warp_split; offset /= 2) {
22 |             // TODO - add support for non-converged warps
23 |             x[i].real(x[i].real() + __shfl_down_sync(device::warp_converged_mask(), x[i].real(), offset));
24 |             x[i].imag(x[i].imag() + __shfl_down_sync(device::warp_converged_mask(), x[i].imag(), offset));
25 |           }
26 |         }
27 |       }
28 |       return x;
29 |     }
30 |   };
31 | 
32 |   template <int warp_split, typename T> __device__ __host__ inline T warp_combine(T &x)
33 |   {
34 |     return target::dispatch<warp_combine_impl>(x, warp_split);
35 |   }
36 | 
37 | } // namespace quda
38 | 


--------------------------------------------------------------------------------
/include/targets/generic/FFT_Plans.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <quda_internal.h>
 4 | 
 5 | // Dummy implementation that does nothing
 6 | 
 7 | #define FFT_FORWARD 0
 8 | #define FFT_INVERSE 1
 9 | 
10 | namespace quda
11 | {
12 | 
13 |   typedef struct {
14 |     bool isDouble;
15 |   } FFTPlanHandle;
16 | 
17 |   inline static constexpr bool HaveFFT() { return false; }
18 | 
19 |   inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int) { errorQuda("FFTs are disabled"); }
20 | 
21 |   inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, int) { errorQuda("FFTs are disabled"); }
22 | 
23 |   inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision) { errorQuda("FFTs are disabled"); }
24 | 
25 |   inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision) { errorQuda("FFTs are disabled"); }
26 | 
27 |   inline void FFTDestroyPlan(FFTPlanHandle &) { errorQuda("FFTs are disabled"); }
28 | 
29 | } // namespace quda
30 | 


--------------------------------------------------------------------------------
/include/targets/generic/aos.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   /**
 7 |      @brief Load n-length block of memory of type T and return in local array
 8 |      @tparam T Array element type
 9 |      @tparam n Number of elements in the structure
10 |      @param[out] out Output array
11 |      @param[in] in Input memory pointer we are block loading from
12 |    */
13 |   template <typename T, int n> __host__ __device__ void block_load(T out[n], const T *in)
14 |   {
15 | #pragma unroll
16 |     for (int i = 0; i < n; i++) out[i] = in[i];
17 |   }
18 | 
19 |   /**
20 |      @brief Store n-length array of type T in block of memory
21 |      @tparam T Array element type
22 |      @tparam n Number of elements in the array
23 |      @param[out] out Output memory pointer we are block storing to
24 |      @param[in] in Input array
25 |    */
26 |   template <typename T, int n> __host__ __device__ void block_store(T *out, const T in[n])
27 |   {
28 | #pragma unroll
29 |     for (int i = 0; i < n; i++) out[i] = in[i];
30 |   }
31 | 
32 |   /**
33 |      @brief Load type T from contiguous memory
34 |      @tparam T Element type
35 |      @param[out] out Output value
36 |      @param[in] in Input memory pointer we are loading from
37 |   */
38 |   template <typename T> __host__ __device__ void block_load(T &out, const T *in) { out = *in; }
39 | 
40 |   /**
41 |      @brief Store type T in contiguous memory
42 |      @tparam Element type
43 |      @param[out] out Output memory pointer we are storing to
44 |      @param[in] in Input value
45 |   */
46 |   template <typename T> __host__ __device__ void block_store(T *out, const T &in) { *out = in; }
47 | 
48 | } // namespace quda
49 | 


--------------------------------------------------------------------------------
/include/targets/generic/block_reduction_kernel_host.h:
--------------------------------------------------------------------------------
 1 | namespace quda
 2 | {
 3 | 
 4 |   template <template <typename> class Functor, typename Arg> void BlockKernel2D_host(const Arg &arg)
 5 |   {
 6 |     Functor<Arg> t(arg);
 7 | #pragma omp parallel for
 8 |     for (unsigned int y = 0; y < arg.grid_dim.y; y++) {
 9 |       for (unsigned int x = 0; x < arg.grid_dim.x; x++) {
10 |         t(dim3(x, y, 0), dim3(0, 0, 0));
11 |       }
12 |     }
13 |   }
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/include/targets/generic/kernel_host.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <template <typename> class Functor, typename Arg> void Kernel1D_host(const Arg &arg)
 7 |   {
 8 |     Functor<Arg> f(const_cast<Arg &>(arg));
 9 | #pragma omp parallel for
10 |     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { f(i); }
11 |   }
12 | 
13 |   template <template <typename> class Functor, typename Arg> void Kernel2D_host(const Arg &arg)
14 |   {
15 |     Functor<Arg> f(const_cast<Arg &>(arg));
16 | #pragma omp parallel for
17 |     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
18 |       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
19 |     }
20 |   }
21 | 
22 |   template <template <typename> class Functor, typename Arg> void Kernel3D_host(const Arg &arg)
23 |   {
24 |     Functor<Arg> f(const_cast<Arg &>(arg));
25 | #pragma omp parallel for
26 |     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
27 |       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
28 |         for (int k = 0; k < static_cast<int>(arg.threads.z); k++) { f(i, j, k); }
29 |       }
30 |     }
31 |   }
32 | 
33 | } // namespace quda
34 | 


--------------------------------------------------------------------------------
/include/targets/generic/kernel_ops_target.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <kernel_ops.h>
 3 | 
 4 | /**
 5 |    @file kernel_ops_target.h
 6 | 
 7 |    @section This file contains the target-specific parts of the
 8 |    KernelOps support.  This is the generic implementation which is
 9 |    used by default for targets that don't provide a target-specific
10 |    version of this file.  This is a dummy implementation for targets
11 |    that don't need to pass any data (e.g. a shared memory pointer) to
12 |    tagged kernels.
13 |  */
14 | 
15 | namespace quda
16 | {
17 | 
18 |   // KernelOps
19 |   template <typename... T> struct KernelOps : KernelOpsBase<T...> {
20 |   };
21 | 
22 | } // namespace quda
23 | 


--------------------------------------------------------------------------------
/include/targets/generic/load_store.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <target_device.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   /**
 9 |      @brief Non-specialized load operation
10 |   */
11 |   template <bool is_device> struct vector_load_impl {
12 |     template <typename T> __device__ __host__ inline void operator()(T &value, const void *ptr, int idx)
13 |     {
14 |       value = reinterpret_cast<const T *>(ptr)[idx];
15 |     }
16 |   };
17 | 
18 |   template <typename VectorType> __device__ __host__ inline VectorType vector_load(const void *ptr, int idx)
19 |   {
20 |     VectorType value;
21 |     target::dispatch<vector_load_impl>(value, ptr, idx);
22 |     return value;
23 |   }
24 | 
25 |   /**
26 |      @brief Non-specialized store operation
27 |   */
28 |   template <bool is_device> struct vector_store_impl {
29 |     template <typename T> __device__ __host__ inline void operator()(void *ptr, int idx, const T &value)
30 |     {
31 |       reinterpret_cast<T *>(ptr)[idx] = value;
32 |     }
33 |   };
34 | 
35 |   template <typename VectorType>
36 |   __device__ __host__ inline void vector_store(void *ptr, int idx, const VectorType &value)
37 |   {
38 |     target::dispatch<vector_store_impl>(ptr, idx, value);
39 |   }
40 | 
41 | } // namespace quda
42 | 


--------------------------------------------------------------------------------
/include/targets/generic/math_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cmath>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   inline float abs(const float a) { return fabs(a); }
 9 |   inline double abs(const double a) { return fabs(a); }
10 |   template <typename T> inline T sqrt(const T a) { return ::sqrt(a); }
11 |   template <typename T> inline T exp(const T a) { return ::exp(a); }
12 |   template <typename T> inline T log(const T a) { return ::log(a); }
13 |   template <typename T> inline T sin(const T a) { return ::sin(a); }
14 |   template <typename T> inline T cos(const T a) { return ::cos(a); }
15 |   template <typename T> inline T sinh(const T a) { return ::sinh(a); }
16 |   template <typename T> inline T cosh(const T a) { return ::cosh(a); }
17 |   template <typename T> inline T max(const T a, const T b) { return a > b ? a : b; }
18 |   template <typename T> inline T min(const T a, const T b) { return a < b ? a : b; }
19 |   template <typename T> inline void sincos(const T a, T *s, T *c) { ::sincos(a, s, c); }
20 |   template <typename T> inline void sincospi(const T a, T *s, T *c) { ::sincos(a * static_cast<T>(M_PI), s, c); }
21 |   template <typename T> inline T sinpi(const T a) { return ::sin(a * static_cast<float>(M_PI)); }
22 |   template <typename T> inline T cospi(const T a) { return ::cos(a * static_cast<float>(M_PI)); }
23 |   template <typename T> inline T rsqrt(const T a) { return static_cast<T>(1.0) / ::sqrt(a); }
24 |   template <typename T> inline T pow(const T a, const T b) { return ::pow(a, b); }
25 |   template <typename T> inline T pow(const T a, const int b) { return ::pow(a, b); }
26 |   template <typename T> inline T fpow(const T a, const int b) { return ::pow(a, b); }
27 |   template <typename T> inline T fmod(const T a, const T b) { return ::fmod(a, b); }
28 |   inline float fdividef(const float a, const float b) { return a / b; }
29 | 
30 | } // namespace quda
31 | 


--------------------------------------------------------------------------------
/include/targets/generic/reduction_kernel_host.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   template <template <typename> class Functor, typename Arg> auto Reduction2D_host(const Arg &arg)
 9 |   {
10 |     using reduce_t = typename Functor<Arg>::reduce_t;
11 |     Functor<Arg> t(arg);
12 | 
13 |     reduce_t value = t.init();
14 | #pragma omp parallel for collapse(2) reduction(Functor <Arg>::apply : value)
15 |     for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
16 |       for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value = t(value, i, j); }
17 |     }
18 | 
19 |     return value;
20 |   }
21 | 
22 |   template <template <typename> class Functor, typename Arg> auto MultiReduction_host(const Arg &arg)
23 |   {
24 | #pragma omp declare reduction(multi_reduce                                                                             \
25 |                               : typename Functor <Arg>::reduce_t                                                       \
26 |                               : omp_out = Functor <Arg>::apply(omp_out, omp_in))                                       \
27 |   initializer(omp_priv = Functor <Arg>::init())
28 | 
29 |     using reduce_t = typename Functor<Arg>::reduce_t;
30 |     Functor<Arg> t(arg);
31 | 
32 |     std::vector<reduce_t> value(arg.threads.z, t.init());
33 |     for (int k = 0; k < static_cast<int>(arg.threads.z); k++) {
34 |       auto val = t.init();
35 | 
36 | #pragma omp parallel for collapse(2) reduction(multi_reduce : val)
37 |       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
38 |         for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { val = t(val, i, j, k); }
39 |       }
40 | 
41 |       value[k] = val;
42 |     }
43 | 
44 |     return value;
45 |   }
46 | 
47 | } // namespace quda
48 | 


--------------------------------------------------------------------------------
/include/targets/generic/thread_array.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <kernel_ops.h>
 4 | #include <shared_memory_helper.h>
 5 | #include <array.h>
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   /**
11 |      @brief Class that provides indexable per-thread storage for n
12 |      elements of type T.  This version uses shared memory for storage.
13 |      The offset into the shared memory region is determined from the
14 |      type O.
15 |    */
16 |   template <typename T, int n, typename O = void> class thread_array : SharedMemory<array<T, n>, SizePerThread<1>, O>
17 |   {
18 |     using Smem = SharedMemory<array<T, n>, SizePerThread<1>, O>;
19 |     using Smem::sharedMem;
20 |     array<T, n> &array_;
21 | 
22 |   public:
23 |     using Smem::shared_mem_size;
24 | 
25 |     template <typename... U>
26 |     __device__ __host__ constexpr thread_array(const KernelOps<U...> &ops) :
27 |       Smem(ops), array_(sharedMem()[target::thread_idx_linear<3>()])
28 |     {
29 |       checkKernelOps<thread_array<T, n, O>>(ops);
30 |       array_ = array<T, n> {}; // call default constructor
31 |     }
32 | 
33 |     constexpr thread_array(const thread_array<T, n, O> &) = delete;
34 | 
35 |     __device__ __host__ T &operator[](int i) { return array_[i]; }
36 |     __device__ __host__ const T &operator[](int i) const { return array_[i]; }
37 |   };
38 | 
39 | } // namespace quda
40 | 


--------------------------------------------------------------------------------
/include/targets/hip/constant_kernel_arg.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <target_device.h>
 4 | 
 5 | /**
 6 |    @file constant_kernel_arg.h
 7 | 
 8 |    This file should be included in the kernel files for which we wish
 9 |    to utilize __constant__ memory for the kernel parameter struct.
10 |    This needs to be included before the definition of the kernel,
11 |    e.g., kernel.h in order for the compiler to do the kernel
12 |    instantiation correctly.
13 |  */
14 | 
15 | // set a preprocessor flag that we have included constant_kernel_arg.h
16 | #define QUDA_USE_CONSTANT_MEMORY
17 | 
18 | namespace quda
19 | {
20 | 
21 |   namespace device
22 |   {
23 | 
24 |     /**
25 |        @brief The __constant__ buffer userd for kernel parameters
26 |     */
27 |     __constant__ char buffer[max_constant_size()];
28 | 
29 |     /**
30 |        @brief Helper function that returns kernel argument from
31 |        __constant__ memory.
32 |      */
33 |     template <typename Arg> constexpr std::enable_if_t<!use_kernel_arg<Arg>(), Arg &> get_arg()
34 |     {
35 |       return reinterpret_cast<Arg &>(buffer);
36 |     }
37 | 
38 |     /**
39 |        @brief Helper function that returns a pointer to the
40 |        __constant__ memory buffer.
41 |      */
42 |     template <typename Arg> constexpr std::enable_if_t<!use_kernel_arg<Arg>(), void *> get_constant_buffer()
43 |     {
44 |       return qudaGetSymbolAddress(buffer);
45 |     }
46 | 
47 |   } // namespace device
48 | 
49 | } // namespace quda
50 | 


--------------------------------------------------------------------------------
/include/targets/hip/math_helper.h:
--------------------------------------------------------------------------------
 1 | #if defined(__HIP__)
 2 | 
 3 | #include <math_helper.cuh>
 4 | 
 5 | #else
 6 | 
 7 | #include "../generic/math_helper.h"
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/include/targets/hip/quda_hip_api.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <hip/hip_runtime.h>
 3 | 
 4 | namespace quda
 5 | {
 6 | 
 7 |   /**
 8 |      @file quda_hip_api.h
 9 |      @brief Header file that declares some functions that will be called from within the CUDA target
10 |   */
11 |   namespace target
12 |   {
13 |     namespace hip
14 |     {
15 | 
16 |       /**
17 |            @brief Return HIP stream from QUDA stream.  This is only for
18 |            use inside target/cuda.
19 |            @param stream QUDA stream we which to convert to CUDA stream
20 |            @return CUDA stream
21 |         */
22 |       hipStream_t get_stream(const qudaStream_t &stream);
23 | 
24 |       void set_runtime_error(hipError_t error, const char *api_func, const char *func, const char *file,
25 |                              const char *line, bool allow_error = false);
26 | 
27 |       // defined in quda_api.cpp
28 |       void set_driver_error(hipError_t error, const char *api_func, const char *func, const char *file,
29 |                             const char *line, bool allow_error = false);
30 |     } // namespace hip
31 |   }   // namespace target
32 | } // namespace quda
33 | 


--------------------------------------------------------------------------------
/include/targets/hip/reduce_helper.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <quda_internal.h>
4 | // if not using heterogeneous atomics use the generic variant
5 | #include "../generic/reduce_helper.h"
6 | 


--------------------------------------------------------------------------------
/include/targets/hip/shared_memory_cache_helper.h:
--------------------------------------------------------------------------------
1 | #include "../generic/shared_memory_cache_helper.h"
2 | 


--------------------------------------------------------------------------------
/include/targets/hip/warp_collective.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <target_device.h>
 4 | 
 5 | namespace quda
 6 | {
 7 | 
 8 |   template <bool is_device> struct warp_combine_impl {
 9 |     template <typename T> T operator()(T &x, int) { return x; }
10 |   };
11 | 
12 |   template <> struct warp_combine_impl<true> {
13 | 
14 |     template <typename T> __device__ inline T operator()(T &x, int warp_split)
15 |     {
16 |       constexpr int warp_size = device::warp_size();
17 |       if (warp_split > 1) {
18 | #pragma unroll
19 |         for (int i = 0; i < x.size(); i++) {
20 |           // reduce down to the first group of column-split threads
21 | #pragma unroll
22 |           for (int offset = warp_size / 2; offset >= warp_size / warp_split; offset /= 2) {
23 |             // TODO - add support for non-converged warps
24 |             x[i].real(x[i].real() + __shfl_down(x[i].real(), offset));
25 |             x[i].imag(x[i].imag() + __shfl_down(x[i].imag(), offset));
26 |           }
27 |         }
28 |       }
29 | 
30 |       return x;
31 |     }
32 |   };
33 | 
34 |   template <int warp_split, typename T> __device__ __host__ inline T warp_combine(T &x)
35 |   {
36 |     return target::dispatch<warp_combine_impl>(x, warp_split);
37 |   }
38 | } // namespace quda
39 | 


--------------------------------------------------------------------------------
/include/tune_key.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstring>
 4 | #include <ostream>
 5 | 
 6 | namespace quda {
 7 | 
 8 |   struct TuneKey {
 9 | 
10 |     static const int volume_n = 32;
11 |     static const int name_n = 512;
12 |     static const int aux_n = 256;
13 |     char volume[volume_n];
14 |     char name[name_n];
15 |     char aux[aux_n];
16 | 
17 |     TuneKey() { }
18 |     TuneKey(const char v[], const char n[], const char a[]="type=default") {
19 |       strcpy(volume, v);
20 |       strcpy(name, n);
21 |       strcpy(aux, a);
22 |     }
23 | 
24 |     TuneKey(const TuneKey &) = default;
25 |     TuneKey(TuneKey &&) = default;
26 |     TuneKey &operator=(const TuneKey &) = default;
27 |     TuneKey &operator=(TuneKey &&) = default;
28 | 
29 |     bool operator<(const TuneKey &other) const {
30 |       int vc = std::strcmp(volume, other.volume);
31 |       if (vc < 0) {
32 | 	return true;
33 |       } else if (vc == 0) {
34 | 	int nc = std::strcmp(name, other.name);
35 | 	if (nc < 0) {
36 | 	  return true;
37 | 	} else if (nc == 0) {
38 | 	  return (std::strcmp(aux, other.aux) < 0 ? true : false);
39 | 	}
40 |       }
41 |       return false;
42 |     }
43 | 
44 |     friend std::ostream &operator<<(std::ostream &output, const TuneKey &key)
45 |     {
46 |       output << "volume = " << key.volume << ", ";
47 |       output << "name = " << key.name << ", ";
48 |       output << "aux = " << key.aux;
49 |       return output;
50 |     }
51 |   };
52 | 
53 |   /** Return the key of the last kernel that has been tuned / called.*/
54 |   TuneKey getLastTuneKey();
55 | 
56 | } // namespace quda
57 | 


--------------------------------------------------------------------------------
/include/unitarization_links.h:
--------------------------------------------------------------------------------
 1 | #ifndef _UNITARIZATION_LINKS_QUDA_H
 2 | #define _UNITARIZATION_LINKS_QUDA_H
 3 | 
 4 | #include <gauge_field.h>
 5 | 
 6 | 
 7 | // ***************************************************
 8 | //  Declarations for unitarization functions used 
 9 | //  in the construction of the hisq-fattened links
10 | // 
11 | //  There are many algorithms for unitarizing 
12 | //  fat7-smeared link variables. 
13 | //  In practice, we use the method employed by 
14 | //  MILC and QOPQDP, namely a combination of 
15 | //  "analytic", or Cayley-Hamilton, unitarization, 
16 | //  and SVD.  
17 | //  Analytic unitarization is first attempted. 
18 | //  The eigenvalues of the matrix Q = V^{dagger}V 
19 | //  (V being the fat7 link) are computed, 
20 | //  if the determinant of Q is less than a user-defined 
21 | //  value (svd_abs_error), or the relative error on the 
22 | //  determinant, estimated by comparing the product 
23 | //  of the eigenvalues of Q to the determinant obtained 
24 | //  from the standard formula, is greater than a 
25 | //  user-specified tolerance (svd_rel_error), then 
26 | //  SVD is used to perform the unitarization.
27 | // ***************************************************  
28 | 
29 | 
30 | namespace quda {
31 | 
32 |   void setUnitarizeLinksConstants(double unitarize_eps, double max_error, 
33 | 				  bool allow_svd, bool svd_only,
34 | 				  double svd_rel_error, double svd_abs_error);
35 | 
36 |   void unitarizeLinksCPU(GaugeField &outfield, const GaugeField &infield);
37 | 
38 |   void unitarizeLinks(GaugeField &outfield, const GaugeField &infield, int *fails);
39 |   void unitarizeLinks(GaugeField &outfield, int *fails);
40 | 
41 |   bool isUnitary(const GaugeField &field, double max_error);
42 | 
43 | } // namespace quda
44 | 
45 | 
46 | #endif // _UNITARIZATION_LINKS_H
47 | 


--------------------------------------------------------------------------------
/include/vector_io.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <color_spinor_field.h>
 5 | #include <reference_wrapper_helper.h>
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   /**
11 |      @brief VectorIO is a simple wrapper class for loading and saving
12 |      sets of vector fields using QIO.
13 |    */
14 |   class VectorIO
15 |   {
16 |     const std::string filename;
17 |     bool parity_inflate;
18 |     bool partfile;
19 | 
20 |   public:
21 |     /**
22 |        Constructor for VectorIO class
23 |        @param[in] filename The filename associated with this IO object
24 |        @param[in] parity_inflate Whether to inflate single_parity
25 |        field to dual parity fields for I/O
26 |        @param[in] partfile Whether or not to save in partfiles (ignored on load)
27 |     */
28 |     VectorIO(const std::string &filename, bool parity_inflate = false, bool partfile = false);
29 | 
30 |     /**
31 |        @brief Load vectors from filename
32 |        @param[in] vecs The set of vectors to load
33 |     */
34 |     void load(cvector_ref<ColorSpinorField> &vecs);
35 | 
36 |     /**
37 |        @brief Save vectors to filename
38 |        @param[in] vecs The set of vectors to save
39 |        @param[in] prec Optional change of precision when saving
40 |        @param[in] size Optional cap to number of vectors saved
41 |     */
42 |     void save(cvector_ref<const ColorSpinorField> &vecs, QudaPrecision prec = QUDA_INVALID_PRECISION, uint32_t size = 0);
43 |   };
44 | 
45 | } // namespace quda
46 | 


--------------------------------------------------------------------------------
/include/worker.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace quda {
 4 | 
 5 |   class Worker {
 6 | 
 7 |   public:
 8 |     Worker() { }
 9 |     virtual ~Worker() { }
10 |     virtual void apply(const qudaStream_t stream = device::get_default_stream()) = 0;
11 |   };
12 | 
13 | } // namespace quda
14 | 


--------------------------------------------------------------------------------
/jenkins/bqcd.config.cmake:
--------------------------------------------------------------------------------
 1 | # preload file for jenkins test
 2 | 
 3 | # MILC - turns on staggered dirac and all HISQ and gauge features for MILC RHMC
 4 | 
 5 | set(QUDA_DIRAC_WILSON ON CACHE BOOL "build Wilson Dirac operators")
 6 | set(QUDA_DIRAC_CLOVER ON CACHE BOOL "build clover Dirac operators")
 7 | set(QUDA_DIRAC_DOMAIN_WALL OFF CACHE BOOL "build domain wall Dirac operators")
 8 | set(QUDA_DIRAC_STAGGERED OFF CACHE BOOL "build staggered Dirac operators")
 9 | set(QUDA_DIRAC_TWISTED_MASS OFF CACHE BOOL "build twisted mass Dirac operators")
10 | set(QUDA_DIRAC_TWISTED_CLOVER OFF CACHE BOOL "build twisted clover Dirac operators")
11 | set(QUDA_DYNAMIC_CLOVER OFF CACHE BOOL "Dynamically invert the clover term for twisted-clover")
12 | set(QUDA_QIO OFF CACHE BOOL "build QIO code for binary I/O")
13 | 
14 | # advanced 
15 | set(QUDA_MULTIGRID OFF CACHE BOOL "build multigrid solvers")
16 | 
17 | # build with MILC interface
18 | set(QUDA_INTERFACE_QDP ON CACHE BOOL "build qdp interface")
19 | set(QUDA_INTERFACE_MILC OFF CACHE BOOL "build milc interface")
20 | set(QUDA_INTERFACE_CPS OFF CACHE BOOL "build cps interface")
21 | set(QUDA_INTERFACE_QDPJIT OFF CACHE BOOL "build qdpjit interface")
22 | set(QUDA_INTERFACE_BQCD ON CACHE BOOL "build bqcd interface")
23 | set(QUDA_INTERFACE_TIFR OFF CACHE BOOL "build tifr interface")
24 | 


--------------------------------------------------------------------------------
/jenkins/milc.config.cmake:
--------------------------------------------------------------------------------
 1 | # preload file for jenkins test
 2 | 
 3 | # MILC - turns on staggered dirac and all HISQ and gauge features for MILC RHMC
 4 | 
 5 | set(QUDA_DIRAC_WILSON OFF CACHE BOOL "build Wilson Dirac operators")
 6 | set(QUDA_DIRAC_CLOVER OFF CACHE BOOL "build clover Dirac operators")
 7 | set(QUDA_DIRAC_DOMAIN_WALL OFF CACHE BOOL "build domain wall Dirac operators")
 8 | set(QUDA_DIRAC_STAGGERED ON CACHE BOOL "build staggered Dirac operators")
 9 | set(QUDA_DIRAC_TWISTED_MASS OFF CACHE BOOL "build twisted mass Dirac operators")
10 | set(QUDA_DIRAC_TWISTED_CLOVER OFF CACHE BOOL "build twisted clover Dirac operators")
11 | set(QUDA_DYNAMIC_CLOVER OFF CACHE BOOL "Dynamically invert the clover term for twisted-clover")
12 | set(QUDA_QIO OFF CACHE BOOL "build QIO code for binary I/O")
13 | 
14 | set(QUDA_MULTIGRID OFF CACHE BOOL "build multigrid solvers")
15 | 


--------------------------------------------------------------------------------
/jenkins/twistedmass.config.cmake:
--------------------------------------------------------------------------------
 1 | # preload file for jenkins test
 2 | 
 3 | # MILC - turns on staggered dirac and all HISQ and gauge features for MILC RHMC
 4 | 
 5 | set(QUDA_DIRAC_WILSON ON CACHE BOOL "build Wilson Dirac operators")
 6 | set(QUDA_DIRAC_CLOVER ON CACHE BOOL "build clover Dirac operators")
 7 | set(QUDA_DIRAC_DOMAIN_WALL OFF CACHE BOOL "build domain wall Dirac operators")
 8 | set(QUDA_DIRAC_STAGGERED OFF CACHE BOOL "build staggered Dirac operators")
 9 | set(QUDA_DIRAC_TWISTED_MASS OFF CACHE BOOL "build twisted mass Dirac operators")
10 | set(QUDA_DIRAC_TWISTED_CLOVER OFF CACHE BOOL "build twisted clover Dirac operators")
11 | set(QUDA_DYNAMIC_CLOVER OFF CACHE BOOL "Dynamically invert the clover term for twisted-clover")
12 | set(QUDA_QIO OFF CACHE BOOL "build QIO code for binary I/O")
13 | 
14 | # advanced 
15 | set(QUDA_MULTIGRID OFF CACHE BOOL "build multigrid solvers")
16 | 
17 | # build with MILC interface
18 | set(QUDA_INTERFACE_QDP ON CACHE BOOL "build qdp interface")
19 | set(QUDA_INTERFACE_MILC OFF CACHE BOOL "build milc interface")
20 | set(QUDA_INTERFACE_CPS OFF CACHE BOOL "build cps interface")
21 | set(QUDA_INTERFACE_QDPJIT OFF CACHE BOOL "build qdpjit interface")
22 | set(QUDA_INTERFACE_BQCD ON CACHE BOOL "build bqcd interface")
23 | set(QUDA_INTERFACE_TIFR OFF CACHE BOOL "build tifr interface")
24 | 


--------------------------------------------------------------------------------
/lib/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2015,6,9,13,36,18
3 | ViewMode=1
4 | 


--------------------------------------------------------------------------------
/lib/clover_quda.cu:
--------------------------------------------------------------------------------
 1 | #include <tune_quda.h>
 2 | #include <clover_field.h>
 3 | #include <gauge_field.h>
 4 | #include <instantiate.h>
 5 | #include <tunable_nd.h>
 6 | #include <kernels/clover_compute.cuh>
 7 | 
 8 | namespace quda {
 9 | 
10 |   template <typename store_t>
11 |   class ComputeClover : TunableKernel2D {
12 |     CloverArg<store_t> arg;
13 |     const GaugeField &meta;
14 |     bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
15 |     unsigned int minThreads() const { return arg.threads.x; }
16 | 
17 |   public:
18 |     ComputeClover(CloverField &clover, const GaugeField& f, double coeff) :
19 |       TunableKernel2D(clover, 2),
20 |       arg(clover, f, coeff),
21 |       meta(f)
22 |     {
23 |       checkNative(clover, f);
24 |       apply(device::get_default_stream());
25 |     }
26 | 
27 |     void apply(const qudaStream_t &stream)
28 |     {
29 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
30 |       launch<CloverCompute>(tp, stream, arg);
31 |     }
32 | 
33 |     long long flops() const { return 2*arg.threads.x*480ll; }
34 |     long long bytes() const { return 2*arg.threads.x*(6*arg.f.Bytes() + arg.clover.Bytes()); }
35 |   };
36 | 
37 |   void computeClover(CloverField &clover, const GaugeField& f, double coeff)
38 |   {
39 |     if constexpr (is_enabled_clover()) {
40 |       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
41 |       if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
42 |       clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields
43 |       instantiate<ComputeClover>(clover, f, coeff);
44 |       getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
45 |     } else {
46 |       errorQuda("Clover has not been built");
47 |     }
48 |   }
49 | 
50 | } // namespace quda
51 | 
52 | 


--------------------------------------------------------------------------------
/lib/coarse_op_preconditioned.in.cpp:
--------------------------------------------------------------------------------
 1 | #include "multigrid.h"
 2 | #include <int_list.hpp>
 3 | 
 4 | namespace quda
 5 | {
 6 | 
 7 |   template <int Nc, int... N>
 8 |   void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X, bool use_mma,
 9 |                      IntList<Nc, N...>)
10 |   {
11 |     if (Y.Ncolor() / 2 == Nc) {
12 |       calculateYhat<Nc>(Yhat, Xinv, Y, X, use_mma);
13 |     } else {
14 |       if constexpr (sizeof...(N) > 0) {
15 |         calculateYhat(Yhat, Xinv, Y, X, use_mma, IntList<N...>());
16 |       } else {
17 |         errorQuda("Nc = %d has not been instantiated", Y.Ncolor() / 2);
18 |       }
19 |     }
20 |   }
21 |   void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X, bool use_mma)
22 |   {
23 |     if constexpr (is_enabled_multigrid()) {
24 |       // clang-format off
25 |       calculateYhat(Yhat, Xinv, Y, X, use_mma, IntList<@QUDA_MULTIGRID_NVEC_LIST@>());
26 |       // clang-format on
27 |     } else {
28 |       errorQuda("Multigrid has not been built");
29 |     }
30 |   }
31 | 
32 | } // namespace quda
33 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_dd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorDD(const copy_pack_t &pack)
 6 |   {
 7 |     CopyGenericColorSpinor<3, double, double>(pack);
 8 |   }  
 9 | 
10 | } // namespace quda
11 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_dh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorDH(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 2
 8 |     CopyGenericColorSpinor<3, double, short>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_dq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorDQ(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 1
 8 |     CopyGenericColorSpinor<3, double, int8_t>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_ds.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorDS(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 4
 8 |     CopyGenericColorSpinor<3, double, float>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_hd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorHD(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 2
 8 |     CopyGenericColorSpinor<3, short, double>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_hh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorHH(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 2
 8 |     CopyGenericColorSpinor<3, short, short>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_hq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorHQ(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 2) && (QUDA_PRECISION & 1)
 8 |     CopyGenericColorSpinor<3, short, int8_t>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_hs.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorHS(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 4) && (QUDA_PRECISION & 2)
 8 |     CopyGenericColorSpinor<3, short, float>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_dd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGDD(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<double, double>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_ds.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGDS(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<double, float>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_hh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGHH(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<short, short>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_hq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGHQ(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<short, int8_t>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_hs.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGHS(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<short, float>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_qh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGQH(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<int8_t, short>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_qq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGQQ(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<int8_t, int8_t>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_qs.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGQS(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<int8_t, float>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_sd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGSD(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<float, double>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_sh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGSH(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<float, short>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_sq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGSQ(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<float, int8_t>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_mg_ss.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor_mg.hpp>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorMGSS(const copy_pack_t &pack)
 6 |   {
 7 |     if constexpr (is_enabled_multigrid()) {
 8 |       instantiateColor<float, float>(std::get<0>(pack), pack);
 9 |     } else {
10 |       errorQuda("Multigrid has not been enabled (precision = %d %d)", std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 |     }
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_qd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorQD(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 1
 8 |     CopyGenericColorSpinor<3, int8_t, double>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_qh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorQH(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 2) && (QUDA_PRECISION & 1)
 8 |     CopyGenericColorSpinor<3, int8_t, short>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_qq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorQQ(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 1
 8 |     CopyGenericColorSpinor<3, int8_t, int8_t>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_qs.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorQS(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 4) && (QUDA_PRECISION & 1)
 8 |     CopyGenericColorSpinor<3, int8_t, float>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_sd.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorSD(const copy_pack_t &pack)
 6 |   {
 7 |     CopyGenericColorSpinor<3, float, double>(pack);
 8 |   }  
 9 | 
10 | } // namespace quda
11 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_sh.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorSH(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 4) && (QUDA_PRECISION & 2)
 8 |     CopyGenericColorSpinor<3, float, short>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_sq.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorSQ(const copy_pack_t &pack)
 6 |   {
 7 | #if (QUDA_PRECISION & 4) && (QUDA_PRECISION & 1)
 8 |     CopyGenericColorSpinor<3, float, int8_t>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_color_spinor_ss.cu:
--------------------------------------------------------------------------------
 1 | #include <copy_color_spinor.cuh>
 2 | 
 3 | namespace quda {
 4 |   
 5 |   void copyGenericColorSpinorSS(const copy_pack_t &pack)
 6 |   {
 7 | #if QUDA_PRECISION & 4
 8 |     CopyGenericColorSpinor<3, float, float>(pack);
 9 | #else
10 |     errorQuda("QUDA_PRECISION=%d does not enable precision combination %d %d", QUDA_PRECISION, std::get<0>(pack).Precision(), std::get<1>(pack).Precision());
11 | #endif
12 |   }  
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/copy_field_offset.hpp:
--------------------------------------------------------------------------------
 1 | #include <kernels/copy_field_offset.cuh>
 2 | #include <tunable_nd.h>
 3 | 
 4 | namespace quda
 5 | {
 6 | 
 7 |   template <class Arg> class CopyFieldOffset : public TunableKernel3D
 8 |   {
 9 |     using Field = typename Arg::Field;
10 | 
11 |   protected:
12 |     Arg &arg;
13 |     const Field &meta;
14 | 
15 |     long long bytes() const
16 |     {
17 |       return 2ll * (arg.mode == QudaOffsetCopyMode::COLLECT ? arg.in.Bytes() : arg.out.Bytes());
18 |     }
19 | 
20 |     unsigned int minThreads() const { return arg.volume_4d_cb; }
21 |     int blockStep() const { return 4; }
22 |     int blockMin() const { return 4; }
23 | 
24 |   public:
25 |     CopyFieldOffset(Arg &arg, const Field &meta) : TunableKernel3D(meta, arg.Ls, arg.nParity), arg(arg), meta(meta)
26 |     {
27 |       char tmp[TuneKey::aux_n];
28 |       sprintf(tmp, ",(%d,%d,%d,%d)->(%d,%d,%d,%d),Ls=%d,nParity=%d,%s,offset=%d%d%d%d", static_cast<int>(arg.dim_in[0]),
29 |               static_cast<int>(arg.dim_in[1]), static_cast<int>(arg.dim_in[2]), static_cast<int>(arg.dim_in[3]),
30 |               static_cast<int>(arg.dim_out[0]), static_cast<int>(arg.dim_out[1]), static_cast<int>(arg.dim_out[2]),
31 |               static_cast<int>(arg.dim_out[3]), arg.Ls, arg.nParity,
32 |               arg.mode == QudaOffsetCopyMode::COLLECT ? "COLLECT" : "DISPERSE", arg.offset[0], arg.offset[1],
33 |               arg.offset[2], arg.offset[3]);
34 |       strcat(aux, tmp);
35 |       apply(device::get_default_stream());
36 |     }
37 | 
38 |     void apply(const qudaStream_t &stream)
39 |     {
40 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
41 |       launch<copy_field_offset, true>(tp, stream, arg);
42 |     }
43 |   };
44 | 
45 | } // namespace quda
46 | 


--------------------------------------------------------------------------------
/lib/copy_gauge_double.cu:
--------------------------------------------------------------------------------
 1 | #include "copy_gauge_inc.cu"
 2 | namespace quda {
 3 |  
 4 |   // this is the function that is actually called, from here on down we instantiate all required templates
 5 |   void copyGenericGaugeDoubleIn(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out, void *In,
 6 |                                 void **ghostOut, void **ghostIn, int type)
 7 |   {
 8 |     copyGenericGauge<double>(out, in, location, Out, In, ghostOut, ghostIn, type);
 9 |   }
10 | 
11 | } // namespace quda
12 | 


--------------------------------------------------------------------------------
/lib/copy_gauge_half.cu:
--------------------------------------------------------------------------------
 1 | #include "copy_gauge_inc.cu"
 2 | namespace quda {
 3 | 
 4 |   // this is the function that is actually called, from here on down we instantiate all required templates
 5 | #if QUDA_PRECISION & 2
 6 |   void copyGenericGaugeHalfIn(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out, void *In,
 7 |                               void **ghostOut, void **ghostIn, int type)
 8 |   {
 9 |     copyGenericGauge<short>(out, in, location, Out, In, ghostOut, ghostIn, type);
10 |   }
11 | #else
12 |   void copyGenericGaugeHalfIn(GaugeField &, const GaugeField &, QudaFieldLocation, void *, void *, void **, void **, int)
13 |   {
14 |     errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
15 |   }
16 | #endif
17 | 
18 | } // namespace quda
19 | 


--------------------------------------------------------------------------------
/lib/copy_gauge_quarter.cu:
--------------------------------------------------------------------------------
 1 | #include "copy_gauge_inc.cu"
 2 | namespace quda
 3 | {
 4 | 
 5 | #if QUDA_PRECISION & 1
 6 |   // this is the function that is actually called, from here on down we instantiate all required templates
 7 |   void copyGenericGaugeQuarterIn(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out,
 8 |                                  void *In, void **ghostOut, void **ghostIn, int type)
 9 |   {
10 |     copyGenericGauge<int8_t>(out, in, location, Out, In, ghostOut, ghostIn, type);
11 |   }
12 | #else
13 |   void copyGenericGaugeQuarterIn(GaugeField &, const GaugeField &, QudaFieldLocation, void *, void *, void **, void **, int)
14 |   {
15 |     errorQuda("QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
16 |   }
17 | #endif
18 | 
19 | } // namespace quda
20 | 


--------------------------------------------------------------------------------
/lib/copy_gauge_single.cu:
--------------------------------------------------------------------------------
 1 | #include "copy_gauge_inc.cu"
 2 | namespace quda {
 3 | 
 4 | #if QUDA_PRECISION & 4
 5 |   // this is the function that is actually called, from here on down we instantiate all required templates
 6 |   void copyGenericGaugeSingleIn(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out, void *In,
 7 |                                 void **ghostOut, void **ghostIn, int type)
 8 |   {
 9 |     copyGenericGauge<float>(out, in, location, Out, In, ghostOut, ghostIn, type);
10 |   }
11 | #else
12 |   void copyGenericGaugeSingleIn(GaugeField &, const GaugeField &, QudaFieldLocation, void *, void *, void **, void **, int)
13 |   {
14 |     errorQuda("QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
15 |   }
16 | #endif
17 | 
18 | } // namespace quda
19 | 


--------------------------------------------------------------------------------
/lib/device_vector.cu:
--------------------------------------------------------------------------------
 1 | #include <device_vector.h>
 2 | #include <kernels/device_vector_axpby.cuh>
 3 | 
 4 | namespace quda
 5 | {
 6 | 
 7 |   template <class T> struct axpby_wrapper : TunableKernel1D {
 8 | 
 9 |     AxpbyArg<T> arg;
10 | 
11 |     device_vector<T> _backup_vector;
12 |     T *_backup_ptr = nullptr;
13 | 
14 |     axpby_wrapper(T *out, T a, const T *x, T b, const T *y, int size) :
15 |       TunableKernel1D(size, QUDA_CUDA_FIELD_LOCATION), arg(out, a, x, b, y, size)
16 |     {
17 |       strcat(aux, ",axpby");
18 |       apply(device::get_default_stream());
19 |     }
20 | 
21 |     void apply(const qudaStream_t &stream)
22 |     {
23 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
24 |       launch<Axpby>(tp, stream, arg);
25 |     }
26 | 
27 |     void preTune()
28 |     {
29 |       _backup_vector.resize(arg.threads.x);
30 |       _backup_vector.from_device(arg.out);
31 |       _backup_ptr = arg.out;
32 |       arg.out = _backup_vector.data();
33 |     }
34 | 
35 |     void postTune() { arg.out = _backup_ptr; }
36 | 
37 |     unsigned int minThreads() const { return arg.threads.x; }
38 |     long long flops() const { return 4 * arg.threads.x; }
39 |     long long bytes() const { return 3 * sizeof(T) * arg.threads.x; }
40 |   };
41 | 
42 |   void axpby(device_vector<float> &out, float a, const device_vector<float> &x, float b, const device_vector<float> &y)
43 |   {
44 |     axpby_wrapper<float> w(out.data(), a, x.data(), b, y.data(), out.size());
45 |   }
46 | 
47 | } // namespace quda
48 | 


--------------------------------------------------------------------------------
/lib/dslash_constant_arg.cu:
--------------------------------------------------------------------------------
 1 | /**
 2 |    @file dslash_constant_buffer.cu
 3 | 
 4 |    @brief When NVSHMEM is enabled, the dslash kernels use RDC which
 5 |    means we have to be careful with the __constant__ buffer
 6 |    declarations in dslash kernels that include constant_kernel_arg.h,
 7 |    since they will collide.  To avoid this, all files that include
 8 |    constant_kernel_arg will mark the buffer as extern, with the
 9 |    definition being this file.
10 | */
11 | 
12 | #define QUDA_CONSTANT_DEFINE
13 | #include <constant_kernel_arg.h>
14 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | /**
 3 |    This is the gauged domain-wall 4-d preconditioned operator.
 4 | 
 5 |    Note, for now, this just applies a batched 4-d dslash across the fifth
 6 |    dimension.
 7 | */
 8 | 
 9 | namespace quda
10 | {
11 | 
12 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApply {
13 |     DomainWall4DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_5,
15 |                       const Complex *b_5, const Complex *c_5, int parity, bool dagger, const int *comm_override,
16 |                       TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the 4-d preconditioned domain-wall Dslash operator
20 |   // out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
21 |   void ApplyDomainWall4D(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
22 |                          const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
23 |                          cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
24 |                          TimeProfile &profile)
25 |   {
26 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>() || is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
27 |       instantiate<DomainWall4DApply>(out, in, x, U, a, m_5, b_5, c_5, parity, dagger, comm_override, profile);
28 |     } else {
29 |       errorQuda("Domain-wall dslash has not been built");
30 |     }
31 |   }
32 | 
33 | } // namespace quda
34 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_domain_wall_4d.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct DomainWall4DApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_fused_m5.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_domain_wall_4d_fused_m5.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   constexpr Dslash5Type dslash5_type = Dslash5Type::@QUDA_DSLASH5_TYPE@;
11 | 
12 |   using DDArg = @QUDA_DSLASH_DDARG@;
13 |   using Float = precision_type_mapper<precision>::type;
14 | 
15 |   template struct DomainWall4DApplyFusedM5<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
16 | 
17 |   template DomainWall4DApplyFusedM5<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::DomainWall4DApplyFusedM5(
18 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
19 |     const GaugeField &U, cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a, double m_5,
20 |     int parity, bool dagger, const int *comm_override, double m_f, Dslash5TypeList<dslash5_type>, TimeProfile &profile);
21 | 
22 | } // namespace quda
23 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_m5inv.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApplyFusedM5 {
11 |     template <Dslash5Type dslash5_type_impl, Dslash5Type... N>
12 |     DomainWall4DApplyFusedM5(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
14 |                              cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a,
15 |                              double m_5, int parity, bool dagger, const int *comm_override, double m_f,
16 |                              Dslash5TypeList<dslash5_type_impl, N...>, TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the 4-d preconditioned domain-wall Dslash operator
20 |   //   i.e. out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
21 |   // ... and then m5inv
22 |   void ApplyDomainWall4DM5inv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
23 |                               const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
24 |                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
25 |                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
26 |   {
27 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
28 |       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
29 |       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
30 |                                             dummy_list, profile);
31 |     } else {
32 |       errorQuda("Domain-wall operator has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_m5inv.hpp:
--------------------------------------------------------------------------------
 1 | #include <dslash_domain_wall_4d_fused_m5.hpp>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   // Apply the 4-d preconditioned domain-wall Dslash operator
11 |   //   i.e. out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
12 |   // ... and then m5inv
13 |   void ApplyDomainWall4DM5inv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                               const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
15 |                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
16 |                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
17 |   {
18 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
19 |       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
20 |       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
21 |                                             dummy_list, profile);
22 |     } else {
23 |       errorQuda("Domain-wall operator has not been built");
24 |     }
25 |   }
26 | 
27 | } // namespace quda
28 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_m5inv.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_domain_wall_4d_fused_m5.hpp>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   // Apply the 4-d preconditioned domain-wall Dslash operator
11 |   //   i.e. out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
12 |   // ... and then m5inv
13 |   void ApplyDomainWall4DM5inv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                               const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
15 |                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
16 |                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
17 |   {
18 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
19 |       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
20 |       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
21 |                                             dummy_list, profile);
22 |     } else {
23 |       errorQuda("Domain-wall operator has not been built");
24 |     }
25 |   }
26 | 
27 | } // namespace quda
28 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_m5mob.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApplyFusedM5 {
11 |     template <Dslash5Type dslash5_type_impl, Dslash5Type... N>
12 |     DomainWall4DApplyFusedM5(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
14 |                              cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a,
15 |                              double m_5, int parity, bool dagger, const int *comm_override, double m_f,
16 |                              Dslash5TypeList<dslash5_type_impl, N...>, TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the 4-d preconditioned domain-wall Dslash operator
20 |   //   i.e. out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
21 |   // ... and then m5
22 |   void ApplyDomainWall4DM5mob(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
23 |                               const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
24 |                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
25 |                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
26 |   {
27 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
28 |       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS>();
29 |       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
30 |                                             dummy_list, profile);
31 |     } else {
32 |       errorQuda("Domain-wall operator has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_4d_m5pre.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApplyFusedM5 {
11 |     template <Dslash5Type dslash5_type_impl, Dslash5Type... N>
12 |     DomainWall4DApplyFusedM5(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
14 |                              cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a,
15 |                              double m_5, int parity, bool dagger, const int *comm_override, double m_f,
16 |                              Dslash5TypeList<dslash5_type_impl, N...>, TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the 4-d preconditioned domain-wall Dslash operator
20 |   //   i.e. out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
21 |   // ... and then m5pre
22 |   void ApplyDomainWall4DM5pre(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
23 |                               const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
24 |                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
25 |                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
26 |   {
27 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
28 |       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS_PRE>();
29 |       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
30 |                                             dummy_list, profile);
31 |     } else {
32 |       errorQuda("Domain-wall operator has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_5d.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged domain-wall 5-d preconditioned operator.
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall5DApply {
11 |     DomainWall5DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
12 |                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_f, int parity,
13 |                       bool dagger, const int *comm_override, TimeProfile &profile);
14 |   };
15 | 
16 |   // Apply the 5-d preconditioned domain-wall Dslash operator
17 |   // out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
18 |   void ApplyDomainWall5D(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
19 |                          const GaugeField &U, double a, double m_f, cvector_ref<const ColorSpinorField> &x, int parity,
20 |                          bool dagger, const int *comm_override, TimeProfile &profile)
21 |   {
22 |     if constexpr (is_enabled<QUDA_DOMAIN_WALL_DSLASH>()) {
23 |       instantiate<DomainWall5DApply>(out, in, x, U, a, m_f, parity, dagger, comm_override, profile);
24 |     } else {
25 |       errorQuda("Domain-wall operator has not been built");
26 |     }
27 |   }
28 | 
29 | } // namespace quda
30 | 


--------------------------------------------------------------------------------
/lib/dslash_domain_wall_5d.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_domain_wall_5d.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct DomainWall5DApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_improved_staggered.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is a staggered Dirac operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon_l> struct ImprovedStaggeredApply {
11 |     ImprovedStaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
12 |                            cvector_ref<const ColorSpinorField> &x, const GaugeField &L, const GaugeField &U, double a,
13 |                            int parity, bool dagger, const int *comm_override, TimeProfile &profile);
14 |   };
15 | 
16 |   void ApplyImprovedStaggered(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
17 |                               const GaugeField &U, const GaugeField &L, double a, cvector_ref<const ColorSpinorField> &x,
18 |                               int parity, bool dagger, const int *comm_override, TimeProfile &profile)
19 |   {
20 |     if constexpr (is_enabled<QUDA_ASQTAD_DSLASH>()) {
21 |       for (int i = 0; i < 4; i++) {
22 |         if (comm_dim_partitioned(i) && (U.X()[i] < 6)) {
23 |           errorQuda("partitioned dimension with local size less than 6 is not supported in improved staggered dslash");
24 |         }
25 |       }
26 |       // L must be first gauge field argument since we template on long reconstruct
27 |       instantiate<ImprovedStaggeredApply, ReconstructStaggered>(out, in, x, L, U, a, parity, dagger, comm_override,
28 |                                                                 profile);
29 |     } else {
30 |       errorQuda("Improved staggered operator has not been built");
31 |     }
32 |   }
33 | 
34 | } // namespace quda
35 | 


--------------------------------------------------------------------------------
/lib/dslash_improved_staggered.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_improved_staggered.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 6 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 7 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 8 | 
 9 |   using DDArg = @QUDA_DSLASH_DDARG@;
10 |   using Float = precision_type_mapper<precision>::type;
11 | 
12 |   template struct ImprovedStaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>;
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/dslash_mdw_fused.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_mdw_fused.hpp>
 2 | #include <dslash_mdw_fused_impl.hpp>
 3 | 
 4 | namespace quda
 5 | {
 6 |   namespace mobius_tensor_core
 7 |   {
 8 |     // clang-format off
 9 |     constexpr int Ls = @QUDA_MDW_FUSED_LS@;
10 |     // clang-format on
11 | #if defined(GPU_DOMAIN_WALL_DIRAC) && defined(QUDA_MMA_AVAILABLE)
12 |     template <>
13 |     void apply_fused_dslash_impl<Ls>(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
14 |                                      ColorSpinorField &y, const ColorSpinorField &x, double m_f, double m_5,
15 |                                      const Complex *b_5, const Complex *c_5, bool dagger, int parity, int shift[4],
16 |                                      int halo_shift[4], MdwfFusedDslashType type)
17 |     {
18 |       checkLocation(out, in); // check all locations match
19 |       instantiatePreconditioner<FusedDslashLs<Ls>::type>(out, in, x, U, y, m_f, m_5, b_5, c_5, dagger, parity, shift,
20 |                                                          halo_shift, type);
21 |     }
22 | #else
23 |     template <>
24 |     void apply_fused_dslash_impl<Ls>(ColorSpinorField &, const ColorSpinorField &, const GaugeField &,
25 |                                      ColorSpinorField &, const ColorSpinorField &, double, double, const Complex *,
26 |                                      const Complex *, bool, int, int[4], int[4], MdwfFusedDslashType)
27 |     {
28 |       errorQuda("Domain wall dslash with tensor cores has not been built");
29 |     }
30 | #endif
31 |   } // namespace mobius_tensor_core
32 | } // namespace quda
33 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_clover.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged non-degenerate twisted-clover operator acting on a
 5 |    quark doublet.
 6 | */
 7 | 
 8 | namespace quda
 9 | {
10 | 
11 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct NdegTwistedCloverApply {
12 |     NdegTwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                            cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
14 |                            double b, double c, int parity, bool dagger, const int *comm_override, TimeProfile &profile);
15 |   };
16 | 
17 |   void ApplyNdegTwistedClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
18 |                               const GaugeField &U, const CloverField &A, double a, double b, double c,
19 |                               cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
20 |                               TimeProfile &profile)
21 |   {
22 |     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
23 |       instantiate<NdegTwistedCloverApply>(out, in, x, U, A, a, b, c, parity, dagger, comm_override, profile);
24 |     } else {
25 |       errorQuda("Non-degenerate twisted-clover operator has not been built");
26 |     }
27 |   }
28 | 
29 | } // namespace quda
30 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_clover.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_ndeg_twisted_clover.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct NdegTwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_clover_preconditioned.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged preconditioned twisted-clover operator
 5 |    acting on a non-degenerate quark doublet.
 6 | */
 7 | 
 8 | namespace quda
 9 | {
10 | 
11 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
12 |   struct NdegTwistedCloverPreconditionedApply {
13 |     NdegTwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                                          cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
15 |                                          const CloverField &A, double a, double b, double c, bool xpay, int parity,
16 |                                          bool dagger, const int *comm_override, TimeProfile &profile);
17 |   };
18 | 
19 |   void ApplyNdegTwistedCloverPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
20 |                                             const GaugeField &U, const CloverField &A, double a, double b, double c,
21 |                                             bool xpay, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
22 |                                             const int *comm_override, TimeProfile &profile)
23 |   {
24 |     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
25 |       instantiate<NdegTwistedCloverPreconditionedApply>(out, in, x, U, A, a, b, c, xpay, parity, dagger, comm_override,
26 |                                                         profile);
27 |     } else {
28 |       errorQuda("Non-degenerate preconditioned twisted-clover operator has not been built");
29 |     }
30 |   }
31 | 
32 | } // namespace quda
33 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_clover_preconditioned.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_ndeg_twisted_clover_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct NdegTwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_mass.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the gauged twisted-mass operator acting on a non-generate
 5 |    quark doublet.
 6 | */
 7 | 
 8 | namespace quda
 9 | {
10 | 
11 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct NdegTwistedMassApply {
12 |     NdegTwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                          cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, double c,
14 |                          int parity, bool dagger, const int *comm_override, TimeProfile &profile);
15 |   };
16 | 
17 |   void ApplyNdegTwistedMass(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
18 |                             const GaugeField &U, double a, double b, double c, cvector_ref<const ColorSpinorField> &x,
19 |                             int parity, bool dagger, const int *comm_override, TimeProfile &profile)
20 |   {
21 |     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
22 |       instantiate<NdegTwistedMassApply>(out, in, x, U, a, b, c, parity, dagger, comm_override, profile);
23 |     } else {
24 |       errorQuda("Non-degenerate twisted-mass operator has not been built");
25 |     }
26 |   }
27 | 
28 | } // namespace quda
29 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_mass.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_ndeg_twisted_mass.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct NdegTwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_mass_preconditioned.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the preconditioned twisted-mass operator acting on a non-generate
 5 |    quark doublet.
 6 | */
 7 | 
 8 | namespace quda
 9 | {
10 | 
11 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
12 |   struct NdegTwistedMassPreconditionedApply {
13 |     NdegTwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                                        cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
15 |                                        double c, bool xpay, int parity, bool dagger, bool asymmetric,
16 |                                        const int *comm_override, TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the non-degenerate twisted-mass Dslash operator
20 |   // out(x) = M*in = a*(1 + i*b*gamma_5*tau_3 + c*tau_1)*D + x
21 |   // Uses the kappa normalization for the Wilson operator, with a = -kappa.
22 |   void ApplyNdegTwistedMassPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
23 |                                           const GaugeField &U, double a, double b, double c, bool xpay,
24 |                                           cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
25 |                                           bool asymmetric, const int *comm_override, TimeProfile &profile)
26 |   {
27 |     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
28 |       instantiate<NdegTwistedMassPreconditionedApply>(out, in, x, U, a, b, c, xpay, parity, dagger, asymmetric,
29 |                                                       comm_override, profile);
30 |     } else {
31 |       errorQuda("Non-degenerate preconditioned twisted-mass dslash has not been built");
32 |     }
33 |   }
34 | 
35 | } // namespace quda
36 | 


--------------------------------------------------------------------------------
/lib/dslash_ndeg_twisted_mass_preconditioned.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_ndeg_twisted_mass_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct NdegTwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_staggered.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is a staggered Dirac operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon_u> struct StaggeredApply {
11 |     StaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
12 |                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, int parity, bool dagger,
13 |                    const int *comm_override, TimeProfile &profile);
14 |   };
15 | 
16 |   void ApplyStaggered(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
17 |                       double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
18 |                       const int *comm_override, TimeProfile &profile)
19 |   {
20 |     if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
21 |       instantiate<StaggeredApply, ReconstructStaggered>(out, in, x, U, a, parity, dagger, comm_override, profile);
22 |     } else {
23 |       errorQuda("Staggered operator has not been built");
24 |     }
25 |   }
26 | 
27 | } // namespace quda
28 | 


--------------------------------------------------------------------------------
/lib/dslash_staggered.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_staggered.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 6 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 7 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 8 | 
 9 |   using DDArg = @QUDA_DSLASH_DDARG@;
10 |   using Float = precision_type_mapper<precision>::type;
11 | 
12 |   template struct StaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>;
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_clover.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the basic gauged twisted-clover operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct TwistedCloverApply {
11 |     TwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
12 |                        cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C, double a,
13 |                        double b, int parity, bool dagger, const int *comm_override, TimeProfile &profile);
14 |   };
15 | 
16 |   // Apply the twisted-mass Dslash operator
17 |   // out(x) = M*in = (A + i*b*gamma_5)*in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
18 |   // Uses the kappa normalization for the Wilson operator, with a = -kappa.
19 |   void ApplyTwistedClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
20 |                           const GaugeField &U, const CloverField &C, double a, double b,
21 |                           cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
22 |                           TimeProfile &profile)
23 |   {
24 |     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
25 |       instantiate<TwistedCloverApply>(out, in, x, U, C, a, b, parity, dagger, comm_override, profile);
26 |     } else {
27 |       errorQuda("Twisted-clover operator has not been built");
28 |     }
29 |   }
30 | 
31 | } // namespace quda
32 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_clover.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_twisted_clover.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct TwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_clover_preconditioned.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the preconditioned gauged twisted-mass operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
11 |   struct TwistedCloverPreconditionedApply {
12 |     TwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C,
14 |                                      double a, double b, bool xpay, int parity, bool dagger, const int *comm_override,
15 |                                      TimeProfile &profile);
16 |   };
17 | 
18 |   /*
19 |     Apply the preconditioned twisted-mass Dslash operator
20 | 
21 |     out = x + a*A^{-1} D * in = x + a*(C + i*b*gamma_5)^{-1}*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
22 |   */
23 |   void ApplyTwistedCloverPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
24 |                                         const GaugeField &U, const CloverField &C, double a, double b, bool xpay,
25 |                                         cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
26 |                                         const int *comm_override, TimeProfile &profile)
27 |   {
28 |     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
29 |       instantiate<TwistedCloverPreconditionedApply>(out, in, x, U, C, a, b, xpay, parity, dagger, comm_override, profile);
30 |     } else {
31 |       errorQuda("Twisted-clover operator has not been built");
32 |     }
33 |   }
34 | 
35 | } // namespace quda
36 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_clover_preconditioned.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_twisted_clover_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct TwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_mass.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct TwistedMassApply {
 7 |     TwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
 8 |                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, int parity,
 9 |                      bool dagger, const int *comm_override, TimeProfile &profile);
10 |   };
11 | 
12 |   // Apply the twisted-mass Dslash operator
13 |   // out(x) = M*in = (1 + i*b*gamma_5)*in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
14 |   // Uses the kappa normalization for the Wilson operator, with a = -kappa.
15 |   void ApplyTwistedMass(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
16 |                         const GaugeField &U, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity,
17 |                         bool dagger, const int *comm_override, TimeProfile &profile)
18 |   {
19 |     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
20 |       instantiate<TwistedMassApply>(out, in, x, U, a, b, parity, dagger, comm_override, profile);
21 |     } else {
22 |       errorQuda("Twisted-mass operator has not been built");
23 |     }
24 |   }
25 | 
26 | } // namespace quda
27 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_mass.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_twisted_mass.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 6 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 7 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 8 | 
 9 |   using DDArg = @QUDA_DSLASH_DDARG@;
10 |   using Float = precision_type_mapper<precision>::type;
11 | 
12 |   template struct TwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
13 | 
14 | } // namespace quda
15 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_mass_preconditioned.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the preconditioned gauged twisted-mass operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
11 |   struct TwistedMassPreconditionedApply {
12 |     TwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
14 |                                    bool xpay, int parity, bool dagger, bool asymmetric, const int *comm_override,
15 |                                    TimeProfile &profile);
16 |   };
17 | 
18 |   /*
19 |     Apply the preconditioned twisted-mass Dslash operator
20 | 
21 |     out = x + A^{-1} D * in = x + a*(1 + i*b*gamma_5)*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
22 |   */
23 |   void ApplyTwistedMassPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
24 |                                       const GaugeField &U, double a, double b, bool xpay,
25 |                                       cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, bool asymmetric,
26 |                                       const int *comm_override, TimeProfile &profile)
27 |   {
28 |     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
29 |       instantiate<TwistedMassPreconditionedApply>(out, in, x, U, a, b, xpay, parity, dagger, asymmetric, comm_override,
30 |                                                   profile);
31 |     } else {
32 |       errorQuda("Twisted-mass operator has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_twisted_mass_preconditioned.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_twisted_mass_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   typedef @QUDA_DSLASH_DDARG@ DDArg;
11 |   typedef precision_type_mapper<precision>::type Float;
12 | 
13 |   template struct TwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonApply {
 7 |     template <bool distance_pc>
 8 |     WilsonApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
 9 |                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double alpha0, int t0, int parity,
10 |                 bool dagger, const int *comm_override, std::integral_constant<bool, distance_pc>, TimeProfile &profile);
11 |   };
12 | 
13 |   // Apply the Wilson operator
14 |   // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
15 |   // Uses the a normalization for the Wilson operator.
16 |   void ApplyWilson(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
17 |                    double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
18 |                    TimeProfile &profile)
19 |   {
20 |     if (in.Ndim() == 5) errorQuda("Unexpected nDim = 5");
21 |     if constexpr (is_enabled<QUDA_WILSON_DSLASH>()) {
22 |       auto dummy = DistanceType<false>();
23 |       instantiate<WilsonApply>(out, in, x, U, a, 0, -1, parity, dagger, comm_override, dummy, profile);
24 |     } else {
25 |       errorQuda("Wilson operator has not been built");
26 |     }
27 |   }
28 | 
29 | } // namespace quda
30 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson.hpp:
--------------------------------------------------------------------------------
 1 | #include <gauge_field.h>
 2 | #include <color_spinor_field.h>
 3 | #include <dslash.h>
 4 | #include <worker.h>
 5 | 
 6 | #include <dslash_policy.hpp>
 7 | #include <kernels/dslash_wilson.cuh>
 8 | 
 9 | /**
10 |    This is the basic gauged Wilson operator
11 |    TODO
12 |    - gauge fix support
13 | */
14 | 
15 | namespace quda
16 | {
17 | 
18 |   template <typename Arg> class Wilson : public Dslash<wilson, Arg>
19 |   {
20 |     using Dslash = Dslash<wilson, Arg>;
21 | 
22 |   public:
23 |     Wilson(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
24 |            const ColorSpinorField &halo) :
25 |       Dslash(arg, out, in, halo)
26 |     {
27 |     }
28 | 
29 |     void apply(const qudaStream_t &stream)
30 |     {
31 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
32 |       Dslash::setParam(tp);
33 |       Dslash::template instantiate<packShmem>(tp, stream);
34 |     }
35 |   };
36 | 
37 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonApply {
38 | 
39 |     template <bool distance_pc>
40 |     WilsonApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
41 |                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double alpha0, int t0,
42 |                 int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
43 |     {
44 |       constexpr int nDim = 4;
45 |       auto halo = ColorSpinorField::create_comms_batch(in);
46 |       WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, a, x, parity, dagger,
47 |                                                                     comm_override, alpha0, t0);
48 |       Wilson<decltype(arg)> wilson(arg, out, in, halo);
49 |       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
50 |     }
51 |   };
52 | 
53 | } // namespace quda
54 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 |   template WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonApply(
16 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
17 |     const GaugeField &U, double a, double alpha0, int t0, int parity, bool dagger, const int *comm_override,
18 |     DistanceType<false>, TimeProfile &profile);
19 | 
20 | } // namespace quda
21 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonCloverApply {
11 |     template <bool distance_pc>
12 |     WilsonCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
14 |                       double alpha0, int t0, int parity, bool dagger, const int *comm_override,
15 |                       DistanceType<distance_pc>, TimeProfile &profile);
16 |   };
17 | 
18 |   // Apply the Wilson-clover operator
19 |   // out(x) = M*in = (A(x)*in(x) + a * \sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
20 |   // Uses the kappa normalization for the Wilson operator.
21 |   void ApplyWilsonClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
22 |                          const GaugeField &U, const CloverField &A, double a, cvector_ref<const ColorSpinorField> &x,
23 |                          int parity, bool dagger, const int *comm_override, TimeProfile &profile)
24 |   {
25 |     if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
26 |       auto dummy = DistanceType<false>();
27 |       instantiate<WilsonCloverApply>(out, in, x, U, A, a, 0, -1, parity, dagger, comm_override, dummy, profile);
28 |     } else {
29 |       errorQuda("Wilson-clover operator has not been built");
30 |     }
31 |   }
32 | 
33 | } // namespace quda
34 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover.hpp>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
11 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
12 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
13 | 
14 |   using DDArg = @QUDA_DSLASH_DDARG@;
15 |   using Float = precision_type_mapper<precision>::type;
16 | 
17 |   template struct WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
18 | 
19 |   template WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverApply(
20 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
21 |     const GaugeField &U, const CloverField &A, double a, double alpha0, int t0, int parity, bool dagger,
22 |     const int *comm_override, DistanceType<false>, TimeProfile &profile);
23 | 
24 | } // namespace quda
25 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_distance.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonCloverApply {
11 |     template <bool distance_pc>
12 |     WilsonCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
13 |                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
14 |                       double alpha0, int t0, int parity, bool dagger, const int *comm_override,
15 |                       DistanceType<distance_pc>, TimeProfile &profile);
16 |   };
17 | 
18 |   // Apply the distance preconditioned Wilson-clover operator
19 |   // out(x) = M*in = a * A(x)^{-1} [ \sum_i U_i(x)in(x+\hat{i}) + U^\dagger_i(x-\hat{i})in(x-\hat{i})
20 |   //                               + fwd(x_4)*U_4(x)in(x+\hat{4}) + bwd(x_4)*U^\dagger_4(x-\hat{4})in(x-\hat{4}) ]
21 |   // with fwd(t)=\alpha(t+1)/\alpha(t), bwd(t)=\alpha(t+1)/\alpha(t), \alpha(t)=\cosh(\alpha_0*((t-t_0)%L_t-L_t/2))
22 |   // Uses the kappa normalization for the Wilson operator.
23 |   void ApplyWilsonCloverDistance(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
24 |                                  const GaugeField &U, const CloverField &A, double a, double alpha0, int t0,
25 |                                  cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
26 |                                  const int *comm_override, TimeProfile &profile)
27 |   {
28 |     if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>() && is_enabled_distance_precondition()) {
29 |       auto dummy = DistanceType<true>();
30 |       instantiate<WilsonCloverApply>(out, in, x, U, A, a, alpha0, t0, parity, dagger, comm_override, dummy, profile);
31 |     } else {
32 |       errorQuda("Wilson-clover operator with distance preconditioning has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_distance.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover.hpp>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
11 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
12 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
13 | 
14 |   using DDArg = @QUDA_DSLASH_DDARG@;
15 |   using Float = precision_type_mapper<precision>::type;
16 | 
17 |   template struct WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
18 | 
19 |   template WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverApply(
20 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
21 |     const GaugeField &U, const CloverField &A, double a, double alpha0, int t0, int parity, bool dagger,
22 |     const int *comm_override, DistanceType<true>, TimeProfile &profile);
23 | 
24 | } // namespace quda
25 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_hasenbusch_twist.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
 7 |   struct WilsonCloverHasenbuschTwistApply {
 8 |     WilsonCloverHasenbuschTwistApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
 9 |                                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
10 |                                      double a, double b, int parity, bool dagger, const int *comm_override,
11 |                                      TimeProfile &profile);
12 |   };
13 | 
14 |   // Apply the Wilson-clover operator
15 |   // out(x) = M*in = (A(x) + a * \sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
16 |   // Uses the kappa normalization for the Wilson operator.
17 |   void ApplyWilsonCloverHasenbuschTwist(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
18 |                                         const GaugeField &U, const CloverField &A, double a, double b,
19 |                                         cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
20 |                                         const int *comm_override, TimeProfile &profile)
21 |   {
22 |     if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
23 |       instantiate<WilsonCloverHasenbuschTwistApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, profile);
24 |     } else {
25 |       errorQuda("Clover Hasensbuch Twist operator has not been built");
26 |     }
27 |   }
28 | 
29 | } // namespace quda
30 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_hasenbusch_twist.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover_hasenbusch_twist.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct WilsonCloverHasenbuschTwistApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_clovinv.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct WilsonCloverHasenbuschTwistPCClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_no_clovinv.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct WilsonCloverHasenbuschTwistPCNoClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_preconditioned.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover preconditioned linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
11 |   struct WilsonCloverPreconditionedApply {
12 |     template <bool distance_pc>
13 |     WilsonCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
14 |                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
15 |                                     double a, double alpha0, int t0, int parity, bool dagger, const int *comm_override,
16 |                                     DistanceType<distance_pc>, TimeProfile &profile);
17 |   };
18 | 
19 |   // Apply the preconditioned Wilson-clover operator
20 |   // out(x) = M*in = a * A(x)^{-1} (\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
21 |   // Uses the kappa normalization for the Wilson operator.
22 |   void ApplyWilsonCloverPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
23 |                                        const GaugeField &U, const CloverField &A, double a,
24 |                                        cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
25 |                                        const int *comm_override, TimeProfile &profile)
26 |   {
27 |     if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
28 |       auto dummy = DistanceType<false>();
29 |       instantiate<WilsonCloverPreconditionedApply>(out, in, x, U, A, a, 0.0, -1, parity, dagger, comm_override, dummy,
30 |                                                    profile);
31 |     } else {
32 |       errorQuda("Wilson-clover operator has not been built");
33 |     }
34 |   }
35 | 
36 | } // namespace quda
37 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_preconditioned.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover_preconditioned.hpp>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
11 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
12 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
13 | 
14 |   using DDArg = @QUDA_DSLASH_DDARG@;
15 |   using Float = precision_type_mapper<precision>::type;
16 | 
17 |   template struct WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
18 | 
19 |   template WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverPreconditionedApply(
20 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
21 |     const GaugeField &U, const CloverField &A, double a, double alpha0, int t0, int parity, bool dagger,
22 |     const int *comm_override, DistanceType<false>, TimeProfile &profile);
23 | 
24 | } // namespace quda
25 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_clover_preconditioned_distance.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson_clover_preconditioned.hpp>
 2 | 
 3 | /**
 4 |    This is the Wilson-clover linear operator
 5 | */
 6 | 
 7 | namespace quda
 8 | {
 9 | 
10 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
11 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
12 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
13 | 
14 |   using DDArg = @QUDA_DSLASH_DDARG@;
15 |   using Float = precision_type_mapper<precision>::type;
16 | 
17 |   template struct WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
18 | 
19 |   template WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverPreconditionedApply(
20 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
21 |     const GaugeField &U, const CloverField &A, double a, double alpha0, int t0, int parity, bool dagger,
22 |     const int *comm_override, DistanceType<true>, TimeProfile &profile);
23 | 
24 | } // namespace quda
25 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_distance.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonApply {
 7 |     template <bool distance_pc>
 8 |     WilsonApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
 9 |                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double alpha0, int t0, int parity,
10 |                 bool dagger, const int *comm_override, std::integral_constant<bool, distance_pc>, TimeProfile &profile);
11 |   };
12 | 
13 |   // Apply the distance preconditioned Wilson operator
14 |   // out(x) = M*in = - a*[ \sum_i U_i(x)in(x+\hat{i}) + U^\dagger_i(x-\hat{i})in(x-\hat{i})
15 |   //                     + fwd(x_4)*U_4(x)in(x+\hat{4}) + bwd(x_4)*U^\dagger_4(x-\hat{4})in(x-\hat{4}) ]
16 |   // with fwd(t)=\alpha(t+1)/\alpha(t), bwd(t)=\alpha(t+1)/\alpha(t), \alpha(t)=\cosh(\alpha_0*((t-t_0)%L_t-L_t/2))
17 |   // Uses the a normalization for the Wilson operator.
18 |   void ApplyWilsonDistance(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
19 |                            const GaugeField &U, double a, double alpha0, int t0, cvector_ref<const ColorSpinorField> &x,
20 |                            int parity, bool dagger, const int *comm_override, TimeProfile &profile)
21 |   {
22 |     if (in.Ndim() == 5) errorQuda("Unexpected nDim = 5");
23 |     if constexpr (is_enabled<QUDA_WILSON_DSLASH>() && is_enabled_distance_precondition()) {
24 |       auto dummy = DistanceType<true>();
25 |       instantiate<WilsonApply>(out, in, x, U, a, alpha0, t0, parity, dagger, comm_override, dummy, profile);
26 |     } else {
27 |       errorQuda("Wilson operator with distance preconditioning has not been built");
28 |     }
29 |   }
30 | 
31 | } // namespace quda
32 | 


--------------------------------------------------------------------------------
/lib/dslash_wilson_distance.in.cu:
--------------------------------------------------------------------------------
 1 | #include <dslash_wilson.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 |   template WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonApply(
16 |     cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
17 |     const GaugeField &U, double a, double alpha0, int t0, int parity, bool dagger, const int *comm_override,
18 |     DistanceType<true>, TimeProfile &profile);
19 | 
20 | } // namespace quda
21 | 


--------------------------------------------------------------------------------
/lib/gauge_field_strength_tensor.cu:
--------------------------------------------------------------------------------
 1 | #include <tunable_nd.h>
 2 | #include <gauge_field.h>
 3 | #include <kernels/field_strength_tensor.cuh>
 4 | #include <instantiate.h>
 5 | 
 6 | namespace quda
 7 | {
 8 | 
 9 |   template <typename Float, int nColor, QudaReconstructType recon> class Fmunu : TunableKernel3D
10 |   {
11 |     GaugeField &f;
12 |     const GaugeField &u;
13 |     unsigned int minThreads() const { return f.VolumeCB(); }
14 |     unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
15 | 
16 |   public:
17 |     Fmunu(const GaugeField &u, GaugeField &f) :
18 |       TunableKernel3D(u, 2, 6),
19 |       f(f),
20 |       u(u)
21 |     {
22 |       strcat(aux, comm_dim_partitioned_string());
23 |       apply(device::get_default_stream());
24 |     }
25 | 
26 |     void apply(const qudaStream_t &stream)
27 |     {
28 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
29 |       launch<ComputeFmunu>(tp, stream, FmunuArg<Float, nColor, recon>(f, u));
30 |     }
31 | 
32 |     long long flops() const { return (2430 + 36) * 6 * f.Volume(); }
33 |     long long bytes() const { return ((16 * u.Reconstruct() + f.Reconstruct()) * 6 * f.Volume() * f.Precision()); }
34 |   };
35 | 
36 |   void computeFmunu(GaugeField &f, const GaugeField &u)
37 |   {
38 |     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
39 |     checkPrecision(f, u);
40 |     instantiate2<Fmunu,ReconstructWilson>(u, f); // u must be first here for correct template instantiation
41 |     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
42 |   }
43 | 
44 | } // namespace quda
45 | 


--------------------------------------------------------------------------------
/lib/gauge_phase.cu:
--------------------------------------------------------------------------------
 1 | #include <comm_quda.h>
 2 | #include <tunable_nd.h>
 3 | #include <instantiate.h>
 4 | #include <kernels/gauge_phase.cuh>
 5 | 
 6 | namespace quda {
 7 | 
 8 |   template <typename Float, int nColor, QudaReconstructType recon>
 9 |   class GaugePhase_ : TunableKernel2D {
10 |     GaugeField &u; // used for meta data only
11 |     unsigned int minThreads() const { return u.VolumeCB(); }
12 | 
13 |   public:
14 |     GaugePhase_(GaugeField &u) :
15 |       TunableKernel2D(u, 2),
16 |       u(u)
17 |     {
18 |       strcat(aux, "phase=");
19 |       apply(device::get_default_stream());
20 |     }
21 | 
22 |     void apply(const qudaStream_t &stream)
23 |     {
24 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
25 |       if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) {
26 |         GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_MILC> arg(u);
27 |         launch<GaugePhase>(tp, stream, arg);
28 |       } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) {
29 |         GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_CHROMA> arg(u);
30 |         launch<GaugePhase>(tp, stream, arg);
31 |       } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
32 |         GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_TIFR> arg(u);
33 |         launch<GaugePhase>(tp, stream, arg);
34 |       } else {
35 |         errorQuda("Undefined phase type %d", u.StaggeredPhase());
36 |       }
37 |     }
38 | 
39 |     void preTune() { u.backup(); }
40 |     void postTune() { u.restore(); }
41 | 
42 |     long long flops() const { return 0; }
43 |     long long bytes() const { return 2 * u.Bytes(); }
44 |   };
45 | 
46 |   void applyGaugePhase(GaugeField &u)
47 |   {
48 |     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
49 |     instantiate<GaugePhase_, ReconstructNone>(u);
50 |     // ensure that ghosts are updated if needed
51 |     if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) u.exchangeGhost();
52 |     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
53 |   }
54 | 
55 | } // namespace quda
56 | 


--------------------------------------------------------------------------------
/lib/gauge_plaq.cu:
--------------------------------------------------------------------------------
 1 | #include <gauge_field.h>
 2 | #include <instantiate.h>
 3 | #include <tunable_reduction.h>
 4 | #include <kernels/gauge_plaq.cuh>
 5 | 
 6 | namespace quda {
 7 | 
 8 |   template<typename Float, int nColor, QudaReconstructType recon>
 9 |   class GaugePlaq : public TunableReduction2D {
10 |     const GaugeField &u;
11 |     array<double, 2> &plq;
12 | 
13 |   public:
14 |     GaugePlaq(const GaugeField &u, array<double, 2> &plq) :
15 |       TunableReduction2D(u),
16 |       u(u),
17 |       plq(plq)
18 |     {
19 |       apply(device::get_default_stream());
20 |     }
21 | 
22 |     void apply(const qudaStream_t &stream)
23 |     {
24 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
25 |       GaugePlaqArg<Float, nColor, recon> arg(u);
26 |       launch<Plaquette>(plq, tp, stream, arg);
27 |       for (int i = 0; i < 2; i++) plq[i] /= 9.*2*arg.threads.x*comm_size();
28 |     }
29 | 
30 |     long long flops() const
31 |     {
32 |       auto Nc = u.Ncolor();
33 |       return 6ll*u.Volume()*(3 * (8 * Nc * Nc * Nc - 2 * Nc * Nc) + Nc);
34 |     }
35 |     long long bytes() const { return u.Bytes(); }
36 |   };
37 | 
38 |   double3 plaquette(const GaugeField &U)
39 |   {
40 |     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
41 |     array<double, 2> plq{0.0, 0.0};
42 |     instantiate<GaugePlaq, ReconstructGauge>(U, plq);
43 |     double3 plaq = make_double3(0.5*(plq[0] + plq[1]), plq[0], plq[1]);
44 |     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
45 |     return plaq;
46 |   }
47 | 
48 | } // namespace quda
49 | 


--------------------------------------------------------------------------------
/lib/gauge_plaqrect.cu:
--------------------------------------------------------------------------------
 1 | #include <gauge_field.h>
 2 | #include <instantiate.h>
 3 | #include <tunable_reduction.h>
 4 | #include <kernels/gauge_plaqrect.cuh>
 5 | 
 6 | namespace quda
 7 | {
 8 | 
 9 |   template <typename Float, int nColor, QudaReconstructType recon> class GaugePlaqRect : public TunableReduction2D
10 |   {
11 |     const GaugeField &u;
12 |     array<double, 4> &plqrct;
13 | 
14 |   public:
15 |     GaugePlaqRect(const GaugeField &u, array<double, 4> &plqrct) : TunableReduction2D(u), u(u), plqrct(plqrct)
16 |     {
17 |       apply(device::get_default_stream());
18 |     }
19 | 
20 |     void apply(const qudaStream_t &stream)
21 |     {
22 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
23 |       GaugePlaqRectArg<Float, nColor, recon> arg(u);
24 |       launch<PlaquetteRectangle>(plqrct, tp, stream, arg);
25 |       // Normalize plaquette and rectangle
26 |       for (int i = 0; i < 2; i++) plqrct[i] /= 9. * 2 * arg.threads.x * comm_size();
27 |       for (int i = 2; i < 4; i++) plqrct[i] /= 9. * 4 * arg.threads.x * comm_size();
28 |     }
29 | 
30 |     long long flops() const
31 |     {
32 |       auto Nc = u.Ncolor();
33 |       // 10 mat-mat multiplies to compute plaquette and 2 rectangles
34 |       // Nc * Nc * (8 * Nc - 2) flops per mat-mat multiply
35 |       // Plus 2 traces ~ 2 * Nc flops
36 |       // All of the above * 6 (number of planes) * volume
37 |       return 6ll * u.Volume() * (10 * Nc * Nc * (8 * Nc - 2) + 2 * Nc);
38 |     }
39 |     long long bytes() const { return u.Bytes(); }
40 |   };
41 | 
42 |   double4 plaquetteRectangle(const GaugeField &U)
43 |   {
44 |     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
45 |     array<double, 4> plqrct {0.0, 0.0, 0.0, 0.0};
46 |     instantiate<GaugePlaqRect, ReconstructGauge>(U, plqrct);
47 |     double4 plaqrect = make_double4(plqrct[0], plqrct[1], plqrct[2], plqrct[3]);
48 |     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
49 |     return plaqrect;
50 |   }
51 | 
52 | } // namespace quda
53 | 


--------------------------------------------------------------------------------
/lib/generate/nvtx.w:
--------------------------------------------------------------------------------
 1 | #include <pthread.h>
 2 | 
 3 | #if QUDA_NVTX_VERSION==3
 4 | #include <nvtx3/nvToolsExt.h>
 5 | #include <nvtx3/nvToolsExtCudaRt.h>
 6 | #else
 7 | #include <nvToolsExt.h>
 8 | #include <nvToolsExtCudaRt.h>
 9 | #endif
10 | 
11 | #include <mpi_comm_handle.h>
12 | // Setup event category name
13 | {{fn name MPI_Init}}
14 |   nvtxNameCategoryA(999, "MPI");
15 |   {{callfn}}
16 |   int rank;
17 |   PMPI_Comm_rank(MPI_COMM_HANDLE, &rank);
18 |   char name[256];
19 |   sprintf( name, "MPI Rank %d", rank );
20 |  
21 |   nvtxNameOsThread(pthread_self(), name);
22 |   nvtxNameCudaDeviceA(rank, name);
23 | {{endfn}}
24 | // Wrap select MPI functions with NVTX ranges
25 | {{fn name MPI_Send MPI_Recv MPI_Allreduce MPI_Reduce MPI_Wait MPI_Waitany
26 | MPI_Waitall MPI_Waitsome MPI_Gather MPI_Gatherv MPI_Scatter MPI_Scatterv
27 | MPI_Allgather MPI_Allgatherv MPI_Alltoall MPI_Alltoallv MPI_Alltoallw MPI_Bcast
28 | MPI_Sendrecv MPI_Barrier MPI_Start MPI_Test MPI_Send_init MPI_Recv_init }}
29 |   nvtxEventAttributes_t eventAttrib = {0};
30 |   eventAttrib.version = NVTX_VERSION;
31 |   eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
32 |   eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
33 |   eventAttrib.message.ascii  = "{{name}}";
34 |   eventAttrib.category = 999;
35 |   eventAttrib.colorType = NVTX_COLOR_ARGB; \
36 |   eventAttrib.color = 0xffffaa00; \
37 |  
38 |   nvtxRangePushEx(&eventAttrib);
39 |   {{callfn}}
40 |   nvtxRangePop();
41 | {{endfn}}
42 | 


--------------------------------------------------------------------------------
/lib/instantiate.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate.h>
 2 | 
 3 | /**
 4 |    This file contains deinitions required when compiling with C++14.
 5 |    Without these, we can end up with undefined references at link
 6 |    time.  We can remove this file when we jump to C++17 and declare
 7 |    these are inline variables in instantiate.h.
 8 |  */
 9 | 
10 | namespace quda
11 | {
12 | 
13 |   // declared in instantiate.h
14 |   constexpr std::array<QudaReconstructType, 6> ReconstructFull::recon;
15 |   constexpr std::array<QudaReconstructType, 3> ReconstructWilson::recon;
16 |   constexpr std::array<QudaReconstructType, 3> ReconstructStaggered::recon;
17 |   constexpr std::array<QudaReconstructType, 2> ReconstructNo12::recon;
18 |   constexpr std::array<QudaReconstructType, 1> ReconstructNone::recon;
19 |   constexpr std::array<QudaReconstructType, 2> ReconstructMom::recon;
20 |   constexpr std::array<QudaReconstructType, 1> Reconstruct10::recon;
21 | 
22 | } // namespace quda
23 | 


--------------------------------------------------------------------------------
/lib/interface/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # add interface files / options
2 | target_sources(quda_cpp PRIVATE blas_interface.cpp)
3 | 
4 | if(BUILD_FORTRAN_INTERFACE)
5 |   target_sources(quda_cpp PRIVATE fortran_interface.cpp)
6 | endif()
7 | 


--------------------------------------------------------------------------------
/lib/laplace.cpp:
--------------------------------------------------------------------------------
 1 | #include <instantiate_dslash.h>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct LaplaceApply {
 7 |     LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
 8 |                  cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b, int parity,
 9 |                  const int *comm_override, TimeProfile &profile);
10 |   };
11 | 
12 |   // Apply the Laplace operator
13 |   // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x)
14 |   // Omits direction 'dir' from the operator.
15 |   void ApplyLaplace(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
16 |                     int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity,
17 |                     const int *comm_override, TimeProfile &profile)
18 |   {
19 |     if constexpr (is_enabled<QUDA_LAPLACE_DSLASH>()) {
20 |       instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, comm_override, profile);
21 |     } else {
22 |       errorQuda("Laplace operator has not been enabled");
23 |     }
24 |   }
25 | } // namespace quda
26 | 


--------------------------------------------------------------------------------
/lib/laplace.in.cu:
--------------------------------------------------------------------------------
 1 | #include <laplace.hpp>
 2 | 
 3 | namespace quda
 4 | {
 5 | 
 6 |   constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
 7 |   constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
 8 |   constexpr int reconI = @QUDA_DSLASH_RECONI@;
 9 | 
10 |   using DDArg = @QUDA_DSLASH_DDARG@;
11 |   using Float = precision_type_mapper<precision>::type;
12 | 
13 |   template struct LaplaceApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
14 | 
15 | } // namespace quda
16 | 


--------------------------------------------------------------------------------
/lib/madwf_transfer.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <quda_internal.h>
 4 | #include <quda.h>
 5 | #include <complex_quda.h>
 6 | 
 7 | #include <vector>
 8 | 
 9 | #include <device_vector.h>
10 | #include <madwf_ml.h>
11 | 
12 | namespace quda
13 | {
14 |   namespace madwf_ml
15 |   {
16 |     /**
17 |       @brief Perform a transfer in the fifth dimension, i.e. out_s = T_st * in_t, s,t <- fifth dimension indices
18 |       @param out the output field
19 |       @param in the input field
20 |       @param tp The device_vector that contains the transfer matrix
21 |       @param dagger whether or not perform dagger for the transfer matrix
22 |      */
23 |     void transfer_5d_hh(ColorSpinorField &out, const ColorSpinorField &in,
24 |                         const device_vector<MadwfAcc::transfer_float> &tp, bool dagger);
25 | 
26 |     /**
27 |       @brief Perform a tensor product in the fifth dimension, i.e. T_st = conj(conj(x_s) * y_t), s,t <- fifth dimension indices
28 |       @param x the x field
29 |       @param y the y field
30 |       @param tp The device_vector that contains the output transfer matrix
31 |      */
32 |     void tensor_5d_hh(ColorSpinorField &x, const ColorSpinorField &y, device_vector<MadwfAcc::transfer_float> &tp);
33 | 
34 |   } // namespace madwf_ml
35 | } // namespace quda
36 | 


--------------------------------------------------------------------------------
/lib/numa_affinity.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | /* Originally from Galen Arnold, NCSA arnoldg@ncsa.illinois.edu
 3 |  * modified by Guochun Shi
 4 |  *
 5 |  */
 6 | 
 7 | #include <numa_affinity.h>
 8 | #include <quda_internal.h>
 9 | 
10 | #ifdef NUMA_NVML
11 | #include <nvml.h>
12 | #endif
13 | 
14 | 
15 | int setNumaAffinityNVML(int devid)
16 | {
17 | #ifdef NUMA_NVML
18 |   nvmlReturn_t result;
19 | 
20 |   result = nvmlInit();
21 |   if (NVML_SUCCESS != result)
22 |   {
23 |     warningQuda("Failed to determine NUMA affinity for device %d (NVML Init failed)", devid);
24 |     return -1;
25 |   }
26 |   nvmlDevice_t device;
27 |   result = nvmlDeviceGetHandleByIndex(devid, &device);
28 |   if (NVML_SUCCESS != result)
29 |   {
30 |     warningQuda("Failed to determine NUMA affinity for device %d (NVML DeviceGetHandle failed)", devid);
31 |     return -1;
32 |   }
33 |   result = nvmlDeviceSetCpuAffinity(device);
34 |   if (NVML_SUCCESS != result)
35 |   {
36 |     warningQuda("Failed to determine NUMA affinity for device %d (NVML DeviceSetCpuAffinity failed)", devid);
37 |     return -1;
38 |   }
39 |   else{
40 |     printfQuda("Set NUMA affinity for device %d (NVML DeviceSetCpuAffinity)\n", devid);
41 |   }
42 |   result = nvmlShutdown();
43 |   if (NVML_SUCCESS != result)
44 |   {
45 |     warningQuda("Failed to determine NUMA affinity for device %d (NVML Shutdown failed)", devid);
46 |     return -1;
47 |   }
48 |   return 0;
49 | #else
50 |   warningQuda("Failed to determine NUMA affinity for device %d (NVML not supported in quda build)", devid);
51 |   return -1;
52 | #endif
53 | }
54 | 


--------------------------------------------------------------------------------
/lib/staggered_two_link_quda.cu:
--------------------------------------------------------------------------------
 1 | #include <utility>
 2 | #include <quda_internal.h>
 3 | #include <gauge_field.h>
 4 | #include <ks_improved_force.h>
 5 | #include <tune_quda.h>
 6 | #include <instantiate.h>
 7 | #include <tunable_nd.h>
 8 | #include <kernels/staggered_two_link.cuh>
 9 | 
10 | namespace quda
11 | {
12 | 
13 |   using namespace staggered_quark_smearing;
14 | 
15 |   template <typename real, int nColor, QudaReconstructType recon> class ComputeTwoLink : public TunableKernel3D
16 |   {
17 |     GaugeField &twoLink;
18 |     const GaugeField &link;
19 |     unsigned int minThreads() const { return twoLink.VolumeCB(); }
20 | 
21 |   public:
22 |     ComputeTwoLink(GaugeField &twoLink, const GaugeField &link) :
23 |       TunableKernel3D(link, 2, 4), twoLink(twoLink), link(link)
24 |     {
25 |       strcat(aux, comm_dim_partitioned_string());
26 |       apply(device::get_default_stream());
27 |     }
28 | 
29 |     void apply(const qudaStream_t &stream)
30 |     {
31 |       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
32 |       launch<TwoLink>(tp, stream, TwoLinkArg<real, nColor, recon>(twoLink, link));
33 |     }
34 | 
35 |     long long flops() const { return 4 * twoLink.Volume() * nColor * nColor * (8 * nColor - 2); }
36 |     long long bytes() const { return 2 * link.Bytes() + twoLink.Bytes(); }
37 |   };
38 | 
39 |   void computeTwoLink(GaugeField &twoLink, const GaugeField &link)
40 |   {
41 |     if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
42 |       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
43 |       checkNative(twoLink, link);
44 |       checkLocation(twoLink, link);
45 |       checkPrecision(twoLink, link);
46 |       // FIXME: enable link-12/8 reconstruction
47 |       instantiate<ComputeTwoLink, ReconstructNone>(twoLink, link);
48 |       getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
49 |     } else {
50 |       errorQuda("Two-link computation requires staggered operator to be enabled");
51 |     }
52 |   }
53 | 
54 | } // namespace quda
55 | 


--------------------------------------------------------------------------------
/lib/targets/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # ######################################################################################################################
 2 | # additonal sources
 3 | target_sources(quda_cpp PRIVATE quda_api.cpp device.cpp malloc.cpp blas_lapack_cublas.cpp comm_target.cpp)
 4 | 
 5 | if(QUDA_JITIFY)
 6 |   target_sources(quda_cpp PRIVATE jitify_helper.cpp)
 7 | endif()
 8 | 
 9 | if(QUDA_BACKWARDS)
10 |   set_property(
11 |     SOURCE malloc.cpp
12 |     DIRECTORY ${CMAKE_SOURCE_DIR}/lib
13 |     APPEND
14 |     PROPERTY COMPILE_DEFINITIONS ${BACKWARD_DEFINITIONS})
15 |   set_property(
16 |     SOURCE malloc.cpp
17 |     DIRECTORY ${CMAKE_SOURCE_DIR}/lib
18 |     APPEND
19 |     PROPERTY COMPILE_DEFINITIONS QUDA_BACKWARDSCPP)
20 | endif()
21 | 


--------------------------------------------------------------------------------
/lib/targets/generic/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # add target specific files / options 
2 | target_sources(quda_cpp PRIVATE blas_lapack_eigen.cpp)
3 | 
4 | 


--------------------------------------------------------------------------------
/lib/targets/hip/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # #########################################################################################################################
2 | # Additional sources
3 | target_sources(quda_cpp PRIVATE  quda_api.cpp device.cpp malloc.cpp blas_lapack_hipblas.cpp comm_target.cpp)
4 | 
5 | 


--------------------------------------------------------------------------------
/lib/version.cpp:
--------------------------------------------------------------------------------
1 | #ifdef GITVERSION
2 | const char* gitversion = GITVERSION ;
3 | #else
4 | const char* gitversion;
5 | #endif
6 | 


--------------------------------------------------------------------------------
/tests/asan.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | extern "C" {
 4 | 
 5 | /**
 6 |    @brief Set the default ASAN options.  This ensures that QUDA just
 7 |    works when SANITIZE is enabled without requiring ASAN_OPTIONS to
 8 |    be set.  We default disable leak checking, otherwise this will
 9 |    cause ctest to fail with MPI library leaks.  This declaration
10 |    cannot be in the test library, and must be in the test executable.
11 | */
12 | const char *__asan_default_options() { return "detect_leaks=0,protect_shadow_gap=0"; }
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/c_interface_test.c:
--------------------------------------------------------------------------------
1 | #include <quda.h>
2 | 
3 | int main()
4 | {
5 |   initQuda(0);
6 |   endQuda();
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/contract_ft_test_gtest.hpp:
--------------------------------------------------------------------------------
 1 | #include <gtest/gtest.h>
 2 | #include <quda_arch.h>
 3 | #include <cmath>
 4 | 
 5 | using test_t = ::testing::tuple<QudaContractType, QudaPrecision>;
 6 | 
 7 | class ContractFTTest : public ::testing::TestWithParam<test_t>
 8 | {
 9 |   test_t param;
10 | 
11 | public:
12 |   ContractFTTest() : param(GetParam()) { }
13 | };
14 | 
15 | bool skip_test(test_t param)
16 | {
17 |   auto contract_type = ::testing::get<0>(param);
18 |   auto prec = ::testing::get<1>(param);
19 | 
20 |   // skip spin 4 cases
21 |   if (contract_type == QUDA_CONTRACT_TYPE_DR_FT_T or contract_type == QUDA_CONTRACT_TYPE_DR_FT_Z) return true;
22 |   if (prec < QUDA_SINGLE_PRECISION) return true; // outer precision >= sloppy precision
23 |   if (!(QUDA_PRECISION & prec)) return true;     // precision not enabled so skip it
24 | 
25 |   return false;
26 | }
27 | 
28 | int contract(test_t param);
29 | 
30 | TEST_P(ContractFTTest, verify)
31 | {
32 |   if (skip_test(GetParam())) GTEST_SKIP();
33 | 
34 |   auto faults = contract(GetParam());
35 |   EXPECT_EQ(faults, 0) << "CPU and GPU implementations do not agree";
36 | }
37 | 
38 | std::string gettestname(::testing::TestParamInfo<test_t> param)
39 | {
40 |   std::string str("contract_");
41 | 
42 |   str += get_contract_str(::testing::get<0>(param.param));
43 |   str += std::string("_") + get_prec_str(::testing::get<1>(param.param));
44 | 
45 |   return str;
46 | }
47 | 
48 | using ::testing::Combine;
49 | using ::testing::Values;
50 | 
51 | auto contract_types = Values(QUDA_CONTRACT_TYPE_STAGGERED_FT_T); // FIXME : extend if needed
52 | 
53 | auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION);
54 | 
55 | INSTANTIATE_TEST_SUITE_P(contraction_ft, ContractFTTest, Combine(contract_types, precisions), gettestname);
56 | 


--------------------------------------------------------------------------------
/tests/covdev_test_gtest.hpp:
--------------------------------------------------------------------------------
 1 | #include <gtest/gtest.h>
 2 | #include <quda_arch.h>
 3 | #include <cmath>
 4 | 
 5 | using test_t = ::testing::tuple<QudaPrecision, QudaDagType>;
 6 | 
 7 | class CovDevTest : public ::testing::TestWithParam<test_t>
 8 | {
 9 | protected:
10 |   test_t param;
11 | 
12 | public:
13 |   CovDevTest() : param(GetParam()) { }
14 | };
15 | 
16 | bool skip_test(test_t param)
17 | {
18 |   auto prec = ::testing::get<0>(param);
19 |   // auto dag              = ::testing::get<1>(param);
20 |   // should we keep for all options?
21 |   if (!(QUDA_PRECISION & prec)) return true; // precision not enabled so skip i
22 | 
23 |   return false;
24 | }
25 | 
26 | std::array<double, 2> covdev_test(test_t param);
27 | 
28 | TEST_P(CovDevTest, verify)
29 | {
30 |   if (skip_test(GetParam())) GTEST_SKIP();
31 | 
32 |   std::array<double, 2> test_results = covdev_test(param);
33 | 
34 |   double deviation = test_results[0];
35 |   double tol = test_results[1];
36 | 
37 |   ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
38 | }
39 | 
40 | std::string gettestname(::testing::TestParamInfo<test_t> param)
41 | {
42 |   std::string str("covdev_");
43 | 
44 |   str += get_prec_str(::testing::get<0>(param.param));
45 |   str += std::string("_") + get_dag_str(::testing::get<1>(param.param));
46 | 
47 |   return str;
48 | }
49 | 
50 | using ::testing::Combine;
51 | using ::testing::Values;
52 | 
53 | auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION);
54 | auto dagger_opt = Values(QUDA_DAG_YES, QUDA_DAG_NO);
55 | 
56 | INSTANTIATE_TEST_SUITE_P(covdevtst, CovDevTest, Combine(precisions, dagger_opt), gettestname);
57 | 


--------------------------------------------------------------------------------
/tests/googletest/include/gtest/internal/custom/README.md:
--------------------------------------------------------------------------------
 1 | # Customization Points
 2 | 
 3 | The custom directory is an injection point for custom user configurations.
 4 | 
 5 | ## Header `gtest.h`
 6 | 
 7 | ### The following macros can be defined:
 8 | 
 9 | *   `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of
10 |     `OsStackTraceGetterInterface`.
11 | *   `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See
12 |     `testing::TempDir` for semantics and signature.
13 | 
14 | ## Header `gtest-port.h`
15 | 
16 | The following macros can be defined:
17 | 
18 | ### Flag related macros:
19 | 
20 | *   `GTEST_FLAG(flag_name)`
21 | *   `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
22 |     own flagfile flag parsing.
23 | *   `GTEST_DECLARE_bool_(name)`
24 | *   `GTEST_DECLARE_int32_(name)`
25 | *   `GTEST_DECLARE_string_(name)`
26 | *   `GTEST_DEFINE_bool_(name, default_val, doc)`
27 | *   `GTEST_DEFINE_int32_(name, default_val, doc)`
28 | *   `GTEST_DEFINE_string_(name, default_val, doc)`
29 | 
30 | ### Logging:
31 | 
32 | *   `GTEST_LOG_(severity)`
33 | *   `GTEST_CHECK_(condition)`
34 | *   Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too.
35 | 
36 | ### Threading:
37 | 
38 | *   `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided.
39 | *   `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal`
40 |     are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)`
41 |     and `GTEST_DEFINE_STATIC_MUTEX_(mutex)`
42 | *   `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)`
43 | *   `GTEST_LOCK_EXCLUDED_(locks)`
44 | 
45 | ### Underlying library support features
46 | 
47 | *   `GTEST_HAS_CXXABI_H_`
48 | 
49 | ### Exporting API symbols:
50 | 
51 | *   `GTEST_API_` - Specifier for exported symbols.
52 | 
53 | ## Header `gtest-printers.h`
54 | 
55 | *   See documentation at `gtest/gtest-printers.h` for details on how to define a
56 |     custom printer.
57 | 


--------------------------------------------------------------------------------
/tests/googletest/include/gtest/internal/custom/gtest-port.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | //
30 | // Injection point for custom user configurations. See README for details
31 | //
32 | // ** Custom implementation starts here **
33 | 
34 | #ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
35 | #define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
36 | 
37 | #endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
38 | 


--------------------------------------------------------------------------------
/tests/googletest/include/gtest/internal/custom/gtest.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | //
30 | // Injection point for custom user configurations. See README for details
31 | //
32 | // ** Custom implementation starts here **
33 | 
34 | #ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
35 | #define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
36 | 
37 | #endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
38 | 


--------------------------------------------------------------------------------
/tests/googletest/src/gtest_main.cc:
--------------------------------------------------------------------------------
 1 | // Copyright 2006, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | #include <cstdio>
31 | #include "gtest/gtest.h"
32 | 
33 | #ifdef ARDUINO
34 | void setup() {
35 |   testing::InitGoogleTest();
36 | }
37 | 
38 | void loop() { RUN_ALL_TESTS(); }
39 | 
40 | #else
41 | 
42 | GTEST_API_ int main(int argc, char **argv) {
43 |   printf("Running main() from %s\n", __FILE__);
44 |   testing::InitGoogleTest(&argc, argv);
45 |   return RUN_ALL_TESTS();
46 | }
47 | #endif
48 | 


--------------------------------------------------------------------------------
/tests/host_reference/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # add reference files to quda_test
 2 | target_sources(
 3 |   quda_test PRIVATE
 4 |   clover_reference.cpp
 5 |   covdev_reference.cpp
 6 |   blas_reference.cpp
 7 |   domain_wall_dslash_reference.cpp
 8 |   dslash_reference.cpp
 9 |   dslash_test_helpers.cpp
10 |   gauge_force_reference.cpp
11 |   clover_force_reference.cpp
12 |   hisq_force_reference.cpp
13 |   staggered_dslash_reference.cpp
14 |   wilson_dslash_reference.cpp)
15 | 
16 | target_include_directories(quda_test PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
17 | target_include_directories(quda_test PRIVATE ${CMAKE_SOURCE_DIR}/include)
18 | target_include_directories(quda_test PRIVATE ${CMAKE_BINARY_DIR}/include)
19 | 


--------------------------------------------------------------------------------
/tests/host_reference/README.md:
--------------------------------------------------------------------------------
 1 | # QUDA 1.0.0
 2 | 
 3 | ## host_reference
 4 | 
 5 | This directory contains host side reference code that cross checks the GPU kerels in QUDA
 6 | with host side calculated routines. Currenly supported reference routines for matrix 
 7 | operators are:
 8 | 
 9 | 	 1. Wilson dslash
10 | 	 2. Clover dslash
11 | 	 3. Twisted mass
12 | 	 4. Twisted clover
13 | 	 5. Non-degenerate twisted mass/clover	
14 | 	 6. Staggered dslash
15 |             (naive, HISQ)
16 | 	 7. Domain Wall dslash
17 | 	    (Shamir 4d, Shamir 5d, Mobius) 
18 | 	 8. Covariant derivative
19 | 	
20 | For gauge related routines, we have:
21 | 
22 |     	 1. Long/Fat link construction
23 | 	 2. Wilson gauge action force
24 | 	 3. HISQ gauge action force
25 | 
26 | And we also offer routines for dense arithmetic:
27 | 
28 |        	 1. BLAS
29 | 	 2. Spinor contration
30 | 
31 | The former will compute a wide variety of BLAS calls, and the latter will contract
32 | two spinors, returning an array populated with a 4x4 array of open spin index, colour 
33 | contracted data at each lattice point.
34 | 


--------------------------------------------------------------------------------
/tests/host_reference/blas_reference.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | double blasGEMMQudaVerify(void *arrayA, void *arrayB, void *arrayC, void *arrayCcopy, uint64_t refA_size,
 4 |                           uint64_t refB_size, uint64_t refC_size, QudaBLASParam *blas_param);
 5 | 
 6 | double blasLUInvQudaVerify(void *ref_array, void *dev_array_inv, uint64_t array_size, QudaBLASParam *blas_param);
 7 | 
 8 | void prepare_ref_array(void *array, int batches, uint64_t array_size, size_t data_size, QudaBLASDataType data_type);
 9 | 
10 | void copy_array(void *array_out, void *array_in, int batches, uint64_t array_size, size_t data_out_size,
11 |                 QudaBLASDataType data_type);
12 | 


--------------------------------------------------------------------------------
/tests/host_reference/clover_force_reference.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <array>
 4 | #include <vector>
 5 | #include "quda.h"
 6 | 
 7 | void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector,
 8 |                              std::array<void *, 4> &gauge, std::vector<char> &clover, std::vector<char> &clover_inv,
 9 |                              QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio);
10 | 


--------------------------------------------------------------------------------
/tests/host_reference/covdev_reference.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <quda_internal.h>
 3 | #include <color_spinor_field.h>
 4 | 
 5 | using namespace quda;
 6 | 
 7 | void setDims(int *);
 8 | 
 9 | void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int oddBit, int daggerBit, int mu,
10 |                    QudaPrecision sPrecision, QudaPrecision gPrecision);
11 | void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
12 |                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
13 | 
14 | void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
15 | 
16 | void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
17 |                QudaPrecision gPrecision, void *tmp, QudaParity parity);
18 | 
19 | void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
20 | void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu,
21 |                       QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity);
22 | 


--------------------------------------------------------------------------------
/tests/host_reference/dslash_test_helpers.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <quda.h>
 4 | 
 5 | enum class dslash_test_type {
 6 |   Dslash = 0,
 7 |   MatPC,
 8 |   Mat,
 9 |   MatPCDagMatPC,
10 |   MatDagMat,
11 |   M5,
12 |   M5inv,
13 |   Dslash4pre,
14 |   MatPCDagMatPCLocal
15 | };
16 | 
17 | /**
18 |  * Apply the Dslash operator (D_{eo} or D_{oe}) for 4D EO preconditioned DWF.
19 |  * @param h_out  Result spinor field
20 |  * @param h_in   Input spinor field
21 |  * @param param  Contains all metadata regarding host and device
22 |  *               storage
23 |  * @param parity The destination parity of the field
24 |  * @param test_type Choose a type of dslash operators
25 |  */
26 | void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type);
27 | 
28 | /**
29 |  * Apply the Dslash operator (D_{eo} or D_{oe}) for Mobius DWF.
30 |  * @param h_out  Result spinor field
31 |  * @param h_in   Input spinor field
32 |  * @param param  Contains all metadata regarding host and device
33 |  *               storage
34 |  * @param parity The destination parity of the field
35 |  * @param test_type Choose a type of dslash operators
36 |  */
37 | void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type);
38 | 
39 | void dslashQuda_mobius_eofa(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity,
40 |                             dslash_test_type test_type);
41 | 


--------------------------------------------------------------------------------
/tests/host_reference/hisq_force_reference.h:
--------------------------------------------------------------------------------
 1 | #ifndef _HISQ_FORCE_REFERENCE_H
 2 | #define _HISQ_FORCE_REFERENCE_H
 3 | 
 4 | #include <quda.h>
 5 | #include <enum_quda.h>
 6 | #include <gauge_field.h>
 7 | 
 8 | /**
 9 |    @brief Compute a staggered spinor outer product for some offset, CPU version
10 |    @param[in] src Pointer to an appropriately sized host staggered spinor field
11 |    @param[out] dest Reference to a gauge field for the outer product
12 |    @param[in] precision Precision of data (single or double)
13 |    @param[in] separation Offset for outer product (1 for fat links, 3 for long links)
14 | */
15 | void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, QudaPrecision precision, size_t separation);
16 | 
17 | /**
18 |    @brief Compute the force contribution from the fat links, CPU version
19 |    @param[in] path_coeff Input HISQ coefficients
20 |    @param[in] oprod Input force outer product
21 |    @param[in] link Gauge field links
22 |    @param[out] newOprod Force accumulated with fat link contributions
23 | */
24 | void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
25 |                          quda::GaugeField *newOprod);
26 | 
27 | /**
28 |    @brief Compute the force contribution from the long link, CPU version
29 |    @param[in] coeff Long-link contribution (path_coeff[1])
30 |    @param[in] oprod Input force outer product
31 |    @param[in] link Gauge field links
32 |    @param[out] newOprod Force accumulated with fat link contributions
33 | */
34 | void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod);
35 | 
36 | /**
37 |    @brief Accumulate the force contributions into the momentum field, CPU version
38 |    @param[in] oprod Input force outer product
39 |    @param[in] link Gauge field links
40 |    @param[out] mom Accumulated momentum
41 | */
42 | void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom);
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/tests/plaq_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <time.h>
 4 | #include <math.h>
 5 | #include <string.h>
 6 | 
 7 | // In a typical application, quda.h is the only QUDA header required.
 8 | #include <quda.h>
 9 | 
10 | #include "util_quda.h"
11 | #include "host_utils.h"
12 | #include "gauge_utils.h"
13 | #include "command_line_params.h"
14 | 
15 | int main(int argc, char **argv)
16 | {
17 |   auto app = make_app();
18 |   try {
19 |     app->parse(argc, argv);
20 |   } catch (const CLI::ParseError &e) {
21 |     return app->exit(e);
22 |   }
23 | 
24 |   // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
25 |   initComms(argc, argv, gridsize_from_cmdline);
26 | 
27 |   QudaGaugeParam gauge_param = newQudaGaugeParam();
28 |   if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
29 |   if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
30 | 
31 |   setWilsonGaugeParam(gauge_param);
32 |   gauge_param.t_boundary = QUDA_PERIODIC_T;
33 |   setDims(gauge_param.X);
34 | 
35 |   initQuda(device_ordinal);
36 | 
37 |   setVerbosity(verbosity);
38 | 
39 |   // call srand() with a rank-dependent seed
40 |   initRand();
41 | 
42 |   // Allocate host side memory for the gauge field.
43 |   //----------------------------------------------------------------------------
44 |   void *gauge[4];
45 |   // Allocate space on the host (always best to allocate and free in the same scope)
46 |   for (int dir = 0; dir < 4; dir++) gauge[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
47 |   constructHostGaugeField(gauge, gauge_param, argc, argv);
48 |   // Load the gauge field to the device
49 |   loadGaugeQuda((void *)gauge, &gauge_param);
50 | 
51 |   double plaq[3];
52 |   plaqQuda(plaq);
53 |   printfQuda("Computed plaquette gauge precise is %16.15e (spatial = %16.15e, temporal = %16.15e)\n", plaq[0], plaq[1],
54 |              plaq[2]);
55 | 
56 |   freeGaugeQuda();
57 | 
58 |   // release memory
59 |   for (int dir = 0; dir < 4; dir++) { host_free(gauge[dir]); }
60 | 
61 |   endQuda();
62 |   finalizeComms();
63 |   return 0;
64 | }
65 | 


--------------------------------------------------------------------------------
/tests/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # add utils files to quda_test
 2 | target_sources(
 3 |   quda_test PRIVATE
 4 |   command_line_params.cpp
 5 |   face_gauge.cpp
 6 |   gauge_utils.cpp
 7 |   host_blas.cpp
 8 |   host_utils.cpp
 9 |   index_utils.cpp
10 |   llfat_utils.cpp
11 |   misc.cpp
12 |   momentum_utils.cpp
13 |   set_params.cpp
14 |   staggered_gauge_utils.cpp
15 |   staggered_host_utils.cpp)
16 | 
17 | target_include_directories(quda_test PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | target_include_directories(quda_test SYSTEM PUBLIC ${CMAKE_SOURCE_DIR}/include/externals)
19 | target_include_directories(quda_test PRIVATE ${CMAKE_SOURCE_DIR}/include)
20 | target_include_directories(quda_test PRIVATE ${CMAKE_BINARY_DIR}/include)
21 | 


--------------------------------------------------------------------------------
/tests/utils/README.md:
--------------------------------------------------------------------------------
1 | # QUDA 1.0.0
2 | 
3 | ## utils
4 | 
5 | This directory contains useful command line utilities, as well as miscellaneous utils for host side routines.
6 | 


--------------------------------------------------------------------------------
/tests/utils/momentum_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "quda.h"
 4 | 
 5 | /**
 6 |  * @brief Create a momentum field in a MILC data layout
 7 |  *
 8 |  * @param[out] mom Momentum field
 9 |  * @param[in] precision Floating-point precision of field
10 |  * @param[in] max_val Maximum value of field
11 |  */
12 | void createMomCPU(void *mom, QudaPrecision precision, double max_val = 1.0);
13 | 
14 | /**
15 |  * @brief Compute and print a robust comparison of agreement between two
16 |  *        momentum fields
17 |  *
18 |  * @param[in] momA First momentum field
19 |  * @param[in] momB Second momentum field
20 |  * @param[in] len Length of the momentum field
21 |  * @param[in] precision Floating-point precision of field
22 |  */
23 | int strong_check_mom(const void *momA, const void *momB, int len, QudaPrecision prec);
24 | 
25 | /**
26 |  * @brief Host reference implementation of the momentum action
27 |  * contribution, including the MILC convention of subtracting 4
28 |  * from each site norm to improve stability.
29 |  *
30 |  * @param[in] mom Momentum field
31 |  * @param[in] len Length of the momentum field
32 |  * @param[in] precision Floating-point precision of field
33 |  */
34 | double momentumActionCPU(const void *mom, int len, QudaPrecision prec);
35 | 


--------------------------------------------------------------------------------
/tests/utils/rng_utils.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <random>
 4 | 
 5 | /**
 6 |  * @brief Generate a uniform random number on [lower, upper)
 7 |  *
 8 |  * @tparam real_t Floating point type
 9 |  * @param[in] i Checkboard lattice site
10 |  * @param[in] parity Parity of site
11 |  * @param[in] lower Lower bound of range, inclusive
12 |  * @param[in] upper Upper bound of range, exclusive
13 |  * @return Random number under requested uniform distribution
14 |  */
15 | template <typename real_t = double> real_t random_uniform_host(int i, int parity, real_t lower = 0, real_t upper = 1)
16 | {
17 |   // generates in [lower, upper)
18 |   std::uniform_real_distribution<real_t> dist {lower, upper};
19 |   return dist(host_rand[parity * Vh + i]);
20 | }
21 | 
22 | /**
23 |  * @brief Generate a Gaussian-distributed random number
24 |  *
25 |  * @tparam real_t Floating point type
26 |  * @param[in] i Checkboard lattice site
27 |  * @param[in] parity Parity of site
28 |  * @param[in] mean Center of the distribution
29 |  * @param[in] stddev Standard deviation of the distribution
30 |  * @return Random number under requested Gaussian distribution
31 |  */
32 | template <typename real_t = double> real_t random_gaussian_host(int i, int parity, real_t mean = 0, real_t stddev = 1)
33 | {
34 |   std::normal_distribution<real_t> dist {mean, stddev};
35 |   return dist(host_rand[parity * Vh + i]);
36 | }
37 | 


--------------------------------------------------------------------------------
/tests/utils/short.h:
--------------------------------------------------------------------------------
 1 | #ifndef _SHORT_H
 2 | #define _SHORT_H
 3 | 
 4 | #define SHORT_LENGTH 65536
 5 | #define SCALE_FLOAT ((SHORT_LENGTH - 1) * 0.5)
 6 | #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH - 1))
 7 | 
 8 | template <typename Float> inline short FloatToShort(Float a)
 9 | {
10 |   // return (short)(a*MAX_SHORT);
11 |   short rtn = (short)((a + SHIFT_FLOAT) * SCALE_FLOAT);
12 |   return rtn;
13 | }
14 | 
15 | template <typename Float> inline Float shortToFloat(short a)
16 | {
17 |   Float rtn = (float)a / SCALE_FLOAT - SHIFT_FLOAT;
18 |   return rtn;
19 | }
20 | 
21 | #endif // _SHORT_H
22 | 


--------------------------------------------------------------------------------
/tests/utils/staggered_gauge_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <quda_internal.h>
 3 | #include <color_spinor_field.h>
 4 | 
 5 | extern int Z[4];
 6 | extern int Vh;
 7 | extern int V;
 8 | 
 9 | void setDims(int *);
10 | 
11 | void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
12 |                                       QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu);
13 | 
14 | // Wrap everything for the GPU construction of fat/long links here
15 | void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fatlink_eps, void **qdp_longlink_eps,
16 |                          void **qdp_inlink, QudaGaugeParam &gauge_param, double **act_path_coeffs, double eps_naik,
17 |                          size_t gSize, int n_naiks);
18 | 
19 | void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
20 |                        size_t gSize, int n_naiks, double eps_naik);
21 | 
22 | void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
23 |                        size_t gSize, int n_naiks, double eps_naik);
24 | 
25 | void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
26 |                              void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
27 |                              int n_naiks, double eps_naik);
28 | 
29 | // Routine that takes in a QDP-ordered field and outputs the plaquette.
30 | // Assumes the gauge fields already have phases on them (unless it's the Laplace op),
31 | // so it corrects the sign as appropriate.
32 | void computeStaggeredPlaquetteQDPOrder(void **qdp_link, double plaq[3], const QudaGaugeParam &gauge_param_in,
33 |                                        const QudaDslashType dslash_type);
34 | 


--------------------------------------------------------------------------------