├── LICENSE.TXT ├── MathDx ├── README.md ├── cuBLASDx │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── arch_runner.hpp │ ├── batched_gemm_fp64.cu │ ├── block_io.hpp │ ├── blockdim_gemm_fp16.cu │ ├── check_error.cu │ ├── check_error.hpp │ ├── common.hpp │ ├── common_nvrtc.hpp │ ├── device_gemm_performance.cu │ ├── flops.h │ ├── fused_gemm_performance.cu │ ├── gemm_fft.cu │ ├── gemm_fft_fp16.cu │ ├── gemm_fft_performance.cu │ ├── gemm_fusion.cu │ ├── introduction_example.cu │ ├── multiblock_gemm.cu │ ├── nvrtc_gemm.cpp │ ├── reduce.hpp │ ├── reference.hpp │ ├── reference │ │ ├── cublas_reference.hpp │ │ ├── naive_reference.cu │ │ └── naive_reference.hpp │ ├── scaled_dot_prod_attn.cu │ ├── scaled_dot_prod_attn_batched.cu │ ├── simple_gemm_aat.cu │ ├── simple_gemm_cfp16.cu │ ├── simple_gemm_custom_layout.cu │ ├── simple_gemm_fp32.cu │ ├── simple_gemm_fp32_decoupled.cu │ ├── simple_gemm_fp8.cu │ ├── simple_gemm_int8_int8_int32.cu │ ├── simple_gemm_leading_dimensions.cu │ ├── simple_gemm_mixed_precision.cu │ ├── simple_gemm_std_complex_fp32.cu │ ├── simple_gemm_transform.cu │ ├── single_gemm_performance.cu │ └── single_gemm_performance.hpp ├── cuFFTDx │ ├── 00_introduction_example │ │ └── 00_introduction_example.cu │ ├── 01_simple_fft_thread │ │ ├── 00_simple_fft_thread.cu │ │ ├── 01_simple_fft_thread_fp16.cu │ │ ├── 02_simple_fft_thread_lto.cu │ │ └── 02_simple_fft_thread_lto_cases.csv │ ├── 02_simple_fft_block │ │ ├── 00_simple_fft_block.cu │ │ ├── 01_simple_fft_block_shared.cu │ │ ├── 02_simple_fft_block_std_complex.cu │ │ ├── 03_simple_fft_block_half2.cu │ │ ├── 04_simple_fft_block_fp16.cu │ │ ├── 05_simple_fft_block_c2r.cu │ │ ├── 06_simple_fft_block_r2c.cu │ │ ├── 07_simple_fft_block_c2r_fp16.cu │ │ ├── 08_simple_fft_block_r2c_fp16.cu │ │ ├── 09_simple_fft_block_cub_io.cu │ │ ├── 10_simple_fft_block_c2r_lto.cu │ │ └── 10_simple_fft_block_c2r_lto_cases.csv │ ├── 03_block_fft_performance │ │ ├── 00_block_fft_performance.cu │ │ ├── 
01_block_fft_performance_many.cu │ │ ├── 02_block_fft_lto_ptx_performance.cu │ │ ├── 02_block_fft_lto_ptx_performance_cases.csv │ │ └── block_fft_performance.hpp │ ├── 04_nvrtc_fft │ │ ├── 00_nvrtc_fft_thread.cu │ │ ├── 01_nvrtc_fft_block.cu │ │ ├── 02_nvrtc_fft_thread_lto.cu │ │ └── 03_nvrtc_fft_block_lto.cu │ ├── 05_fft_Xd │ │ ├── 00_fft_2d.cu │ │ ├── 01_fft_2d_single_kernel.cu │ │ ├── 02_fft_2d_r2c_c2r.cu │ │ ├── 03_fft_3d.cu │ │ ├── 04_fft_3d_box_single_block.cu │ │ └── 05_fft_3d_cube_single_block.cu │ ├── 06_convolution │ │ ├── 00_convolution.cu │ │ ├── 01_convolution_padded.cu │ │ ├── 02_convolution_performance.cu │ │ └── 03_convolution_r2c_c2r.cu │ ├── 07_convolution_3d │ │ ├── 00_convolution_3d.cu │ │ ├── 01_convolution_3d_c2r.cu │ │ ├── 02_convolution_3d_r2c.cu │ │ ├── 03_convolution_3d_padded.cu │ │ ├── 04_convolution_3d_padded_r2c.cu │ │ ├── index_mapper.hpp │ │ ├── io_strided_conv_smem.hpp │ │ ├── io_strided_conv_smem_padded.hpp │ │ ├── kernels.hpp │ │ └── reference.hpp │ ├── 08_mixed_precision │ │ ├── 00_mixed_precision_fft_1d.cu │ │ └── 01_mixed_precision_fft_2d.cu │ ├── 09_introduction_lto_example │ │ ├── 00_introduction_lto_cases.csv │ │ ├── 00_introduction_lto_example.cu │ │ ├── CMakeLists.txt │ │ └── Makefile │ ├── 10_cufft_device_api_example │ │ ├── 00_cufft_device_api_example.cu │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ └── cufft_device_api_lto_helper │ │ │ ├── CMakeLists.txt │ │ │ ├── cufft_device_api_lto_helper.cmake │ │ │ └── cufft_device_api_lto_helper.cpp │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── README_LTO_EA.md │ ├── common │ │ ├── block_io.hpp │ │ ├── block_io_generic_strided.hpp │ │ ├── block_io_strided.hpp │ │ ├── common.hpp │ │ ├── common_nvjitlink.hpp │ │ ├── common_nvrtc.hpp │ │ ├── fp16_common.hpp │ │ ├── mixed_io.hpp │ │ ├── padded_io.hpp │ │ └── random.hpp │ └── lto_helper │ │ ├── CMakeLists.txt │ │ ├── common_lto.hpp │ │ ├── cufftdx_cufft_lto_helper.cpp │ │ └── lto_helper.cmake ├── cuRANDDx │ ├── CMakeLists.txt 
│ ├── README.md │ ├── common.hpp │ ├── mrg_two_distributions_thread_api.cu │ ├── nvrtc_helper.hpp │ ├── nvrtc_pcg_thread_api.cpp │ ├── philox_thread_api.cu │ ├── simple_pcg_thread_api.cu │ ├── sobol_thread_api.cu │ └── xorwow_init_and_generate_thread_api.cu └── cuSolverDx │ ├── CMakeLists.txt │ ├── README.md │ ├── blocked_potrf.cu │ ├── common.hpp │ ├── common │ ├── cudart.hpp │ ├── cusolver_reference_cholesky.hpp │ ├── cusolver_reference_lu.hpp │ ├── device_io.hpp │ ├── error_checking.cpp │ ├── error_checking.hpp │ ├── example_sm_runner.hpp │ ├── macros.hpp │ ├── measure.hpp │ ├── numeric.hpp │ ├── print.hpp │ └── random.hpp │ ├── gesv_batched_wo_pivot.cu │ ├── gesv_partial_pivot.cu │ ├── getrf_partial_pivot.cu │ ├── getrf_wo_pivot.cu │ ├── nvrtc_helper.hpp │ ├── nvrtc_potrs.cpp │ ├── posv_batched.cu │ ├── potrf_runtime_ld.cu │ └── simple_potrf.cu ├── NPP+ ├── README.md ├── batchedLabelMarkersAndCompression │ ├── CMakeLists.txt │ ├── README.md │ ├── batchedLabelMarkersAndCompression.h │ ├── batchedLabelMarkersAndCompressionNPPPlus.cpp │ ├── dirent.h │ └── images │ │ ├── CT_skull_512x512_8u.raw │ │ ├── CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw │ │ ├── CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw │ │ ├── CT_skull_LabelMarkersUF_8Way_512x512_32u.raw │ │ ├── Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw │ │ ├── Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw │ │ ├── Lena_LabelMarkersUF_8Way_512x512_32u.raw │ │ ├── PCB2_1024x683_8u.raw │ │ ├── PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw │ │ ├── PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw │ │ ├── PCB2_LabelMarkersUF_8Way_1024x683_32u.raw │ │ ├── PCB_1280x720_8u.raw │ │ ├── PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw │ │ ├── PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw │ │ ├── PCB_LabelMarkersUF_8Way_1280x720_32u.raw │ │ ├── PCB_METAL_509x335_8u.raw │ │ ├── PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw │ │ ├── PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw │ │ ├── 
PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw │ │ └── lena_512x512_8u.raw ├── cannyEdgeDetectorPython │ ├── README.md │ ├── Teapot.jpg │ ├── Teapot_resolutions │ │ ├── out_npp_1280x720.png │ │ ├── out_npp_1920x1080.png │ │ ├── out_npp_2560x1440.png │ │ ├── out_npp_320x180.png │ │ ├── out_npp_3840x2160.png │ │ ├── out_npp_5120x2880.png │ │ ├── out_npp_640x360.png │ │ ├── out_npp_800x600.png │ │ └── performance_results.csv │ └── cannyEdgeDetector.py ├── distanceTransform │ ├── CMakeLists.txt │ ├── DistanceTransformTrue_Dolphin1_319x319_16u.jpg │ ├── README.md │ ├── dolphin1_Input_319x319_8u.jpg │ ├── images │ │ ├── Dolphin1_313x317_8u.raw │ │ └── TestImage3_diamond_64x64_8u.raw │ └── unsignedAndSignedDistanceTransformNPPPlus.cpp ├── findContour │ ├── CMakeLists.txt │ ├── CircuitBoard_2048x1024_8u.jpg │ ├── CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg │ ├── CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg │ ├── CircuitBoard_Contours_8Way_2048x1024_8u.jpg │ ├── CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg │ ├── README.md │ ├── contour_info.log │ ├── findContourNPPPlus.cpp │ └── images │ │ └── CircuitBoard_2048x1024_8u.raw ├── floodFill │ ├── CMakeLists.txt │ ├── README.md │ ├── floodFillVariousRegionTypesNPPPlus.cpp │ └── images │ │ ├── CT_skull_512x512_8u_Gray.raw │ │ ├── Corn_614x461_8u_Gray.raw │ │ ├── DistanceSampler_512x512_8u.raw │ │ ├── DistanceSampler_512x512_Inverted_8u.raw │ │ ├── RainbowChart_RGB_C3_1024x445_8u.raw │ │ ├── RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw │ │ ├── RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw │ │ ├── RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw │ │ ├── Rocks_512x512_8u_Gray.raw │ │ ├── SeabedSampler_RGB_C3_675x1024_8u.raw │ │ ├── SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw │ │ ├── SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw │ │ ├── SignedCircle_256x206_64f.raw │ │ ├── SignedCircle_256x206_Inverted_64f.raw │ │ └── 
SignedLith_554x554_32f.raw └── watershedSegmentation │ ├── CMakeLists.txt │ ├── README.md │ ├── images │ ├── CT_skull_512x512_8u_Gray.raw │ ├── CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw │ ├── CT_skull_SegmentBoundaries_8Way_512x512_8u.raw │ ├── CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw │ ├── CT_skull_Segments_8Way_512x512_8u.raw │ ├── Corn_614x461_8u_Gray.raw │ ├── Corn_CompressedSegmentLabels_8Way_614x461_32u.raw │ ├── Corn_SegmentBoundaries_8Way_614x461_8u.raw │ ├── Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw │ ├── Corn_Segments_8Way_614x461_8u.raw │ ├── DistanceSampler_512x512_8u.raw │ ├── DistanceSampler_512x512_Inverted_8u.raw │ ├── RainbowChart_RGB_C3_1024x445_8u.raw │ ├── Rocks_512x512_8u_Gray.raw │ ├── Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw │ ├── Rocks_SegmentBoundaries_8Way_512x512_8u.raw │ ├── Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw │ ├── Rocks_Segments_8Way_512x512_8u.raw │ ├── SeabedSampler_RGB_C3_675x1024_8u.raw │ ├── SignedCircle_256x206_64f.raw │ ├── SignedCircle_256x206_Inverted_64f.raw │ └── SignedLith_554x554_32f.raw │ ├── watershedSegmentationNPPPlus.cpp │ └── watershedSegmentationNPPPlus.h ├── NPP ├── README.md ├── batchedLabelMarkersAndCompression │ ├── CMakeLists.txt │ ├── README.md │ ├── batchedLabelMarkersAndCompression.cpp │ ├── batchedLabelMarkersAndCompression.h │ ├── dirent.h │ └── images │ │ ├── CT_skull_512x512_8u.raw │ │ ├── CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw │ │ ├── CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw │ │ ├── CT_skull_LabelMarkersUF_8Way_512x512_32u.raw │ │ ├── Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw │ │ ├── Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw │ │ ├── Lena_LabelMarkersUF_8Way_512x512_32u.raw │ │ ├── PCB2_1024x683_8u.raw │ │ ├── PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw │ │ ├── PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw │ │ ├── PCB2_LabelMarkersUF_8Way_1024x683_32u.raw │ │ ├── 
PCB_1280x720_8u.raw │ │ ├── PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw │ │ ├── PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw │ │ ├── PCB_LabelMarkersUF_8Way_1280x720_32u.raw │ │ ├── PCB_METAL_509x335_8u.raw │ │ ├── PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw │ │ ├── PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw │ │ ├── PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw │ │ └── lena_512x512_8u.raw ├── distanceTransform │ ├── CMakeLists.txt │ ├── DistanceTransformTrue_Dolphin1_319x319_16u.jpg │ ├── README.md │ ├── distanceTransform.cpp │ ├── dolphin1_Input_319x319_8u.jpg │ └── images │ │ ├── Dolphin1_313x317_8u.raw │ │ └── TestImage3_diamond_64x64_8u.raw ├── findContour │ ├── CMakeLists.txt │ ├── CircuitBoard_2048x1024_8u.jpg │ ├── CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg │ ├── CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg │ ├── CircuitBoard_Contours_8Way_2048x1024_8u.jpg │ ├── CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg │ ├── README.md │ ├── contour_info.log │ ├── findContour.cpp │ └── images │ │ └── CircuitBoard_2048x1024_8u.raw └── watershedSegmentation │ ├── CMakeLists.txt │ ├── README.md │ ├── images │ ├── CT_skull_512x512_8u_Gray.raw │ ├── CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw │ ├── CT_skull_SegmentBoundaries_8Way_512x512_8u.raw │ ├── CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw │ ├── CT_skull_Segments_8Way_512x512_8u.raw │ ├── Lena_512x512_8u_Gray.raw │ ├── Lena_CompressedSegmentLabels_8Way_512x512_32u.raw │ ├── Lena_SegmentBoundaries_8Way_512x512_8u.raw │ ├── Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw │ ├── Lena_Segments_8Way_512x512_8u.raw │ ├── Rocks_512x512_8u_Gray.raw │ ├── Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw │ ├── Rocks_SegmentBoundaries_8Way_512x512_8u.raw │ ├── Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw │ ├── Rocks_Segments_8Way_512x512_8u.raw │ ├── coins_500x383_8u_Gray.raw │ └── coins_overlay_500x569_8u_Gray.raw 
│ ├── watershedSegmentation.cpp │ └── watershedSegmentation.h ├── README.md ├── cuBLAS ├── Emulation │ ├── bf16x9_gemmEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_GemmEx_example.cu │ └── bf16x9_sgemm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_sgemm_example.cu ├── Extensions │ ├── AxpyEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_AxpyEx_example.cu │ ├── Cherk3mEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_Cherk3mEx_example.cu │ ├── CherkEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_CherkEx_example.cu │ ├── Csyrk3mEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_Csyrk3mEx_example.cu │ ├── CsyrkEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_CsyrkEx_example.cu │ ├── DotEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── cublas_DotEx_example.cu │ │ └── cublas_DotcEx_example.cu │ ├── GemmBatchedEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_GemmBatchedEx_example.cu │ ├── GemmEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_GemmEx_example.cu │ ├── GemmGroupedBatchedEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_GemmGroupedBatchedEx_example.cu │ ├── GemmStridedBatchedEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_GemmStridedBatchedEx_example.cu │ ├── Nrm2Ex │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_Nrm2Ex_example.cu │ ├── RotEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_RotEx_example.cu │ ├── ScalEx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_ScalEx_example.cu │ ├── dgmm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_dgmm_example.cu │ ├── geam │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── 
README.md │ │ └── cublas_geam_example.cu │ ├── tpttr │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_tpttr_example.cu │ └── trttp │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trttp_example.cu ├── Level-1 │ ├── amax │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_amax_example.cu │ ├── amin │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_amin_example.cu │ ├── asum │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_asum_example.cu │ ├── axpy │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_axpy_example.cu │ ├── copy │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_copy_example.cu │ ├── dot │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── cublas_dot_example.cu │ │ └── cublas_dotc_example.cu │ ├── nrm2 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_nrm2_example.cu │ ├── rot │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_rot_example.cu │ ├── rotg │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_rotg_example.cu │ ├── rotm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_rotm_example.cu │ ├── rotmg │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_rotmg_example.cu │ ├── scal │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_scal_example.cu │ └── swap │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_swap_example.cu ├── Level-2 │ ├── gbmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gbmv_example.cu │ ├── gemv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemv_example.cu │ ├── ger │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_ger_example.cu │ ├── hbmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── 
README.md │ │ └── cublas_hbmv_example.cu │ ├── hemv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_hemv_example.cu │ ├── her │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_her_example.cu │ ├── her2 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_her2_example.cu │ ├── hpmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_hpmv_example.cu │ ├── hpr │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_hpr_example.cu │ ├── hpr2 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_hpr2_example.cu │ ├── sbmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_sbmv_example.cu │ ├── spmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_spmv_example.cu │ ├── spr │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_spr_example.cu │ ├── spr2 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_spr2_example.cu │ ├── symv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_symv_example.cu │ ├── syr │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_syr_example.cu │ ├── syr2 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_syr2_example.cu │ ├── tbmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_tbmv_example.cu │ ├── tbsv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_tbsv_example.cu │ ├── tpmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_tpmv_example.cu │ ├── tpsv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_tpsv_example.cu │ ├── trmv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trmv_example.cu │ └── trsv │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trsv_example.cu ├── Level-3 │ ├── gemm │ │ 
├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemm_example.cu │ ├── gemm3m │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemm3m_example.cu │ ├── gemmBatched │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemmBatched_example.cu │ ├── gemmGroupedBatched │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemmGroupedBatched_example.cu │ ├── gemmStridedBatched │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_gemmStridedBatched_example.cu │ ├── hemm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_hemm_example.cu │ ├── her2k │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_her2k_example.cu │ ├── herk │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_herk_example.cu │ ├── herkx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_herkx_example.cu │ ├── symm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_symm_example.cu │ ├── syr2k │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_syr2k_example.cu │ ├── syrk │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_syrk_example.cu │ ├── syrkx │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_syrkx_example.cu │ ├── trmm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trmm_example.cu │ ├── trsm │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trsm_example.cu │ └── trsmBatched │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── cublas_trsmBatched_example.cu ├── README.md ├── cmake │ └── cublas_example.cmake └── utils │ └── cublas_utils.h ├── cuBLASLt ├── CMakeLists.txt ├── Common │ ├── LtMatmulCustomFind.h │ ├── helpers.cpp │ └── helpers.h ├── LtBlk128x128Fp8Matmul │ ├── CMakeLists.txt │ ├── main.cpp │ ├── 
sample_cublasLt_LtBlk128x128Fp8Matmul.cu │ └── sample_cublasLt_LtBlk128x128Fp8Matmul.h ├── LtDgemmPresetAlgo │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtDgemmPresetAlgo.cu │ └── sample_cublasLt_LtDgemmPresetAlgo.h ├── LtFp8CustomFind │ ├── CMakeLists.txt │ └── main.cpp ├── LtFp8Matmul │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtFp8Matmul.cu │ └── sample_cublasLt_LtFp8Matmul.h ├── LtHSHgemmPointerArrayBatchSimple │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtHSHgemmPointerArrayBatchSimple.cu │ └── sample_cublasLt_LtHSHgemmPointerArrayBatchSimple.h ├── LtHSHgemmStridedBatchSimple │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtHSHgemmStridedBatchSimple.cu │ └── sample_cublasLt_LtHSHgemmStridedBatchSimple.h ├── LtIgemmTensor │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtIgemmTensor.cu │ └── sample_cublasLt_LtIgemmTensor.h ├── LtMxfp8Matmul │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtMxfp8Matmul.cu │ └── sample_cublasLt_LtMxfp8Matmul.h ├── LtNvfp4Matmul │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtNvfp4Matmul.cu │ └── sample_cublasLt_LtNvfp4Matmul.h ├── LtPlanarComplex │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtPlanarComplex.cu │ └── sample_cublasLt_LtPlanarComplex.h ├── LtSgemm │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtSgemm.cu │ └── sample_cublasLt_LtSgemm.h ├── LtSgemmCustomFind │ ├── CMakeLists.txt │ └── main.cpp ├── LtSgemmSimpleAutoTuning │ ├── CMakeLists.txt │ ├── main.cpp │ ├── sample_cublasLt_LtSgemmSimpleAutoTuning.cu │ └── sample_cublasLt_LtSgemmSimpleAutoTuning.h └── README.md ├── cuBLASMp ├── CMakeLists.txt ├── README.md ├── helpers.h ├── matrix_generator.hxx ├── pgeadd.cu ├── pgemm.cu ├── pmatmul.cu ├── pmatmul_ar.cu ├── psyrk.cu ├── ptradd.cu └── ptrsm.cu ├── cuDSS ├── README.md ├── get_set │ ├── CMakeLists.txt │ └── get_set.cpp ├── memory_handler │ ├── CMakeLists.txt │ └── memory_handler.cpp ├── simple │ ├── 
CMakeLists.txt │ └── simple.cpp ├── simple_batch │ ├── CMakeLists.txt │ └── simple_batch.cpp ├── simple_complex │ ├── CMakeLists.txt │ └── simple_complex.cpp ├── simple_hybrid_execution_mode │ ├── CMakeLists.txt │ └── simple_hybrid_execution_mode.cpp ├── simple_hybrid_memory_mode │ ├── CMakeLists.txt │ └── simple_hybrid_memory_mode.cpp ├── simple_mgmn_mode │ ├── CMakeLists.txt │ └── simple_mgmn_mode.cpp └── simple_multithreaded_mode │ ├── CMakeLists.txt │ └── simple_multithreaded_mode.cpp ├── cuFFT ├── 1d_c2c │ ├── .gitignore │ ├── 1d_c2c_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 1d_mgpu_c2c │ ├── .gitignore │ ├── 1d_mgpu_c2c_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 1d_r2c_c2r │ ├── .gitignore │ ├── 1d_r2c_c2r_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 2d_c2r_r2c │ ├── .gitignore │ ├── 2d_c2r_r2c_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 3d_c2c │ ├── .gitignore │ ├── 3d_c2c_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 3d_mgpu_c2c │ ├── .gitignore │ ├── 3d_mgpu_c2c_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── 3d_mgpu_r2c_c2r │ ├── .gitignore │ ├── 3d_mgpu_r2c_c2r_example.cpp │ ├── CMakeLists.txt │ ├── Makefile │ └── README.md ├── README.md ├── lto_callback_window_1d │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── src │ │ ├── common.cpp │ │ ├── common.h │ │ ├── nvrtc_helper.h │ │ ├── r2c_c2r_legacy_callback_example.cu │ │ ├── r2c_c2r_lto_callback_device.cu │ │ ├── r2c_c2r_lto_callback_example.cpp │ │ ├── r2c_c2r_lto_nvrtc_callback_example.cpp │ │ ├── r2c_c2r_reference.cu │ │ └── r2c_c2r_reference.h ├── lto_ea │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── src │ │ ├── common.cpp │ │ ├── common.h │ │ ├── nvrtc_helper.h │ │ ├── r2c_c2r_callback_example.cu │ │ ├── r2c_c2r_lto_callback_device.cu │ │ ├── r2c_c2r_lto_callback_example.cpp │ │ ├── r2c_c2r_lto_nvrtc_callback_example.cpp │ │ ├── 
r2c_c2r_reference.cu │ │ └── r2c_c2r_reference.h └── utils │ └── cufft_utils.h ├── cuFFTMp ├── Fortran_samples │ ├── Fortran_wrappers_nvhpc │ │ ├── cufft.mod │ │ ├── cufftxt.mod │ │ ├── libattachcommWrapper.a │ │ └── libnvhpcwrapcufftxt.a │ ├── c2c │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_c2c.f90 │ ├── c2c_no_descriptors │ │ ├── Makefile │ │ └── cufftmp_c2c_no_descriptors.f90 │ ├── c2c_pencils │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_c2c_pencils.f90 │ ├── common.mk │ ├── r2c_c2r │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_r2c.f90 │ ├── r2c_c2r_no_descriptors │ │ ├── Makefile │ │ └── cufftmp_r2c_c2r_no_descriptors.f90 │ ├── r2c_c2r_pencils │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_r2c_c2r_pencils.f90 │ ├── r2c_c2r_shared_scratch │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_r2c_workarea.f90 │ └── reshape │ │ ├── Makefile │ │ ├── README.md │ │ └── cufftmp_reshape.f90 ├── JAX_FFT │ ├── Dockerfile │ ├── README.md │ ├── misc │ │ ├── strong.png │ │ ├── strong_eos.png │ │ └── weak.png │ ├── pyproject.toml │ ├── setup.py │ ├── src │ │ ├── cufftmp_jax │ │ │ ├── CMakeLists.txt │ │ │ ├── NOTICE │ │ │ ├── __init__.py │ │ │ ├── cufftmp_jax.py │ │ │ └── src │ │ │ │ ├── gpu_ops.cpp │ │ │ │ ├── kernel_helpers.h │ │ │ │ ├── kernels.cu │ │ │ │ ├── kernels.h │ │ │ │ └── pybind11_kernel_helpers.h │ │ ├── fft_common │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ └── xfft │ │ │ ├── __init__.py │ │ │ └── xfft.py │ └── tests │ │ ├── fft_test.py │ │ └── helpers.py ├── README.md ├── extra_bootstraps │ ├── Makefile │ └── README.md └── samples │ ├── c2c │ ├── Makefile │ ├── README.md │ └── cufftmp_c2c.cu │ ├── c2c_no_descriptors │ ├── Makefile │ ├── README.md │ └── cufftmp_c2c_no_descriptors.cu │ ├── c2c_no_descriptors_cufftMpMakePlan │ ├── README.md │ └── cufftmp_c2c_no_descriptors_cufftMpMakePlan.cu │ ├── c2c_pencils │ ├── Makefile │ ├── README.md │ └── cufftmp_c2c_pencils.cu │ ├── common.mk │ ├── common │ ├── README.md │ ├── error_checks.hpp │ ├── 
generate_random.hpp │ └── scaling.cuh │ ├── iterators │ └── box_iterator.hpp │ ├── r2c_c2r │ ├── Makefile │ ├── README.md │ └── cufftmp_r2c_c2r.cu │ ├── r2c_c2r_no_descriptors │ ├── Makefile │ ├── README.md │ └── cufftmp_r2c_c2r_no_descriptors.cu │ ├── r2c_c2r_pencils │ ├── Makefile │ ├── README.md │ └── cufftmp_r2c_c2r_pencils.cu │ ├── r2c_c2r_pencils_cufftMpMakePlan │ ├── README.md │ └── cufftmp_r2c_c2r_pencils_cufftMpMakePlan.cu │ ├── r2c_c2r_shared_scratch │ ├── Makefile │ ├── README.md │ └── cufftmp_r2c_c2r_shared_scratch.cu │ ├── r2c_c2r_slabs_GROMACS │ ├── Makefile │ ├── README.md │ └── cufftmp_r2c_c2r_slabs_GROMACS.cu │ └── reshape │ ├── Makefile │ ├── README.md │ └── cufftmp_reshape.cu ├── cuPQC ├── Makefile ├── README.md ├── example_ml_dsa.cu ├── example_ml_kem.cu ├── example_sha2.cu └── example_sha3.cu ├── cuRAND ├── .gitignore ├── Host │ ├── mrg32k3a │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_mrg32k3a_lognormal_example.cpp │ │ ├── curand_mrg32k3a_normal_example.cpp │ │ ├── curand_mrg32k3a_poisson_example.cpp │ │ └── curand_mrg32k3a_uniform_example.cpp │ ├── mt19937 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_mt19937_lognormal_example.cpp │ │ ├── curand_mt19937_normal_example.cpp │ │ ├── curand_mt19937_poisson_example.cpp │ │ └── curand_mt19937_uniform_example.cpp │ ├── mtgp32 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_mtgp32_lognormal_example.cpp │ │ ├── curand_mtgp32_normal_example.cpp │ │ ├── curand_mtgp32_poisson_example.cpp │ │ └── curand_mtgp32_uniform_example.cpp │ ├── philox │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_philox_lognormal_example.cpp │ │ ├── curand_philox_normal_example.cpp │ │ ├── curand_philox_poisson_example.cpp │ │ └── curand_philox_uniform_example.cpp │ ├── scrambled_sobol32 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ 
├── README.md │ │ ├── curand_scrambled_sobol32_lognormal_example.cpp │ │ ├── curand_scrambled_sobol32_normal_example.cpp │ │ ├── curand_scrambled_sobol32_poisson_example.cpp │ │ └── curand_scrambled_sobol32_uniform_example.cpp │ ├── scrambled_sobol64 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_scrambled_sobol64_lognormal_example.cpp │ │ ├── curand_scrambled_sobol64_normal_example.cpp │ │ ├── curand_scrambled_sobol64_poisson_example.cpp │ │ └── curand_scrambled_sobol64_uniform_example.cpp │ ├── sobol32 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_sobol32_lognormal_example.cpp │ │ ├── curand_sobol32_normal_example.cpp │ │ ├── curand_sobol32_poisson_example.cpp │ │ └── curand_sobol32_uniform_example.cpp │ ├── sobol64 │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_sobol64_lognormal_example.cpp │ │ ├── curand_sobol64_normal_example.cpp │ │ ├── curand_sobol64_poisson_example.cpp │ │ └── curand_sobol64_uniform_example.cpp │ └── xorwow │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── curand_xorwow_lognormal_example.cpp │ │ ├── curand_xorwow_normal_example.cpp │ │ ├── curand_xorwow_poisson_example.cpp │ │ └── curand_xorwow_uniform_example.cpp ├── README.md ├── cmake │ └── curand_example.cmake └── utils │ └── curand_utils.h ├── cuSOLVER ├── MgGetrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_MgGetrf_example.cu ├── MgPotrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── cusolver_MgPotrf_example1.cu │ └── cusolver_MgPotrf_example2.cu ├── MgSyevd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── cusolver_MgSyevd_example1.cu │ ├── cusolver_MgSyevd_example2.cu │ └── cusolver_MgSyevd_example3.cu ├── README.md ├── Xgeev │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── cusolver_Xgeev_example1.cu │ ├── cusolver_Xgeev_example2.cu │ └── 
cusolver_Xgeev_example3.cu ├── Xgeqrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xgeqrf_example.cu ├── Xgesvd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xgesvd_example.cu ├── Xgesvdp │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xgesvdp_example.cu ├── Xgesvdr │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xgesvdr_example.cu ├── Xgetrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xgetrf_example.cu ├── Xpotrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xpotrf_example.cu ├── Xsyevd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xsyevd_example.cu ├── Xsyevdx │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xsyevdx_example.cu ├── Xtrtri │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_Xtrtri_example.cu ├── cmake │ └── cusolver_example.cmake ├── csrqr │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── cusolver_csrqr_example1.cu │ └── cusolver_csrqr_example2.cu ├── gesv │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── cusolver_irs_expert_cuda-10.2.cu │ ├── cusolver_irs_expert_cuda-11.cu │ └── cusolver_irs_lapack.cu ├── gesvd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_gesvd_example.cu ├── gesvdaStridedBatched │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_gesvdaStridedBatched_example.cu ├── gesvdj │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_gesvdj_example.cu ├── gesvdjBatched │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_gesvdjBatched_example.cu ├── getrf │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_getrf_example.cu ├── orgqr │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_orgqr_example.cu ├── ormqr │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_ormqr_example.cu ├── potrfBatched │ ├── 
.gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_potrfBatched_example.cu ├── syevd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_syevd_example.cu ├── syevdx │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_syevdx_example.cu ├── syevj │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_syevj_example.cu ├── syevjBatched │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_syevjBatched_example.cu ├── sygvd │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_sygvd_example.cu ├── sygvdx │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_sygvdx_example.cu ├── sygvj │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ └── cusolver_sygvj_example.cu └── utils │ ├── cusolverMg_utils.h │ └── cusolver_utils.h ├── cuSOLVERMp ├── .gitignore ├── CMakeLists.txt ├── Makefile ├── README.md ├── helpers.h ├── mp_gels.c ├── mp_geqrf.c ├── mp_getrf_getrs.c ├── mp_potrf_potrs.c ├── mp_syevd.c └── mp_sygvd.c ├── cuSOLVERSp2cuDSS ├── CMakeLists.txt ├── README.md ├── csreigvsi2cuDSS_double.cpp ├── cuSolverRf2cuDSS.hpp ├── cuSolverRf2cuDSS_double.cpp ├── cuSolverSp2cuDSS.hpp ├── cuSolverSp2cuDSS_dcomplex.cpp ├── cuSolverSp2cuDSS_double.cpp ├── cuSolverSp2cuDSS_float.cpp ├── cuSolverSp2cuDSS_scomplex.cpp ├── test_complex.mtx ├── test_real.mtx └── utils.hpp ├── cuSPARSE ├── README.md ├── axpby │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── axpby.png │ └── axpby_example.c ├── bicgstab │ ├── BiCGStab.pdf │ ├── BiCGStab.png │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── bicgstab_example.c ├── cg │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── cg.pdf │ ├── cg.png │ └── cg_example.c ├── compression │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── compression_example.cpp ├── coosort │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── coosort_example.c ├── dense2sparse_blockedell │ ├── CMakeLists.txt │ ├── Makefile │ ├── 
README.md │ ├── dense2sparse_blockedell.png │ └── dense2sparse_blockedell_example.c ├── dense2sparse_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── dense2sparse_csr.png │ └── dense2sparse_csr_example.c ├── gather │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── gather.png │ └── gather_example.c ├── gpsvInterleavedBatch │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── gpsvInterleavedBatch_example.c ├── graph_capture │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── graph_capture_example.c ├── rot │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── rot.png │ └── rot_example.c ├── scatter │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── scatter.png │ └── scatter_example.c ├── sddmm_bsr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── sddmm_bsr.png │ └── sddmm_bsr_example.c ├── sddmm_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── sddmm_csr.png │ └── sddmm_csr_example.c ├── sddmm_csr_batched │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── sddmm_csr.png │ └── sddmm_csr_batched_example.c ├── sparse2dense_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── sparse2dense_csr.png │ └── sparse2dense_csr_example.c ├── spgemm │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spgemm.png │ └── spgemm_example.c ├── spgemm_mem │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spgemm.png │ └── spgemm_mem_example.c ├── spgemm_reuse │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spgemm.png │ └── spgemm_reuse_example.c ├── spmm_blockedell │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmm_blockedell.png │ └── spmm_blockedell_example.cpp ├── spmm_coo │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmm_coo.png │ └── spmm_coo_example.c ├── spmm_coo_batched │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmm_coo_batched.png │ └── spmm_coo_batched_example.c ├── spmm_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── 
spmm_csr.png │ └── spmm_csr_example.c ├── spmm_csr_batched │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmm_csr_batched.png │ └── spmm_csr_batched_example.c ├── spmm_csr_op │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmm_csr_op.png │ └── spmm_csr_op_example.c ├── spmv_coo │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmv_coo.png │ └── spmv_coo_example.c ├── spmv_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmv_csr.png │ └── spmv_csr_example.c ├── spmv_sell │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spmv_sell.png │ └── spmv_sell_example.c ├── spsm_coo │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spsm_coo.png │ └── spsm_coo_example.c ├── spsm_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spsm_csr.png │ └── spsm_csr_example.c ├── spsv_coo │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spsv_coo.png │ └── spsv_coo_example.c ├── spsv_csr │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spsv_csr.png │ └── spsv_csr_example.c ├── spsv_sell │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spsv_sell.png │ └── spsv_sell_example.c └── spvv │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── spvv.png │ └── spvv_example.c ├── cuSPARSELt ├── README.md ├── matmul │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── matmul_example.cpp └── matmul_advanced │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ └── matmul_advanced_example.cpp ├── cuTENSOR ├── CMakeLists.txt ├── Makefile ├── README.md ├── contraction.cu ├── contraction_jit.cu ├── contraction_plan_cache.cu ├── einsum.cu ├── elementwise_binary.cu ├── elementwise_permute.cu ├── elementwise_permute_padding.cu ├── elementwise_trinary.cu ├── python │ ├── MANIFEST.in │ ├── README.md │ ├── cutensor │ │ ├── __init__.py │ │ ├── c_extensions.py │ │ ├── c_extensions_utils.py │ │ ├── common.py │ │ ├── package_info.py │ │ ├── tensorflow │ │ │ ├── __init__.py │ │ │ ├── einsum.py │ │ │ ├── 
einsum_kernel.cc │ │ │ ├── einsum_module.cc │ │ │ ├── einsum_ops.cc │ │ │ └── einsum_test.py │ │ └── torch │ │ │ ├── __init__.py │ │ │ ├── einsum.cc │ │ │ ├── einsum.py │ │ │ └── einsum_test.py │ ├── einsum.h │ └── setup.py └── reduction.cu ├── cuTENSORMg ├── CMakeLists.txt ├── blog_post.cu └── contraction_multi_gpu.cu ├── nvCOMP ├── README.md ├── benchmarks │ ├── CMakeLists.txt │ ├── README.md │ ├── benchmark_ans_chunked.cu │ ├── benchmark_bitcomp_chunked.cu │ ├── benchmark_cascaded_chunked.cu │ ├── benchmark_common.h │ ├── benchmark_deflate_chunked.cu │ ├── benchmark_gdeflate_chunked.cu │ ├── benchmark_hlif.cpp │ ├── benchmark_hlif.hpp │ ├── benchmark_lz4_chunked.cu │ ├── benchmark_snappy_chunked.cu │ ├── benchmark_template_chunked.cuh │ ├── benchmark_zstd_chunked.cu │ └── text_to_binary.py └── examples │ ├── BatchData.h │ ├── BatchDataCPU.h │ ├── CMakeLists.txt │ ├── README.md │ ├── deflate_cpu_compression.cu │ ├── deflate_cpu_decompression.cu │ ├── gdeflate_cpu_compression.cu │ ├── gdeflate_cpu_decompression.cu │ ├── gzip_gpu_decompression.cu │ ├── high_level_quickstart_example.cpp │ ├── low_level_quickstart_example.cpp │ ├── lz4_cpu_compression.cu │ ├── lz4_cpu_decompression.cu │ ├── nvcomp_gds.cu │ ├── python │ └── nvcomp_basic.ipynb │ ├── snappy_cpu_compression.cu │ ├── snappy_cpu_decompression.cu │ ├── util.h │ ├── zstd_cpu_compression.cu │ └── zstd_cpu_decompression.cu ├── nvJPEG ├── Image-Resize-WaterMark │ ├── CMakeLists.txt │ ├── NVLogo.jpg │ ├── NVLogo.png │ ├── README.md │ ├── imageResizeWatermark.cpp │ ├── imageResizeWatermark.h │ ├── img9.png │ ├── img9wm.png │ └── input_images │ │ ├── cat.jpg │ │ ├── cat_baseline.jpg │ │ ├── cat_grayscale.jpg │ │ ├── img1.jpg │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.jpg │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ └── img9.jpg ├── Image-Resize │ ├── CMakeLists.txt │ ├── README.md │ ├── imageResize.cpp │ ├── imageResize.h │ └── input_images │ │ ├── cat.jpg │ │ ├── cat_baseline.jpg │ 
│ ├── cat_grayscale.jpg │ │ ├── img1.jpg │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.jpg │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ └── img9.jpg ├── README.md ├── nvJPEG-Decoder-Backend-ROI │ ├── CMakeLists.txt │ ├── README.md │ ├── img9_roi.png │ ├── input_images │ │ ├── cat.jpg │ │ ├── cat_baseline.jpg │ │ ├── cat_grayscale.jpg │ │ ├── img1.jpg │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.jpg │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ └── img9.jpg │ ├── nvJPEGROIDecode.cpp │ ├── nvJPEGROIDecode.h │ └── threadpool.h ├── nvJPEG-Decoder-MultipleInstances │ ├── CMakeLists.txt │ ├── README.md │ ├── nvJPEGDecMultipleInstances.cpp │ ├── nvJPEGDecMultipleInstances.h │ └── threadpool.h ├── nvJPEG-Decoder │ ├── CMakeLists.txt │ ├── README.md │ ├── input_images │ │ ├── cat.jpg │ │ ├── cat_baseline.jpg │ │ ├── cat_grayscale.jpg │ │ ├── img1.jpg │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.jpg │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ └── img9.jpg │ ├── nvjpegDecoder.cpp │ └── nvjpegDecoder.h └── nvJPEG-Encoder-MultipleInstances │ ├── CMakeLists.txt │ ├── README.md │ └── nvJPEGEncMultipleInstances.cpp ├── nvJPEG2000 ├── README.md ├── nvJPEG2000-Decoder-Pipelined │ ├── CMakeLists.txt │ ├── README.md │ ├── nvjpeg2k_dec_pipelined.cpp │ └── nvjpeg2k_dec_pipelined.h ├── nvJPEG2000-Decoder-Tile-Partial │ ├── CMakeLists.txt │ ├── README.md │ ├── nvj2k_DecodeTilePartial.cpp │ └── nvj2k_DecodeTilePartial.h ├── nvJPEG2000-Decoder │ ├── CMakeLists.txt │ ├── README.md │ ├── images │ │ ├── 2k_image_lossless │ │ │ └── 2k_lossless.jp2 │ │ ├── 2k_image_lossy │ │ │ └── 2k_lossy.jp2 │ │ └── 4k_image_lossy │ │ │ └── 4k_lossy.jp2 │ ├── nvjpeg2000DecodeSample.cpp │ └── nvjpeg2000DecodeSample.h └── nvJPEG2000-Encoder │ ├── CMakeLists.txt │ ├── README.md │ ├── images │ └── TestImage640x480.bmp │ ├── nvjpeg2k_encode.cpp │ └── nvjpeg2k_encode.h └── nvTIFF ├── README.md ├── nvTIFF-Decode-Encode ├── 
CMakeLists.txt ├── README.md ├── getopt.h ├── images │ └── bali_notiles.tif └── nvtiff_example.cpp ├── nvTIFF-Decode-Image-ROI ├── CMakeLists.txt ├── README.md └── nvtiff_decode_image_roi.cpp └── nvTIFF-GeoTIFF-Decode ├── CMakeLists.txt ├── README.md ├── getopt.h ├── images └── bali_notiles.tif └── nvtiff_geotiff_decode.cpp /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 NVIDIA CORPORATION AND AFFILIATES. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted 4 | provided that the following conditions are met: 5 | * Redistributions of source code must retain the above copyright notice, this list of 6 | conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright notice, this list of 8 | conditions and the following disclaimer in the documentation and/or other materials 9 | provided with the distribution. 10 | * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used 11 | to endorse or promote products derived from this software without specific prior written 12 | permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 15 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 16 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 18 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 19 | OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 20 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 21 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 | -------------------------------------------------------------------------------- /MathDx/cuBLASDx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /MathDx/cuBLASDx/reference/naive_reference.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP 2 | #define CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP 3 | 4 | #include 5 | #include "../common.hpp" 6 | 7 | namespace example { 8 | template 9 | void reference_gemm_naive_device(const unsigned int m, 10 | const unsigned int n, 11 | const unsigned int k, 12 | const ValueType alpha, 13 | example::device_vector& A, 14 | const unsigned int lda, 15 | cublasdx::arrangement arr_a, 16 | example::device_vector& B, 17 | const unsigned int ldb, 18 | cublasdx::arrangement arr_b, 19 | const ValueType beta, 20 | example::device_vector& C, 21 | const unsigned int ldc, 22 | cublasdx::arrangement arr_c); 23 | } // namespace example 24 | 25 | #endif // CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP 26 | -------------------------------------------------------------------------------- /MathDx/cuFFTDx/01_simple_fft_thread/02_simple_fft_thread_lto_cases.csv: -------------------------------------------------------------------------------- 1 | exec_op,size,type,direction,precision 2 | Thread,8,fft_type::c2c,fft_direction::forward,double -------------------------------------------------------------------------------- /MathDx/cuFFTDx/02_simple_fft_block/10_simple_fft_block_c2r_lto_cases.csv: -------------------------------------------------------------------------------- 1 | exec_op,size,type,precision,elements_per_thread,real_mode 2 | Block,128,fft_type::c2r,float,8,real_mode::normal -------------------------------------------------------------------------------- /MathDx/cuFFTDx/03_block_fft_performance/00_block_fft_performance.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "block_fft_performance.hpp" 7 | 8 | template 9 | void block_fft_performance() { 10 | using namespace cufftdx; 11 | 12 | using fft_base = decltype(Block() + Type() + Direction() + 13 | Precision() + SM()); 14 | 15 | static constexpr unsigned int elements_per_thread = 8; 16 | static constexpr unsigned int fft_size = 512; 17 | static constexpr unsigned int ffts_per_block = 1; 18 | 19 | cudaStream_t stream; 20 | CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)) 21 | benchmark_block_fft(stream, true); 22 | CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); 23 | } 24 | 25 | template 26 | struct block_fft_performance_functor { 27 | void operator()() { return block_fft_performance(); } 28 | }; 29 | 30 | int main(int, char**) { 31 | return example::sm_runner(); 32 | } 33 | -------------------------------------------------------------------------------- /MathDx/cuFFTDx/03_block_fft_performance/02_block_fft_lto_ptx_performance_cases.csv: -------------------------------------------------------------------------------- 1 | size,direction,type,precision,exec_op 2 | 16,fft_direction::forward,fft_type::c2c,float,Block 3 | 32,fft_direction::forward,fft_type::c2c,float,Block 4 | 64,fft_direction::forward,fft_type::c2c,float,Block 5 | 128,fft_direction::forward,fft_type::c2c,float,Block 6 | 256,fft_direction::forward,fft_type::c2c,float,Block 7 | 512,fft_direction::forward,fft_type::c2c,float,Block 8 | 1024,fft_direction::forward,fft_type::c2c,float,Block 9 | 2048,fft_direction::forward,fft_type::c2c,float,Block 10 | 4096,fft_direction::forward,fft_type::c2c,float,Block 11 | 8192,fft_direction::forward,fft_type::c2c,float,Block 12 | 16384,fft_direction::forward,fft_type::c2c,float,Block 13 | 544,fft_direction::forward,fft_type::c2c,float,Block 14 | 608,fft_direction::forward,fft_type::c2c,float,Block 15 | 
675,fft_direction::forward,fft_type::c2c,float,Block 16 | 686,fft_direction::forward,fft_type::c2c,float,Block 17 | 800,fft_direction::forward,fft_type::c2c,float,Block 18 | -------------------------------------------------------------------------------- /MathDx/cuFFTDx/09_introduction_lto_example/00_introduction_lto_cases.csv: -------------------------------------------------------------------------------- 1 | size,direction,exec_op 2 | 128,fft_direction::forward,Block -------------------------------------------------------------------------------- /MathDx/cuFFTDx/10_cufft_device_api_example/cufft_device_api_lto_helper/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # Define project 4 | project(cufft_device_api_lto_helper LANGUAGES CXX) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | 9 | # Possible variables that can be set externally 10 | # cufft_ROOT: points to the cuFFT library root directory 11 | 12 | find_package(cufft 11.5.0 EXACT REQUIRED CONFIG 13 | PATHS 14 | "${PROJECT_SOURCE_DIR}/../../../../../cufft" 15 | "/opt/cufft" 16 | ) 17 | 18 | # Define the helper executable 19 | add_executable(cufft_device_api_lto_helper cufft_device_api_lto_helper.cpp) 20 | target_link_libraries(cufft_device_api_lto_helper 21 | PUBLIC 22 | cufft::cufft_static 23 | ) 24 | -------------------------------------------------------------------------------- /MathDx/cuFFTDx/lto_helper/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # Define project 4 | project(lto_helper LANGUAGES CXX) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | 9 | # Possible variables that can be set externally 10 | # cufft_ROOT: points to the cuFFT library root directory 11 | 12 | find_package(cufft 11.5.0 EXACT REQUIRED CONFIG 13 | PATHS 14 |
"${PROJECT_SOURCE_DIR}/../../../../cufft" 15 | "/opt/cufft" 16 | ) 17 | 18 | # Define the helper executable 19 | add_executable(cufftdx_cufft_lto_helper cufftdx_cufft_lto_helper.cpp) 20 | target_link_libraries(cufftdx_cufft_lto_helper 21 | PRIVATE 22 | cufft::cufft_static 23 | ) 24 | -------------------------------------------------------------------------------- /MathDx/cuSolverDx/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CUSOLVERDX_EXAMPLE_COMMON_HPP_ 2 | #define CUSOLVERDX_EXAMPLE_COMMON_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef CUSOLVERDX_EXAMPLE_NVRTC 9 | # include 10 | # include 11 | #endif 12 | 13 | #include "common/macros.hpp" 14 | #include "common/cudart.hpp" 15 | #include "common/error_checking.hpp" 16 | #include "common/measure.hpp" 17 | #include "common/numeric.hpp" 18 | #include "common/random.hpp" 19 | #include "common/example_sm_runner.hpp" 20 | #include "common/device_io.hpp" 21 | #include "common/print.hpp" 22 | #include "common/cusolver_reference_cholesky.hpp" 23 | #include "common/cusolver_reference_lu.hpp" 24 | 25 | // the nvcc bug in CUDA 12.2-12.4, fixed in 12.5 (NOTE(review): the guard below also enables the workaround for 12.5 -- confirm the intended upper bound) 26 | #ifdef __NVCC__ 27 | # if (__CUDACC_VER_MAJOR__ == 12 && (__CUDACC_VER_MINOR__ >= 2 && __CUDACC_VER_MINOR__ <= 5)) 28 | # define CUSOLVERDX_EXAMPLE_DETAIL_NVCC_12_2_BUG_WORKAROUND 1 29 | # endif 30 | #endif 31 | 32 | namespace example { 33 | // Used when CUSOLVERDX_EXAMPLE_DETAIL_NVCC_12_2_BUG_WORKAROUND is defined 34 | template 35 | using a_data_type_t = typename T::a_data_type; 36 | 37 | template 38 | using a_cuda_data_type_t = typename T::a_cuda_data_type; 39 | } // namespace example 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /MathDx/cuSolverDx/common/error_checking.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP 2 | #define
CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | #include "numeric.hpp" 12 | 13 | namespace common { 14 | 15 | template 16 | double check_error(const ResultType* data, const ReferenceType* reference, const std::size_t n, bool print = false, bool verbose = false); 17 | 18 | template 19 | bool is_error_acceptable(double tot_rel_err) { 20 | constexpr bool is_non_float_non_double_a_b_c = 21 | (!std::is_same_v && !std::is_same_v) || (!std::is_same_v> && !std::is_same_v>); 22 | 23 | if (is_non_float_non_double_a_b_c) { 24 | if (tot_rel_err > 1e-2) { 25 | std::cout << tot_rel_err << std::endl; 26 | return false; 27 | } 28 | } else { // A,B,C are either float or double 29 | if (tot_rel_err > 1e-3) { 30 | std::cout << tot_rel_err << std::endl; 31 | return false; 32 | } 33 | } 34 | return std::isfinite(tot_rel_err); 35 | } 36 | } // namespace common 37 | 38 | 39 | #endif // CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP 40 | -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw --------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw 
-------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP+/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw 
-------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot.jpg -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1280x720.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1280x720.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1920x1080.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1920x1080.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_2560x1440.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_2560x1440.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_320x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_320x180.png 
-------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_3840x2160.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_3840x2160.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_5120x2880.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_5120x2880.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_640x360.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_640x360.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_800x600.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_800x600.png -------------------------------------------------------------------------------- /NPP+/cannyEdgeDetectorPython/Teapot_resolutions/performance_results.csv: -------------------------------------------------------------------------------- 1 | Resolution,Megapixels,NPP Time (ms) 2 | 320x180,0.0576,0.04460153608769178 3 | 640x360,0.2304,0.04880320030450821 4 | 
800x600,0.48,0.054283583376556636 5 | 1280x720,0.9216,0.06511971176043153 6 | 1920x1080,2.0736,0.1064842866435647 7 | 2560x1440,3.6864,0.15593324881792067 8 | 3840x2160,8.2944,0.30110825645923617 9 | 5120x2880,14.7456,0.5023832985758782 10 | -------------------------------------------------------------------------------- /NPP+/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg -------------------------------------------------------------------------------- /NPP+/distanceTransform/dolphin1_Input_319x319_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/dolphin1_Input_319x319_8u.jpg -------------------------------------------------------------------------------- /NPP+/distanceTransform/images/Dolphin1_313x317_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/images/Dolphin1_313x317_8u.raw -------------------------------------------------------------------------------- /NPP+/distanceTransform/images/TestImage3_diamond_64x64_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/images/TestImage3_diamond_64x64_8u.raw -------------------------------------------------------------------------------- /NPP+/findContour/CircuitBoard_2048x1024_8u.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP+/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg -------------------------------------------------------------------------------- /NPP+/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP+/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP+/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg -------------------------------------------------------------------------------- /NPP+/findContour/images/CircuitBoard_2048x1024_8u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/images/CircuitBoard_2048x1024_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/CT_skull_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/CT_skull_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/Corn_614x461_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/Corn_614x461_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/DistanceSampler_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/DistanceSampler_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/DistanceSampler_512x512_Inverted_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/DistanceSampler_512x512_Inverted_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/RainbowChart_RGB_C3_1024x445_8u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_1024x445_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/Rocks_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/Rocks_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SeabedSampler_RGB_C3_675x1024_8u.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_675x1024_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SignedCircle_256x206_64f.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedCircle_256x206_64f.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SignedCircle_256x206_Inverted_64f.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedCircle_256x206_Inverted_64f.raw -------------------------------------------------------------------------------- /NPP+/floodFill/images/SignedLith_554x554_32f.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedLith_554x554_32f.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw 
-------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Corn_614x461_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_614x461_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Corn_CompressedSegmentLabels_8Way_614x461_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_CompressedSegmentLabels_8Way_614x461_32u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Corn_SegmentBoundaries_8Way_614x461_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_SegmentBoundaries_8Way_614x461_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Corn_Segments_8Way_614x461_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_Segments_8Way_614x461_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/DistanceSampler_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/DistanceSampler_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/DistanceSampler_512x512_Inverted_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/DistanceSampler_512x512_Inverted_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/RainbowChart_RGB_C3_1024x445_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/RainbowChart_RGB_C3_1024x445_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw 
-------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/SeabedSampler_RGB_C3_675x1024_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SeabedSampler_RGB_C3_675x1024_8u.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/SignedCircle_256x206_64f.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedCircle_256x206_64f.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/SignedCircle_256x206_Inverted_64f.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedCircle_256x206_Inverted_64f.raw -------------------------------------------------------------------------------- /NPP+/watershedSegmentation/images/SignedLith_554x554_32f.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedLith_554x554_32f.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw 
-------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw 
-------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw -------------------------------------------------------------------------------- /NPP/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg -------------------------------------------------------------------------------- /NPP/distanceTransform/dolphin1_Input_319x319_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/dolphin1_Input_319x319_8u.jpg -------------------------------------------------------------------------------- /NPP/distanceTransform/images/Dolphin1_313x317_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/images/Dolphin1_313x317_8u.raw -------------------------------------------------------------------------------- /NPP/distanceTransform/images/TestImage3_diamond_64x64_8u.raw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/images/TestImage3_diamond_64x64_8u.raw -------------------------------------------------------------------------------- /NPP/findContour/CircuitBoard_2048x1024_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg -------------------------------------------------------------------------------- /NPP/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg -------------------------------------------------------------------------------- /NPP/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg -------------------------------------------------------------------------------- /NPP/findContour/images/CircuitBoard_2048x1024_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/images/CircuitBoard_2048x1024_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- 
/NPP/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Lena_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Lena_CompressedSegmentLabels_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_CompressedSegmentLabels_8Way_512x512_32u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Lena_SegmentBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_SegmentBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Lena_Segments_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_Segments_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw -------------------------------------------------------------------------------- 
/NPP/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/coins_500x383_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/coins_500x383_8u_Gray.raw -------------------------------------------------------------------------------- /NPP/watershedSegmentation/images/coins_overlay_500x569_8u_Gray.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/coins_overlay_500x569_8u_Gray.raw 
-------------------------------------------------------------------------------- /cuBLAS/Emulation/bf16x9_gemmEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Emulation/bf16x9_sgemm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/AxpyEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/Cherk3mEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/CherkEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/Csyrk3mEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/CsyrkEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/DotEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/GemmBatchedEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 
-------------------------------------------------------------------------------- /cuBLAS/Extensions/GemmEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/GemmGroupedBatchedEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/GemmStridedBatchedEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/Nrm2Ex/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/RotEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/ScalEx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/dgmm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/geam/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/tpttr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 
-------------------------------------------------------------------------------- /cuBLAS/Extensions/tpttr/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Extension APIs - `cublas<t>tpttr` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `tpttr` function to perform the conversion from the triangular packed format to the triangular format. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 9 | | 3.0 | 4.0 | 10 | ``` 11 | 12 | See documentation for further details. 13 | 14 | ## Supported SM Architectures 15 | 16 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 17 | 18 | ## Supported OSes 19 | 20 | Linux 21 | Windows 22 | 23 | ## Supported CPU Architecture 24 | 25 | x86_64 26 | ppc64le 27 | arm64-sbsa 28 | 29 | ## CUDA APIs involved 30 | - [cublas\<t\>tpttr() API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-tpttr) 31 | 32 | # Building (make) 33 | 34 | # Prerequisites 35 | - A Linux/Windows system with recent NVIDIA drivers. 36 | - [CMake](https://cmake.org/download) version 3.18 minimum 37 | 38 | ## Build command on Linux 39 | ``` 40 | $ mkdir build 41 | $ cd build 42 | $ cmake .. 43 | $ make 44 | ``` 45 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 46 | 47 | ## Build command on Windows 48 | ``` 49 | $ mkdir build 50 | $ cd build 51 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 
52 | $ Open cublas_examples.sln project in Visual Studio and build 53 | ``` 54 | 55 | # Usage 56 | ``` 57 | $ ./cublas_tpttr_example 58 | ``` 59 | 60 | Sample example output: 61 | 62 | ``` 63 | AP 64 | 1.00 2.00 65 | 3.00 4.00 66 | ===== 67 | A 68 | 1.00 3.00 69 | 0.00 2.00 70 | ===== 71 | ``` 72 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/trttp/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Extensions/trttp/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Extension APIs - `cublas<t>trttp` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `trttp` function to perform the conversion from the triangular format to the triangular packed format. 6 | 7 | ``` 8 | A = | 1.0 | 0.0 | 9 | | 2.0 | 3.0 | 10 | ``` 11 | 12 | See documentation for further details. 13 | 14 | ## Supported SM Architectures 15 | 16 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 17 | 18 | ## Supported OSes 19 | 20 | Linux 21 | Windows 22 | 23 | ## Supported CPU Architecture 24 | 25 | x86_64 26 | ppc64le 27 | arm64-sbsa 28 | 29 | ## CUDA APIs involved 30 | - [cublas\<t\>trttp() API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-trttp) 31 | 32 | # Building (make) 33 | 34 | # Prerequisites 35 | - A Linux/Windows system with recent NVIDIA drivers. 36 | - [CMake](https://cmake.org/download) version 3.18 minimum 37 | 38 | ## Build command on Linux 39 | ``` 40 | $ mkdir build 41 | $ cd build 42 | $ cmake .. 43 | $ make 44 | ``` 45 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 
46 | 47 | ## Build command on Windows 48 | ``` 49 | $ mkdir build 50 | $ cd build 51 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 52 | $ Open cublas_examples.sln project in Visual Studio and build 53 | ``` 54 | 55 | # Usage 56 | ``` 57 | $ ./cublas_trttp_example 58 | ``` 59 | 60 | Sample example output: 61 | 62 | ``` 63 | A 64 | 1.00 3.00 65 | 2.00 4.00 66 | ===== 67 | AP 68 | 1.00 4.00 69 | 3.00 0.00 70 | ===== 71 | ``` 72 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/amax/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/amax/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-1 APIs - `cublasI<t>amax` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `amax` function to find the (smallest) index of the element of the maximum magnitude. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 | 9 | ``` 10 | 11 | See documentation for further details. 12 | 13 | ## Supported SM Architectures 14 | 15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 16 | 17 | ## Supported OSes 18 | 19 | Linux 20 | Windows 21 | 22 | ## Supported CPU Architecture 23 | 24 | x86_64 25 | ppc64le 26 | arm64-sbsa 27 | 28 | ## CUDA APIs involved 29 | - [cublasI\<t\>amax API](https://docs.nvidia.com/cuda/cublas/index.html#cublasi-t-amax) 30 | 31 | # Building (make) 32 | 33 | # Prerequisites 34 | - A Linux/Windows system with recent NVIDIA drivers. 35 | - [CMake](https://cmake.org/download) version 3.18 minimum 36 | 37 | ## Build command on Linux 38 | ``` 39 | $ mkdir build 40 | $ cd build 41 | $ cmake .. 42 | $ make 43 | ``` 44 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 
45 | 46 | ## Build command on Windows 47 | ``` 48 | $ mkdir build 49 | $ cd build 50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 51 | $ Open cublas_examples.sln project in Visual Studio and build 52 | ``` 53 | 54 | # Usage 55 | ``` 56 | $ ./cublas_amax_example 57 | ``` 58 | 59 | Sample example output: 60 | 61 | ``` 62 | A 63 | 1.00 2.00 3.00 4.00 64 | ===== 65 | result 66 | 4 67 | ===== 68 | ``` 69 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/amin/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/amin/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-1 APIs - `cublasI<t>amin` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `amin` function to find the (smallest) index of the element of the minimum magnitude. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 | 9 | ``` 10 | 11 | See documentation for further details. 12 | 13 | ## Supported SM Architectures 14 | 15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 16 | 17 | ## Supported OSes 18 | 19 | Linux 20 | Windows 21 | 22 | ## Supported CPU Architecture 23 | 24 | x86_64 25 | ppc64le 26 | arm64-sbsa 27 | 28 | ## CUDA APIs involved 29 | - [cublasI\<t\>amin API](https://docs.nvidia.com/cuda/cublas/index.html#cublasi-t-amin) 30 | 31 | # Building (make) 32 | 33 | # Prerequisites 34 | - A Linux/Windows system with recent NVIDIA drivers. 35 | - [CMake](https://cmake.org/download) version 3.18 minimum 36 | 37 | ## Build command on Linux 38 | ``` 39 | $ mkdir build 40 | $ cd build 41 | $ cmake .. 42 | $ make 43 | ``` 44 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 
45 | 46 | ## Build command on Windows 47 | ``` 48 | $ mkdir build 49 | $ cd build 50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 51 | $ Open cublas_examples.sln project in Visual Studio and build 52 | ``` 53 | 54 | # Usage 55 | ``` 56 | $ ./cublas_amin_example 57 | ``` 58 | 59 | Sample example output: 60 | 61 | ``` 62 | A 63 | 1.00 2.00 3.00 4.00 64 | ===== 65 | result 66 | 1 67 | ===== 68 | ``` 69 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/asum/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/asum/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-1 APIs - `cublas<t>asum` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `asum` function to compute the sum of the absolute values of the elements of vector _x_. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 | 9 | ``` 10 | 11 | See documentation for further details. 12 | 13 | ## Supported SM Architectures 14 | 15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 16 | 17 | ## Supported OSes 18 | 19 | Linux 20 | Windows 21 | 22 | ## Supported CPU Architecture 23 | 24 | x86_64 25 | ppc64le 26 | arm64-sbsa 27 | 28 | ## CUDA APIs involved 29 | - [cublas\<t\>asum API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-asum) 30 | 31 | # Building (make) 32 | 33 | # Prerequisites 34 | - A Linux/Windows system with recent NVIDIA drivers. 35 | - [CMake](https://cmake.org/download) version 3.18 minimum 36 | 37 | ## Build command on Linux 38 | ``` 39 | $ mkdir build 40 | $ cd build 41 | $ cmake .. 42 | $ make 43 | ``` 44 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 
45 | 46 | ## Build command on Windows 47 | ``` 48 | $ mkdir build 49 | $ cd build 50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 51 | $ Open cublas_examples.sln project in Visual Studio and build 52 | ``` 53 | 54 | # Usage 55 | ``` 56 | $ ./cublas_asum_example 57 | ``` 58 | 59 | Sample example output: 60 | 61 | ``` 62 | A 63 | 1.00 2.00 3.00 4.00 64 | ===== 65 | result 66 | 10.00 67 | ===== 68 | ``` 69 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/axpy/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/copy/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/copy/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-1 APIs - `cublas<t>copy` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `copy` function to copy the vector _x_ into the vector _y_. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 | 9 | ``` 10 | 11 | See documentation for further details. 12 | 13 | ## Supported SM Architectures 14 | 15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 16 | 17 | ## Supported OSes 18 | 19 | Linux 20 | Windows 21 | 22 | ## Supported CPU Architecture 23 | 24 | x86_64 25 | ppc64le 26 | arm64-sbsa 27 | 28 | ## CUDA APIs involved 29 | - [cublas\<t\>copy API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-copy) 30 | 31 | # Building (make) 32 | 33 | # Prerequisites 34 | - A Linux/Windows system with recent NVIDIA drivers. 35 | - [CMake](https://cmake.org/download) version 3.18 minimum 36 | 37 | ## Build command on Linux 38 | ``` 39 | $ mkdir build 40 | $ cd build 41 | $ cmake .. 
42 | $ make 43 | ``` 44 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 45 | 46 | ## Build command on Windows 47 | ``` 48 | $ mkdir build 49 | $ cd build 50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 51 | $ Open cublas_examples.sln project in Visual Studio and build 52 | ``` 53 | 54 | # Usage 55 | ``` 56 | $ ./cublas_copy_example 57 | ``` 58 | 59 | Sample example output: 60 | 61 | ``` 62 | A 63 | 1.00 2.00 3.00 4.00 64 | ===== 65 | B 66 | 0.00 0.00 0.00 0.00 67 | ===== 68 | B 69 | 1.00 2.00 3.00 4.00 70 | ===== 71 | 72 | ``` 73 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/dot/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/nrm2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/nrm2/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-1 APIs - `cublas<t>nrm2` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `nrm2` function to compute the Euclidean norm of a vector. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 | 9 | ``` 10 | 11 | See documentation for further details. 
12 | 13 | ## Supported SM Architectures 14 | 15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 16 | 17 | ## Supported OSes 18 | 19 | Linux 20 | Windows 21 | 22 | ## Supported CPU Architecture 23 | 24 | x86_64 25 | ppc64le 26 | arm64-sbsa 27 | 28 | ## CUDA APIs involved 29 | - [cublas\<t\>nrm2 API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-nrm2) 30 | 31 | # Building (make) 32 | 33 | # Prerequisites 34 | - A Linux/Windows system with recent NVIDIA drivers. 35 | - [CMake](https://cmake.org/download) version 3.18 minimum 36 | 37 | ## Build command on Linux 38 | ``` 39 | $ mkdir build 40 | $ cd build 41 | $ cmake .. 42 | $ make 43 | ``` 44 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 45 | 46 | ## Build command on Windows 47 | ``` 48 | $ mkdir build 49 | $ cd build 50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 51 | $ Open cublas_examples.sln project in Visual Studio and build 52 | ``` 53 | 54 | # Usage 55 | ``` 56 | $ ./cublas_nrm2_example 57 | ``` 58 | 59 | Sample example output: 60 | 61 | ``` 62 | A 63 | 1.00 2.00 3.00 4.00 64 | ===== 65 | Result 66 | 5.48 67 | ===== 68 | ``` 69 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/rot/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/rotg/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/rotm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/rotmg/.gitignore: 
-------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/scal/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-1/swap/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/gbmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/gbmv/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-2 APIs - `cublas<t>gbmv` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `gbmv` function to compute a banded matrix-vector multiplication. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 9 | | 3.0 | 4.0 | 10 | 11 | x = | 5.0 | 6.0 | 12 | ``` 13 | 14 | See documentation for further details. 15 | 16 | ## Supported SM Architectures 17 | 18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 19 | 20 | ## Supported OSes 21 | 22 | Linux 23 | Windows 24 | 25 | ## Supported CPU Architecture 26 | 27 | x86_64 28 | ppc64le 29 | arm64-sbsa 30 | 31 | ## CUDA APIs involved 32 | - [cublas\<t\>gbmv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gbmv) 33 | 34 | # Building (make) 35 | 36 | # Prerequisites 37 | - A Linux/Windows system with recent NVIDIA drivers. 38 | - [CMake](https://cmake.org/download) version 3.18 minimum 39 | 40 | ## Build command on Linux 41 | ``` 42 | $ mkdir build 43 | $ cd build 44 | $ cmake .. 45 | $ make 46 | ``` 47 | Make sure that CMake finds the expected CUDA Toolkit. 
If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 48 | 49 | ## Build command on Windows 50 | ``` 51 | $ mkdir build 52 | $ cd build 53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 54 | $ Open cublas_examples.sln project in Visual Studio and build 55 | ``` 56 | 57 | # Usage 58 | ``` 59 | $ ./cublas_gbmv_example 60 | ``` 61 | 62 | Sample example output: 63 | 64 | ``` 65 | A 66 | 1.00 2.00 67 | 3.00 4.00 68 | ===== 69 | x 70 | 5.00 6.00 71 | ===== 72 | y 73 | 27.00 24.00 74 | ===== 75 | ``` 76 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/gemv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/gemv/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-2 APIs - `cublas<t>gemv` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `gemv` function to compute a matrix-vector multiplication. 6 | 7 | ``` 8 | A = | 1.0 | 2.0 | 9 | | 3.0 | 4.0 | 10 | 11 | x = | 5.0 | 6.0 | 12 | ``` 13 | 14 | See documentation for further details. 15 | 16 | ## Supported SM Architectures 17 | 18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 19 | 20 | ## Supported OSes 21 | 22 | Linux 23 | Windows 24 | 25 | ## Supported CPU Architecture 26 | 27 | x86_64 28 | ppc64le 29 | arm64-sbsa 30 | 31 | ## CUDA APIs involved 32 | - [cublas\<t\>gemv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gemv) 33 | 34 | # Building (make) 35 | 36 | # Prerequisites 37 | - A Linux/Windows system with recent NVIDIA drivers. 38 | - [CMake](https://cmake.org/download) version 3.18 minimum 39 | 40 | ## Build command on Linux 41 | ``` 42 | $ mkdir build 43 | $ cd build 44 | $ cmake .. 
45 | $ make 46 | ``` 47 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 48 | 49 | ## Build command on Windows 50 | ``` 51 | $ mkdir build 52 | $ cd build 53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 54 | $ Open cublas_examples.sln project in Visual Studio and build 55 | ``` 56 | 57 | # Usage 58 | ``` 59 | $ ./cublas_gemv_example 60 | ``` 61 | 62 | Sample example output: 63 | 64 | ``` 65 | A 66 | 1.00 2.00 67 | 3.00 4.00 68 | ===== 69 | x 70 | 5.00 6.00 71 | ===== 72 | y 73 | 17.00 39.00 74 | ===== 75 | ``` 76 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/ger/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/hbmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/hemv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/her/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/her2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/hpmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- 
/cuBLAS/Level-2/hpr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/hpr2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/sbmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/spmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/spr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/spr2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/symv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/symv/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-2 APIs - `cublas<t>symv` 2 | 3 | ## Description 4 | 5 | This code demonstrates the usage of the cuBLAS `symv` function to compute a symmetric matrix-vector multiplication. 6 | 7 | ``` 8 | A = | 1.0 | 3.0 | 9 | | 3.0 | 4.0 | 10 | 11 | x = | 5.0 | 6.0 | 12 | ``` 13 | 14 | See documentation for further details. 
15 | 16 | ## Supported SM Architectures 17 | 18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 19 | 20 | ## Supported OSes 21 | 22 | Linux 23 | Windows 24 | 25 | ## Supported CPU Architecture 26 | 27 | x86_64 28 | ppc64le 29 | arm64-sbsa 30 | 31 | ## CUDA APIs involved 32 | - [cublas\<t\>symv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-symv) 33 | 34 | # Building (make) 35 | 36 | # Prerequisites 37 | - A Linux/Windows system with recent NVIDIA drivers. 38 | - [CMake](https://cmake.org/download) version 3.18 minimum 39 | 40 | ## Build command on Linux 41 | ``` 42 | $ mkdir build 43 | $ cd build 44 | $ cmake .. 45 | $ make 46 | ``` 47 | Make sure that CMake finds the expected CUDA Toolkit. If that is not the case, you can add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command. 48 | 49 | ## Build command on Windows 50 | ``` 51 | $ mkdir build 52 | $ cd build 53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 54 | $ Open cublas_examples.sln project in Visual Studio and build 55 | ``` 56 | 57 | # Usage 58 | ``` 59 | $ ./cublas_symv_example 60 | ``` 61 | 62 | Sample example output: 63 | 64 | ``` 65 | A 66 | 1.00 2.00 67 | 3.00 4.00 68 | ===== 69 | x 70 | 5.00 6.00 71 | ===== 72 | y 73 | 23.00 39.00 74 | ===== 75 | ``` 76 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/syr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/syr2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/tbmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 
-------------------------------------------------------------------------------- /cuBLAS/Level-2/tbsv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/tpmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/tpsv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/trmv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-2/trsv/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/gemm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/gemm3m/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/gemmBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/gemmGroupedBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- 
/cuBLAS/Level-3/gemmStridedBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/hemm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/her2k/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/herk/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/herkx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/symm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/syr2k/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/syrk/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/syrk/README.md: -------------------------------------------------------------------------------- 1 | # cuBLAS Level-3 APIs - `cublas<t>syrk` 2 | 3 | ## Description 4 | 5 | This code demonstrates a usage of cuBLAS `syrk` function to compute a symmetric rank-k update 6 | 7 | ``` 8 | A = | 
1.0 | 3.0 | 9 | | 3.0 | 4.0 | 10 | 11 | B = | 5.0 | 7.0 | 12 | | 7.0 | 8.0 | 13 | ``` 14 | 15 | See documentation for further details. 16 | 17 | ## Supported SM Architectures 18 | 19 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 20 | 21 | ## Supported OSes 22 | 23 | Linux 24 | Windows 25 | 26 | ## Supported CPU Architecture 27 | 28 | x86_64 29 | ppc64le 30 | arm64-sbsa 31 | 32 | ## CUDA APIs involved 33 | - [cublas\<t\>syrk API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-syrk) 34 | 35 | # Building (make) 36 | 37 | # Prerequisites 38 | - A Linux/Windows system with recent NVIDIA drivers. 39 | - [CMake](https://cmake.org/download) version 3.18 minimum 40 | 41 | ## Build command on Linux 42 | ``` 43 | $ mkdir build 44 | $ cd build 45 | $ cmake .. 46 | $ make 47 | ``` 48 | Make sure that CMake finds expected CUDA Toolkit. If that is not the case you can add argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to cmake command. 49 | 50 | ## Build command on Windows 51 | ``` 52 | $ mkdir build 53 | $ cd build 54 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. 
55 | $ Open cublas_examples.sln project in Visual Studio and build 56 | ``` 57 | 58 | # Usage 59 | ``` 60 | $ ./cublas_syrk_example 61 | ``` 62 | 63 | Sample example output: 64 | 65 | ``` 66 | A 67 | 1.00 3.00 68 | 3.00 4.00 69 | ===== 70 | C 71 | 10.00 15.00 72 | 0.00 25.00 73 | ===== 74 | ``` 75 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/syrkx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/trmm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/trsm/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLAS/Level-3/trsmBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuBLASLt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(cublasLtSamples) 14 | 15 | add_subdirectory(LtSgemm) 16 | add_subdirectory(LtFp8Matmul) 17 | add_subdirectory(LtBlk128x128Fp8Matmul) 18 | add_subdirectory(LtMxfp8Matmul) 19 | add_subdirectory(LtNvfp4Matmul) 20 | add_subdirectory(LtDgemmPresetAlgo) 21 | add_subdirectory(LtIgemmTensor) 22 | add_subdirectory(LtHSHgemmStridedBatchSimple) 23 | add_subdirectory(LtHSHgemmPointerArrayBatchSimple) 24 | add_subdirectory(LtSgemmCustomFind) 25 | add_subdirectory(LtFp8CustomFind) 26 | add_subdirectory(LtPlanarComplex) 27 | add_subdirectory(LtSgemmSimpleAutoTuning) -------------------------------------------------------------------------------- /cuBLASLt/LtBlk128x128Fp8Matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtBlk128x128Fp8Matmul LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtDgemmPresetAlgo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtDgemmPresetAlgo LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtFp8CustomFind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtFp8CustomFind LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ../Common/helpers.cpp 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtFp8Matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtFp8Matmul LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtHSHgemmPointerArrayBatchSimple/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtHSHgemmPointerArrayBatchSimple LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtHSHgemmStridedBatchSimple/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtHSHgemmStridedBatchSimple LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtIgemmTensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtIgemmTensor LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtMxfp8Matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtMxfp8Matmul LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtNvfp4Matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtNvfp4Matmul LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtPlanarComplex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtPlanarComplex LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtSgemm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtSgemm LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtSgemmCustomFind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtSgemmCustomFind LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ../Common/helpers.cpp 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuBLASLt/LtSgemmSimpleAutoTuning/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.10.0) 12 | 13 | project(sample_cublasLt_LtSgemmSimpleAutoTuning LANGUAGES CXX CUDA) 14 | 15 | set(CMAKE_CXX_STANDARD 11) 16 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 17 | 18 | add_executable(${PROJECT_NAME} 19 | main.cpp 20 | ${PROJECT_NAME}.cu 21 | ) 22 | 23 | target_include_directories(${PROJECT_NAME} PRIVATE 24 | ${CMAKE_CURRENT_SOURCE_DIR}/../Common 25 | ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 26 | ) 27 | 28 | set(CUDART_LIBRARY cudart) 29 | set(CUBLASLT_LIBRARY cublasLt) 30 | 31 | if(NOT WIN32) 32 | find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 33 | find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 34 | endif() 35 | 36 | target_link_libraries(${PROJECT_NAME} 37 | ${CUDART_LIBRARY} 38 | ${CUBLASLT_LIBRARY} 39 | ) -------------------------------------------------------------------------------- /cuFFT/1d_c2c/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/1d_c2c/README.md: -------------------------------------------------------------------------------- 1 | # cuFFT 1D FFT C2C example 2 | 3 | ## Description 4 | 5 | In this example a one-dimensional complex-to-complex transform is applied to the input data. Afterwards an inverse transform is performed on the computed frequency domain representation. 6 | 7 | ## Supported SM Architectures 8 | 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 10 | 11 | ## Supported OSes 12 | 13 | Linux 14 | Windows 15 | 16 | ## Supported CPU Architecture 17 | 18 | x86_64 19 | ppc64le 20 | arm64-sbsa 21 | 22 | ## CUDA APIs involved 23 | - [cufftExecC2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2c-cufftexecz2z) 24 | 25 | 26 | # Building (make) 27 | 28 | # Prerequisites 29 | - A Linux/Windows system with recent NVIDIA drivers. 
30 | - [CMake](https://cmake.org/download) version 3.18 minimum 31 | 32 | ## Build command on Linux 33 | ``` 34 | $ mkdir build 35 | $ cd build 36 | $ cmake .. 37 | $ make 38 | ``` 39 | Make sure that CMake finds expected CUDA Toolkit. If that is not the case you can add argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to cmake command. 40 | 41 | # Usage 1 42 | ``` 43 | $ ./bin/1d_c2c_example 44 | ``` 45 | -------------------------------------------------------------------------------- /cuFFT/1d_mgpu_c2c/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/1d_r2c_c2r/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/1d_r2c_c2r/README.md: -------------------------------------------------------------------------------- 1 | # cuFFT 1D FFT R2C/C2R example 2 | 3 | ## Description 4 | 5 | In this example a one-dimensional real-to-complex transform is applied to the input data. 6 | 7 | ## Supported SM Architectures 8 | 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 10 | 11 | ## Supported OSes 12 | 13 | Linux 14 | Windows 15 | 16 | ## Supported CPU Architecture 17 | 18 | x86_64 19 | ppc64le 20 | arm64-sbsa 21 | 22 | ## CUDA APIs involved 23 | - [cufftExecR2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecr2c-cufftexecd2z) 24 | 25 | # Building (make) 26 | 27 | # Prerequisites 28 | - A Linux/Windows system with recent NVIDIA drivers. 29 | - [CMake](https://cmake.org/download) version 3.18 minimum 30 | 31 | ## Build command on Linux 32 | ``` 33 | $ mkdir build 34 | $ cd build 35 | $ cmake .. 36 | $ make 37 | ``` 38 | Make sure that CMake finds expected CUDA Toolkit. 
If that is not the case you can add argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to cmake command. 39 | 40 | # Usage 1 41 | ``` 42 | $ ./bin/1d_r2c_c2r_example 43 | ``` 44 | -------------------------------------------------------------------------------- /cuFFT/2d_c2r_r2c/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/2d_c2r_r2c/README.md: -------------------------------------------------------------------------------- 1 | # cuFFT 2D FFT C2R/R2C example 2 | 3 | ## Description 4 | 5 | In this example a two-dimensional complex-to-real transform is applied to the input data arranged according to the requirements of the default FFTW padding mode. 6 | 7 | ## Supported SM Architectures 8 | 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 10 | 11 | ## Supported OSes 12 | 13 | Linux 14 | Windows 15 | 16 | ## Supported CPU Architecture 17 | 18 | x86_64 19 | ppc64le 20 | arm64-sbsa 21 | 22 | ## CUDA APIs involved 23 | - [cufftExecC2R API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2r-cufftexecz2d) 24 | 25 | # Building (make) 26 | 27 | # Prerequisites 28 | - A Linux/Windows system with recent NVIDIA drivers. 29 | - [CMake](https://cmake.org/download) version 3.18 minimum 30 | 31 | ## Build command on Linux 32 | ``` 33 | $ mkdir build 34 | $ cd build 35 | $ cmake .. 36 | $ make 37 | ``` 38 | Make sure that CMake finds expected CUDA Toolkit. 
39 | 40 | # Usage 1 41 | ``` 42 | $ ./bin/2d_c2r_r2c_example 43 | ``` 44 | -------------------------------------------------------------------------------- /cuFFT/3d_c2c/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/3d_c2c/README.md: -------------------------------------------------------------------------------- 1 | # cuFFT 3D FFT C2C example 2 | 3 | ## Description 4 | 5 | In this example a three-dimensional complex-to-complex transform is applied to the input data. Afterwards an inverse transform is performed on the computed frequency domain representation. 6 | 7 | ## Supported SM Architectures 8 | 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus) 10 | 11 | ## Supported OSes 12 | 13 | Linux 14 | Windows 15 | 16 | ## Supported CPU Architecture 17 | 18 | x86_64 19 | ppc64le 20 | arm64-sbsa 21 | 22 | ## CUDA APIs involved 23 | - [cufftExecC2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2c-cufftexecz2z) 24 | 25 | 26 | # Building (make) 27 | 28 | # Prerequisites 29 | - A Linux/Windows system with recent NVIDIA drivers. 30 | - [CMake](https://cmake.org/download) version 3.18 minimum 31 | 32 | ## Build command on Linux 33 | ``` 34 | $ mkdir build 35 | $ cd build 36 | $ cmake .. 37 | $ make 38 | ``` 39 | Make sure that CMake finds expected CUDA Toolkit. If that is not the case you can add argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to cmake command. 
40 | 41 | # Usage 1 42 | ``` 43 | $ ./bin/3d_c2c_example 44 | ``` 45 | -------------------------------------------------------------------------------- /cuFFT/3d_mgpu_c2c/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFT/3d_mgpu_r2c_c2r/.gitignore: -------------------------------------------------------------------------------- 1 | build/ -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libattachcommWrapper.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libattachcommWrapper.a -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libnvhpcwrapcufftxt.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libnvhpcwrapcufftxt.a -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/c2c/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c 4 | exe_acc = cufftmp_c2c_acc 5 | 6 | all : $(exe) $(exe_acc) 7 | 8 | $(exe) : $(exe).o 9 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 10 | 11 | $(exe_acc) : $(exe_acc).o 12 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) -acc 13 | 14 | %.o : %.f90 15 | $(f90) -c $< -o $@ $(FLAGS) 16 | 17 | %_acc.o : %.f90 18 | $(f90) -c $< -o $@ $(FLAGS) -DACC -acc 19 | 20 | .PHONY: clean 21 | 22 | clean: 23 | @rm -rf *.mod *.o $(exe) $(exe_acc) 24 | 25 | run: $(exe) $(exe_acc) 26 | 
LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 27 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe_acc) 28 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/c2c_no_descriptors/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c_no_descriptors 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 20 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/c2c_pencils/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c_pencils 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 4 $(exe) 20 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/common.mk: -------------------------------------------------------------------------------- 1 | NVSHMEM_LIB ?= ../../cufft/lib 2 | CUFFT_LIB ?= ../../cufft/lib 3 | CUFFT_INC ?= ../../cufft/include 4 | 5 | f90 := mpif90 6 | 7 | WRAPPERS_DIR = ../Fortran_wrappers_nvhpc 8 | FLAGS = -O3 -Mfree -fast -Mextend -Mpreprocess -Minform=warn 9 | FLAGS += -I./ -I${WRAPPERS_DIR}/ -I${CUFFT_INC}/ 10 | # Add flags -gpu to build for specific architecture. 
E.g., -gpu=cc70,cc80,cc90 11 | # See https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#compute-capability 12 | # Also see https://docs.nvidia.com/cuda/cufftmp/usage/requirements.html for supported architectures 13 | FLAGS += -Minfo=accel -cuda -cudalib=cufftmp 14 | LINKER := -L$(HPCSDK_ROOT)/compilers/lib -lnvhpcwrapcufft -lnvhpcwrapcufftmp -L${NVSHMEM_LIB} -lnvshmem_host -lnvshmem_device -L${CUFFT_LIB} 15 | 16 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/r2c_c2r/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c 4 | exe_acc = cufftmp_r2c_acc 5 | 6 | all : $(exe) $(exe_acc) 7 | 8 | $(exe) : $(exe).o 9 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 10 | 11 | $(exe_acc) : $(exe_acc).o 12 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) -acc 13 | 14 | %.o : %.f90 15 | $(f90) -c $< -o $@ $(FLAGS) 16 | 17 | %_acc.o : %.f90 18 | $(f90) -c $< -o $@ $(FLAGS) -DACC -acc 19 | 20 | .PHONY: clean 21 | 22 | clean: 23 | @rm -rf *.mod *.o $(exe) $(exe_acc) 24 | 25 | run: $(exe) $(exe_acc) 26 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 27 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe_acc) 28 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/r2c_c2r_no_descriptors/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_no_descriptors 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 20 | 
-------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/r2c_c2r_pencils/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_pencils 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 4 $(exe) 20 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/r2c_c2r_shared_scratch/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_workarea 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) -DSHARED_WORKAREA 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 20 | 21 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/r2c_c2r_shared_scratch/README.md: -------------------------------------------------------------------------------- 1 | # Fortran R2C_C2R Sample with workarea sharing 2 | 3 | This sample shows how to compute a distributed R2C-C2R transform using shared scratch workarea between the two plans. 4 | It is otherwise identical to the other, simpler R2C-C2R sample. 5 | 6 | Requirement: 7 | - HPC SDK 21.9 and up 8 | - `mpif90` and `mpicc` should be in your `$PATH` 9 | 10 | To build and run: 11 | ``` 12 | export CUFFT_LIB=/path/to/cufftMp/lib/ 13 | export CUFFT_INC=/path/to/cufftMp/include/ 14 | 15 | cd r2c_c2r_shared_scratch 16 | make run 17 | [...] 
 18 | Hello from rank 0 gpu id 0 size 2 19 | Hello from rank 1 gpu id 1 size 2 20 | local_rshape : 258 256 128 21 | local_permuted_cshape : 129 128 256 22 | shape of u is 258 256 128 23 | shape of u_permuted is 129 128 256 24 | [...] 25 | after C2R 0 max_norm is 1.00000000 max_diff is 0.00000107 26 | Relative Linf on rank 0 is 0.00000107 27 | after C2R 1 max_norm is 1.00000000 max_diff is 0.00000107 28 | Relative Linf on rank 1 is 0.00000107 29 | >>>> PASSED on rank 0 30 | >>>> PASSED on rank 1 31 | ``` 32 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/reshape/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_reshape 4 | 5 | all : $(exe) 6 | 7 | $(exe) : $(exe).o 8 | $(f90) -o $@ $+ $(FLAGS) $(LINKER) 9 | 10 | %.o : %.f90 11 | $(f90) -c $< -o $@ $(FLAGS) 12 | 13 | .PHONY: clean 14 | 15 | clean: 16 | @rm -rf *.mod *.o $(exe) 17 | 18 | run: $(exe) 19 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe) 20 | -------------------------------------------------------------------------------- /cuFFTMp/Fortran_samples/reshape/README.md: -------------------------------------------------------------------------------- 1 | # Fortran Standalone Reshape Sample 2 | 3 | This sample shows how to use the reshape API to re-distribute data across GPUs. 
4 | 5 | Requirement: 6 | - HPC SDK 21.9 and up 7 | - `mpif90` and `mpicc` should be in your `$PATH` 8 | 9 | To build and run: 10 | ``` 11 | export CUFFT_LIB=/path/to/cufftMp/lib/ 12 | export CUFFT_INC=/path/to/cufftMp/include/ 13 | cd reshape 14 | make run 15 | 16 | Hello from rank 0 gpu id 0 size 2 17 | Input data on rank 0: 0 1 4 5 8 9 12 13 18 | Expected output data on rank 0: 0 1 2 3 4 5 6 7 19 | Hello from rank 1 gpu id 1 size 2 20 | Input data on rank 1: 2 3 6 7 10 11 14 15 21 | Expected output data on rank 1: 8 9 10 11 12 13 14 15 22 | Output data on rank 0: 0 1 2 3 4 5 6 7 23 | Output data on rank 1: 8 9 10 11 12 13 14 15 24 | >>>> PASSED on rank 1 25 | >>>> PASSED on rank 0 26 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/Dockerfile: -------------------------------------------------------------------------------- 1 | # We use JAX-toolbox from https://github.com/NVIDIA/JAX-Toolbox 2 | FROM ghcr.io/nvidia/jax:jax-2024-10-24 3 | RUN apt-get update && apt-get install openmpi-bin -y 4 | 5 | COPY . /fft_jax 6 | RUN rm -rf /fft_jax/build 7 | RUN pip install -e /fft_jax 8 | 9 | ENV LD_LIBRARY_PATH=/fft_jax/nvshmem/lib:/fft_jax/cufftmp/lib:$LD_LIBRARY_PATH 10 | 11 | ENV NVSHMEM_DISABLE_NCCL=1 12 | ENV NVSHMEM_DISABLE_GDRCOPY=1 13 | ENV NVSHMEM_BOOTSTRAP=MPI 14 | 15 | # Infiniband service level is beneficial for performance for large FFTs on many GPUs. 16 | # see *Note* in https://docs.nvidia.com/hpc-sdk/cufftmp/usage/performances.html#performance-considerations 17 | # The IB service level for both NVSHMEM (for cuFFTMp) and NCCL (for JAX FFT) are declared here. 
18 | ENV NVSHMEM_IB_SL=1 19 | ENV NCCL_IB_SL=1 20 | 21 | ENV OMPI_ALLOW_RUN_AS_ROOT=1 22 | ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/misc/strong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/strong.png -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/misc/strong_eos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/strong_eos.png -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/misc/weak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/weak.png -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4", "pybind11>=2.6", "cmake"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(cufftmp_jax LANGUAGES CXX CUDA) 3 | 4 | find_package(Python COMPONENTS Interpreter Development REQUIRED) 5 | find_package(pybind11 CONFIG REQUIRED) 6 | 7 | include_directories(${CMAKE_CURRENT_LIST_DIR}/src) 8 | 9 | message(STATUS "Using ${NVSHMEM_HOME} for 
NVSHMEM_HOME and ${CUFFTMP_HOME} for CUFFTMP_HOME") 10 | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUFFTMP_HOME}/include ${NVSHMEM_HOME}/include) 11 | link_directories(${CUFFTMP_HOME}/lib ${NVSHMEM_HOME}/lib) 12 | 13 | pybind11_add_module(gpu_ops 14 | ${CMAKE_CURRENT_LIST_DIR}/src/kernels.cu 15 | ${CMAKE_CURRENT_LIST_DIR}/src/gpu_ops.cpp 16 | ) 17 | 18 | target_link_libraries(gpu_ops 19 | PRIVATE 20 | cufftMp 21 | nvshmem_host 22 | nvshmem_device 23 | ) 24 | 25 | set_target_properties(gpu_ops 26 | PROPERTIES 27 | CUDA_STANDARD 17 28 | CUDA_RESOLVE_DEVICE_SYMBOLS ON 29 | POSITION_INDEPENDENT_CODE ON 30 | CUDA_SEPARABLE_COMPILATION ON 31 | ) 32 | 33 | install(TARGETS gpu_ops DESTINATION cufftmp_jax) 34 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/NOTICE: -------------------------------------------------------------------------------- 1 | The code in this directory was adapted from https://github.com/dfm/extending-jax 2 | by Dan Foreman-Mackey and published under the MIT license as stated below. 3 | 4 | -------------------------------------------------------------------------------- 5 | 6 | MIT License 7 | 8 | Copyright (c) 2021 Dan Foreman-Mackey 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 
19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .cufftmp_jax import cufftmp 4 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/src/gpu_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "kernels.h" 2 | #include "pybind11_kernel_helpers.h" 3 | 4 | using namespace cufftmp_jax; 5 | 6 | /** 7 | * Boilerplate used to 8 | * (1) Expose the gpu_cufftmp function to Python (to launch our custom op) 9 | * (2) Expose the cufftmpDescriptor (to pass parameters from Python to C++) 10 | */ 11 | 12 | namespace { 13 | 14 | pybind11::dict Registrations() { 15 | pybind11::dict dict; 16 | dict["gpu_cufftmp"] = EncapsulateFunction(gpu_cufftmp); 17 | return dict; 18 | } 19 | 20 | PYBIND11_MODULE(gpu_ops, m) { 21 | m.def("registrations", &Registrations); 22 | m.def("build_cufftmp_descriptor", 23 | [](std::int64_t x, std::int64_t y, std::int64_t z, int dist, int dir) { 24 | return PackDescriptor(cufftmpDescriptor{x, y, z, dist, dir}); 25 | } 26 | ); 27 | } 28 | 29 | } // namespace 30 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/src/kernel_helpers.h: 
-------------------------------------------------------------------------------- 1 | #ifndef _CUFFTMP_JAX_KERNEL_HELPERS_H_ 2 | #define _CUFFTMP_JAX_KERNEL_HELPERS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /** 10 | * Boilerplate to copy descriptors from Python to C++ 11 | */ 12 | 13 | namespace cufftmp_jax { 14 | 15 | // See https://en.cppreference.com/w/cpp/numeric/bit_cast 16 | template 17 | typename std::enable_if::value && 19 | std::is_trivially_copyable::value, 20 | To>::type 21 | bit_cast(const From& src) noexcept { 22 | static_assert( 23 | std::is_trivially_constructible::value, 24 | "This implementation additionally requires destination type to be trivially constructible" 25 | ); 26 | 27 | To dst; 28 | memcpy(&dst, &src, sizeof(To)); 29 | return dst; 30 | } 31 | 32 | template 33 | std::string PackDescriptorAsString(const T& descriptor) { 34 | return std::string(bit_cast(&descriptor), sizeof(T)); 35 | } 36 | 37 | template 38 | const T* UnpackDescriptor(const char* opaque, std::size_t opaque_len) { 39 | if (opaque_len != sizeof(T)) { 40 | throw std::runtime_error("Invalid opaque object size with opaque_len = " + std::to_string(opaque_len) + ", and sizeof(T) = " + std::to_string(sizeof(T))); 41 | } 42 | return bit_cast(opaque); 43 | } 44 | 45 | } // namespace cufftmp_jax 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/src/kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUFFTMP_JAX_KERNELS_H_ 2 | #define _CUFFTMP_JAX_KERNELS_H_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace cufftmp_jax { 10 | 11 | /** 12 | * Description of an FFT 13 | * - global_x, global_y, global_z are the global size of the tensor to transform 14 | * - distribution is 0 for a CUFFT_XT_FORMAT_INPLACE (== Slabs_X) and 15 | * 1 for a CUFFT_XT_FORMAT_INPLACE_SHUFFLED (== Slabs_Y) data distribution 16 | * - 
direction is 0 for a CUFFT_FORWARD transform, 1 for CUFFT_INVERSE 17 | */ 18 | 19 | struct cufftmpDescriptor { 20 | std::int64_t global_x; 21 | std::int64_t global_y; 22 | std::int64_t global_z; 23 | int distribution; 24 | int direction; 25 | }; 26 | 27 | /** 28 | * Generic signature for a custom op with CUDA 29 | */ 30 | void gpu_cufftmp(cudaStream_t stream, void** buffers, const char* opaque, std::size_t opaque_len); 31 | 32 | } // namespace cufftmp_jax 33 | 34 | #endif -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/cufftmp_jax/src/pybind11_kernel_helpers.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUFFTMP_JAX_PYBIND11_KERNEL_HELPERS_H_ 2 | #define _CUFFTMP_JAX_PYBIND11_KERNEL_HELPERS_H_ 3 | 4 | #include 5 | 6 | #include "kernel_helpers.h" 7 | 8 | /** 9 | * pybind11 boilerplate 10 | */ 11 | 12 | namespace cufftmp_jax { 13 | 14 | template 15 | pybind11::bytes PackDescriptor(const T& descriptor) { 16 | return pybind11::bytes(PackDescriptorAsString(descriptor)); 17 | } 18 | 19 | template 20 | pybind11::capsule EncapsulateFunction(T* fn) { 21 | return pybind11::capsule(bit_cast(fn), "xla._CUSTOM_CALL_TARGET"); 22 | } 23 | 24 | } // namespace cufftmp_jax 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/fft_common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .utils import Dist, Dir 4 | -------------------------------------------------------------------------------- /cuFFTMp/JAX_FFT/src/xfft/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .xfft import xfft 4 | -------------------------------------------------------------------------------- /cuFFTMp/extra_bootstraps/Makefile: 
-------------------------------------------------------------------------------- 1 | DEST ?= "myMPI" 2 | MPICC ?= mpicc 3 | NVSHMEM_VER ?= 2.8.0 4 | NVSHMEM_VER_RC ?= 2.8.0-3 5 | 6 | all: mpi_bootstrap 7 | 8 | ${DEST}/nvshmem_bootstrap_mpi.so: 9 | mkdir -p ${DEST} 10 | wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VER}/source/nvshmem_src_${NVSHMEM_VER_RC}.txz 11 | tar Jxvf nvshmem_src_${NVSHMEM_VER_RC}.txz 12 | make -C nvshmem_src_${NVSHMEM_VER_RC} $(shell pwd)/nvshmem_src_${NVSHMEM_VER_RC}/build/lib/nvshmem_bootstrap_mpi.so.${NVSHMEM_VER} 13 | echo "Bootstrap built!" 14 | mv $(shell pwd)/nvshmem_src_${NVSHMEM_VER_RC}/build/lib/nvshmem_bootstrap_mpi* ${DEST} 15 | ldd ${DEST}/nvshmem_bootstrap_mpi.so 16 | rm -rf ./nvshmem_src_${NVSHMEM_VER_RC}.txz ./nvshmem_src_${NVSHMEM_VER_RC} 17 | 18 | mpi_bootstrap: ${DEST}/nvshmem_bootstrap_mpi.so 19 | 20 | clean: 21 | rm -rf ./nvshmem_src_${NVSHMEM_VER_RC}.txz ./nvshmem_src_${NVSHMEM_VER_RC} ./${DEST} 22 | -------------------------------------------------------------------------------- /cuFFTMp/extra_bootstraps/README.md: -------------------------------------------------------------------------------- 1 | # Building bootstraps for other versions of MPI 2 | 3 | cuFFTMp uses NVSHMEM. In order to interoperate with MPI, a bootstrap plugin is required. NVSHMEM ships with a bootstrap compatible with HPC-X. 4 | However, you can easily build your own bootstrap, compatible with another MPI implementation. To do so, 5 | ``` 6 | MPI_HOME=/path/to/mpi/home/ CUDA_HOME=/path/to/cuda/home DEST=myMPI make mpi_bootstrap 7 | ``` 8 | will download NVSHMEM, build the bootstrap library and place it in the `myMPI` folder. 9 | 10 | After this, you can run any sample by 11 | ``` 12 | MPI_HOME=/path/to/my/mpi/ NVSHMEM_LIB="../../extra_bootstraps/myMPI" make run 13 | ``` 14 | which effectively places `myMPI` in your `LD_LIBRARY_PATH`. 
15 | -------------------------------------------------------------------------------- /cuFFTMp/samples/c2c/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/c2c_no_descriptors/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c_no_descriptors 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/c2c_pencils/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_c2c_pencils 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu ../iterators/box_iterator.hpp 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/c2c_pencils/README.md: 
-------------------------------------------------------------------------------- 1 | # C2C using a custom user distribution (pencils) 2 | ## Sample description 3 | This sample is similar to [samples/c2c](../c2c/README.md), where it performs 4 | - C2C forward transform 5 | - [Scaling/normalization](../common/README.md) 6 | - C2C backward transform. 7 | 8 | But this sample assumes a pencil decomposition layout: 9 | - the input data is distributed using a pencil decomposition in X and Y, along Z; 10 | - the output data is distributed using a pencil decomposition in X and Z, along Y. 11 | 12 | This is achieved using a custom user-defined distribution and `cufftXtSetDistribution`. 13 | 14 | ## Build and run 15 | This example requires 4 GPUs. 16 | 17 | See [Requirements](../../README.md) and [Quick start for C++ samples](../../README.md) for hardware/software requirements and build instructions. 18 | 19 | Example code snippet: 20 | ``` 21 | $ MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi/ make run 22 | Hello from rank 3/4 using GPU 3 23 | Hello from rank 1/4 using GPU 1 24 | Hello from rank 0/4 using GPU 0 25 | Hello from rank 2/4 using GPU 2 26 | input data, global 3D index [2,0,0], local index 0, rank 2 is (-0.12801,-0.629836) 27 | input data, global 3D index [2,0,1], local index 1, rank 2 is (-0.948148,0.863082) 28 | [...] 29 | output, global 3D index [0,0,2], local index 0, rank 1 is (-8.45704,12.8481) 30 | output, global 3D index [0,0,3], local index 1, rank 1 is (3.18903,28.6322) 31 | [...] 
32 | ``` -------------------------------------------------------------------------------- /cuFFTMp/samples/common.mk: -------------------------------------------------------------------------------- 1 | MPI_HOME ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/mpi 2 | NVSHMEM_LIB ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/nvshmem/lib 3 | NVSHMEM_INC ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/nvshmem/include 4 | CUDA_HOME ?= $(shell dirname $$(command -v nvcc))/.. 5 | CUFFT_LIB ?= ../../cufft/lib/ 6 | CUFFT_INC ?= ../../cufft/include/ 7 | ARCH ?= $(shell uname -m) 8 | ifeq ($(ARCH), ppc64le) 9 | MPI ?= mpi_ibm 10 | else 11 | MPI ?= mpi 12 | endif 13 | # Also see https://docs.nvidia.com/cuda/cufftmp/usage/requirements.html for supported architectures 14 | CXXFLAGS = -std=c++17 --generate-code arch=compute_70,code=sm_70 --generate-code arch=compute_75,code=sm_75 --generate-code arch=compute_80,code=sm_80 --generate-code arch=compute_86,code=sm_86 --generate-code arch=compute_89,code=sm_89 --generate-code arch=compute_90,code=sm_90 --generate-code arch=compute_100,code=sm_100 15 | INCFLAGS = -I${CUFFT_INC} -I${NVSHMEM_INC} -I${MPI_HOME}/include 16 | LDFLAGS = -lcuda -L${CUFFT_LIB} -L${NVSHMEM_LIB} -lcufftMp -lnvshmem_device -lnvshmem_host -L${MPI_HOME}/lib -l${MPI} 17 | -------------------------------------------------------------------------------- /cuFFTMp/samples/common/README.md: -------------------------------------------------------------------------------- 1 | # Auxiliary functions for samples 2 | 3 | This folder contains a few auxiliary functions for various samples. 4 | 5 | ## error_checks.hpp 6 | - error_check: Compute the global L2 norm between reference and test values by using `BoxIterator`. 7 | - assess_error: Assess the error based on some tolerance (default: `tolerance = 1e-6`). This also produces a print statement on the MPI rank 0. 
8 | 9 | ## generate_random.hpp 10 | - Two generate_random functions that generate real or complex values in a `std::vector` 11 | 12 | ## scaling.cuh 13 | - scaling_kernel: Normalize entries in the box with a constant scaling factor using `BoxIterator`. By default, entries corresponding to the first 10 threads are printed for illustration. This kernel serves as an example of intermediate operations that can be done between two Fourier transforms. -------------------------------------------------------------------------------- /cuFFTMp/samples/common/generate_random.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | void generate_random(std::vector>& data, int seed) { 6 | std::mt19937 gen(seed); 7 | std::uniform_real_distribution dist(-1, 1); 8 | for(auto& v: data) { 9 | float r = dist(gen); 10 | float i = dist(gen); 11 | v = {r, i}; 12 | } 13 | } 14 | 15 | void generate_random(std::vector& data, int seed) { 16 | std::mt19937 gen(seed); 17 | std::uniform_real_distribution dist(-1, 1); 18 | for(auto& v: data) { 19 | v = dist(gen); 20 | } 21 | } -------------------------------------------------------------------------------- /cuFFTMp/samples/common/scaling.cuh: -------------------------------------------------------------------------------- 1 | #include "../iterators/box_iterator.hpp" 2 | 3 | __global__ 4 | void scaling_kernel(BoxIterator begin, BoxIterator end, int rank, int size, size_t nx, size_t ny, size_t nz, bool printing = true) { 5 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 6 | begin += tid; 7 | if(begin < end) { 8 | // begin.x(), begin.y() and begin.z() are the global 3D coordinate of the data pointed by the iterator 9 | // begin->x and begin->y are the real and imaginary part of the corresponding cufftComplex element 10 | if(tid < 10 && printing) { 11 | printf("GPU data (after first transform): global 3D index [%d %d %d], local index %d, rank %d is (%f,%f)\n", 12 | 
(int)begin.x(), (int)begin.y(), (int)begin.z(), (int)begin.i(), rank, begin->x, begin->y); 13 | } 14 | *begin = {begin->x / (float)(nx * ny * nz), begin->y / (float)(nx * ny * nz)}; 15 | } 16 | }; -------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r_no_descriptors/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_no_descriptors 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r_pencils/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_pencils 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe) 19 | 
-------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r_pencils_cufftMpMakePlan/README.md: -------------------------------------------------------------------------------- 1 | # R2C_C2R Sample using a custom user distribution (pencils) 2 | ## Sample description 3 | This sample is similar to [samples/r2c_c2r_pencils](../r2c_c2r_pencils/README.md), where it performs 4 | - R2C forward transform 5 | - [Scaling/normalization](../common/README.md) 6 | - C2R backward transform. 7 | 8 | But this sample uses the new API `cufftMpMakePlanDecomposition` where the box coordinates for data decomposition across ranks/PEs as well as the communicator are passed directly to the planning function. 9 | 10 | ## Build and run 11 | This example requires 4 GPUs. 12 | 13 | See [Requirements](../../README.md) and [Quick start for C++ samples](../../README.md) for hardware/software requirements and build instructions. 14 | 15 | Example code snippet: 16 | ``` 17 | $ MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi/ make run 18 | Hello from rank 1/4 using GPU 1 19 | Hello from rank 3/4 using GPU 3 20 | Hello from rank 2/4 using GPU 2 21 | Hello from rank 0/4 using GPU 0 22 | Input data, global 3D index [0,2,0], local index 0, rank 1 is -0.165956 23 | [...] 24 | GPU data (after first transform): global 3D index [0 4 3], local index 9, rank 1 is (0.412567,-9.293055) 25 | PASSED with L2 error 1.156259e-07 < 1.000000e-06 26 | [...] 
27 | ``` 28 | -------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r_shared_scratch/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_shared_scratch 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/r2c_c2r_slabs_GROMACS/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_r2c_c2r_slabs_GROMACS 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe) 19 | -------------------------------------------------------------------------------- /cuFFTMp/samples/reshape/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | exe = cufftmp_reshape 4 | 5 | all: $(exe) 6 | 7 | .PHONY: clean 8 | 9 | clean: 10 | rm -rf $(exe) 11 | 12 | $(exe): $(exe).cu 13 | ${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS} 14 | 15 | build: $(exe) 16 | 17 | run: $(exe) 18 | LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 19 | -------------------------------------------------------------------------------- /cuPQC/Makefile: -------------------------------------------------------------------------------- 1 
| NVCC=nvcc 2 | NVCC_FLAGS=-dlto -arch=native -std=c++17 -O3 3 | 4 | 5 | CUPQC_DIR?=/usr/lib/nvidia/cupqc-pkg-0.3.0/ 6 | CUPQC_INCLUDE_DIR=$(CUPQC_DIR)/include/cupqc 7 | COMMONDX_INCLUDE_DIR=$(CUPQC_DIR)/include/ 8 | #/commondx 9 | 10 | CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`) 11 | CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include 12 | 13 | SRCS = $(wildcard *.cu) 14 | TARGETS=$(patsubst %.cu,%,$(SRCS)) 15 | 16 | $(TARGETS): %: %.cu 17 | $(NVCC) $(NVCC_FLAGS) -L$(CUPQC_DIR)/lib/ -lcupqc -lcuhash -o $@ $< -I$(COMMONDX_INCLUDE_DIR) -I$(CUPQC_INCLUDE_DIR) 18 | .PHONY: all clean 19 | 20 | all: $(TARGETS) 21 | 22 | clean: 23 | rm -f $(TARGETS) 24 | 25 | .DEFAULT_GOAL := all 26 | -------------------------------------------------------------------------------- /cuPQC/README.md: -------------------------------------------------------------------------------- 1 | # cuPQC Library - API Examples 2 | 3 | All examples are shipped within [cuPQC Software Development Kit](https://developer.nvidia.com/cupqc-downloads). 4 | 5 | ## Description 6 | 7 | This folder demonstrates how to use the libraries stored in the cuPQC SDK: cuPQC and cuHash. 8 | 9 | * [cuPQC download page](https://developer.nvidia.com/cupqc-downloads) 10 | * [cuPQC API documentation](https://docs.nvidia.com/cuda/cupqc/index.html) 11 | 12 | ## Requirements 13 | 14 | * [cuPQC SDK](https://developer.nvidia.com/cupqc-downloads) 15 | * [See cuPQC SDK requirements](https://docs.nvidia.com/cuda/cupqc/requirements.html) 16 | * Linux system with installed NVIDIA drivers 17 | * NVIDIA GPU of Volta (SM70) or newer architecture 18 | 19 | ## Build 20 | Download and expand the cuPQC SDK then use the Makefile located in this directory. Make sure that you set the `CUPQC_DIR` to the location of your expanded cuPQC SDK folder. 
21 | 22 | ``` 23 | export CUPQC_DIR=/path/to/cupqc-sdk 24 | make 25 | # Run 26 | ./example_ml_kem 27 | ./example_ml_dsa 28 | ./example_sha2 29 | ./example_sha3 30 | ``` 31 | 32 | ## Examples 33 | There are ML-KEM and ML-DSA examples in this directory; these demonstrate the usage of the cuPQC library and require `libcupqc.a`. 34 | There are also SHA2 and SHA3 examples that demonstrate the usage of the cuHash library; these require `libcuhash.a`. 35 | For detailed descriptions of the examples, please visit the [Examples](https://docs.nvidia.com/cuda/cupqc/examples.html) section of the cuPQC documentation. 36 | 37 | -------------------------------------------------------------------------------- /cuPQC/example_sha3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace cupqc; 7 | 8 | using SHA3_256_WARP = decltype(SHA3_256() + Warp()); 9 | 10 | __global__ void hash_sha3_kernel(uint8_t* digest, const uint8_t* msg, size_t inbuf_len) 11 | { 12 | SHA3_256_WARP hash {}; 13 | hash.reset(); 14 | hash.update(msg, inbuf_len); 15 | hash.finalize(); 16 | hash.digest(digest, SHA3_256_WARP::digest_size); 17 | } 18 | 19 | void hash_sha3(std::vector& digest, std::vector& msg) 20 | { 21 | uint8_t* d_msg; 22 | uint8_t* d_digest; 23 | cudaMalloc(reinterpret_cast(&d_msg), msg.size()); 24 | cudaMalloc(reinterpret_cast(&d_digest), digest.size()); 25 | 26 | cudaMemcpy(d_msg, msg.data(), msg.size(), cudaMemcpyHostToDevice); 27 | 28 | hash_sha3_kernel<<<1, 32>>>(d_digest, d_msg, msg.size()); 29 | 30 | cudaMemcpy(digest.data(), d_digest, digest.size(), cudaMemcpyDeviceToHost); 31 | 32 | cudaFree(d_msg); 33 | cudaFree(d_digest); 34 | } 35 | 36 | int main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) { 37 | const char * msg_str = "The quick brown fox jumps over the lazy dog"; 38 | std::vector msg(reinterpret_cast(msg_str), reinterpret_cast(msg_str) + strlen(msg_str)); 39 | std::vector 
digest(SHA3_256::digest_size, 0); 40 | hash_sha3(digest, msg); 41 | printf("SHA3-256: "); 42 | for (uint8_t num : digest) { 43 | printf("%02x", num); 44 | } 45 | printf("\n"); 46 | } 47 | -------------------------------------------------------------------------------- /cuRAND/.gitignore: -------------------------------------------------------------------------------- 1 | format/ -------------------------------------------------------------------------------- /cuRAND/Host/mrg32k3a/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/mrg32k3a/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_mrg32k3a_uniform_example \ 7 | curand_mrg32k3a_normal_example \ 8 | curand_mrg32k3a_lognormal_example\ 9 | curand_mrg32k3a_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/mt19937/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/mt19937/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 
2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_mt19937_uniform_example \ 7 | curand_mt19937_normal_example \ 8 | curand_mt19937_lognormal_example\ 9 | curand_mt19937_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/mtgp32/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/mtgp32/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_mtgp32_uniform_example \ 7 | curand_mtgp32_normal_example \ 8 | curand_mtgp32_lognormal_example\ 9 | curand_mtgp32_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/philox/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/philox/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 
2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_philox_uniform_example \ 7 | curand_philox_normal_example \ 8 | curand_philox_lognormal_example\ 9 | curand_philox_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/scrambled_sobol32/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/scrambled_sobol32/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_scrambled_sobol32_uniform_example \ 7 | curand_scrambled_sobol32_normal_example \ 8 | curand_scrambled_sobol32_lognormal_example\ 9 | curand_scrambled_sobol32_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/scrambled_sobol64/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/scrambled_sobol64/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 
2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_scrambled_sobol64_uniform_example \ 7 | curand_scrambled_sobol64_normal_example \ 8 | curand_scrambled_sobol64_lognormal_example\ 9 | curand_scrambled_sobol64_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/sobol32/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/sobol32/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_sobol32_uniform_example \ 7 | curand_sobol32_normal_example \ 8 | curand_sobol32_lognormal_example\ 9 | curand_sobol32_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/sobol64/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/sobol64/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 
2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_sobol64_uniform_example \ 7 | curand_sobol64_normal_example \ 8 | curand_sobol64_lognormal_example \ 9 | curand_sobol64_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuRAND/Host/xorwow/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /cuRAND/Host/xorwow/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/.. 2 | INC := -I$(CUDA_TOOLKIT)/include -I../../utils 3 | LIBS := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand 4 | FLAGS := -O3 -std=c++11 5 | 6 | ROUTINES := curand_xorwow_uniform_example \ 7 | curand_xorwow_normal_example \ 8 | curand_xorwow_lognormal_example\ 9 | curand_xorwow_poisson_example 10 | 11 | all: $(ROUTINES) 12 | 13 | %: %.cpp 14 | nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS) 15 | 16 | clean: 17 | rm -f $(ROUTINES) 18 | 19 | .PHONY: clean all test 20 | -------------------------------------------------------------------------------- /cuSOLVER/MgGetrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/MgPotrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/MgSyevd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 
-------------------------------------------------------------------------------- /cuSOLVER/Xgeev/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xgeqrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xgesvd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xgesvdp/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xgesvdr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xgetrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xpotrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xsyevd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xsyevdx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/Xtrtri/.gitignore: 
-------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/csrqr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/gesv/.gitignore: -------------------------------------------------------------------------------- 1 | build -------------------------------------------------------------------------------- /cuSOLVER/gesvd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/gesvdaStridedBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/gesvdj/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/gesvdjBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/getrf/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/orgqr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/ormqr/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 
-------------------------------------------------------------------------------- /cuSOLVER/potrfBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/syevd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/syevdx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/syevj/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/syevjBatched/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/sygvd/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/sygvdx/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVER/sygvj/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /cuSOLVERMp/.gitignore: -------------------------------------------------------------------------------- 1 | mp_getrf_getrs 2 | mp_potrf_potrs 3 | -------------------------------------------------------------------------------- 
/cuSOLVERSp2cuDSS/test_complex.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate complex general 2 | %------------------------------------------------------------------------------- 3 | 12 12 46 4 | 1 1 10 0 5 | 2 1 1 0 6 | 3 1 1 0 7 | 1 2 1 0 8 | 2 2 10 0 9 | 4 2 1 0 10 | 5 2 2 0 11 | 1 3 1 0 12 | 3 3 10 0 13 | 5 3 1 0 14 | 6 3 1 0 15 | 2 4 1 0 16 | 4 4 10 0 17 | 7 4 1 0 18 | 8 4 1 0 19 | 2 5 2 0 20 | 3 5 1 0 21 | 5 5 10 0 22 | 8 5 2 0 23 | 9 5 2 0 24 | 3 6 1 0 25 | 6 6 10 0 26 | 9 6 1 0 27 | 4 7 1 0 28 | 7 7 10 0 29 | 10 7 2 0 30 | 4 8 1 0 31 | 5 8 2 0 32 | 8 8 10 0 33 | 10 8 2 0 34 | 11 8 1 0 35 | 5 9 2 0 36 | 6 9 1 0 37 | 9 9 10 0 38 | 11 9 1 0 39 | 7 10 2 0 40 | 8 10 2 0 41 | 10 10 10 0 42 | 12 10 2 0 43 | 8 11 1 0 44 | 9 11 1 0 45 | 11 11 10 0 46 | 12 11 1 0 47 | 10 12 2 0 48 | 11 12 1 0 49 | 12 12 10 0 50 | -------------------------------------------------------------------------------- /cuSOLVERSp2cuDSS/test_real.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | %------------------------------------------------------------------------------- 3 | 12 12 46 4 | 1 1 10 5 | 2 1 1 6 | 3 1 1 7 | 1 2 1 8 | 2 2 10 9 | 4 2 1 10 | 5 2 2 11 | 1 3 1 12 | 3 3 10 13 | 5 3 1 14 | 6 3 1 15 | 2 4 1 16 | 4 4 10 17 | 7 4 1 18 | 8 4 1 19 | 2 5 2 20 | 3 5 1 21 | 5 5 10 22 | 8 5 2 23 | 9 5 2 24 | 3 6 1 25 | 6 6 10 26 | 9 6 1 27 | 4 7 1 28 | 7 7 10 29 | 10 7 2 30 | 4 8 1 31 | 5 8 2 32 | 8 8 10 33 | 10 8 2 34 | 11 8 1 35 | 5 9 2 36 | 6 9 1 37 | 9 9 10 38 | 11 9 1 39 | 7 10 2 40 | 8 10 2 41 | 10 10 10 42 | 12 10 2 43 | 8 11 1 44 | 9 11 1 45 | 11 11 10 46 | 12 11 1 47 | 10 12 2 48 | 11 12 1 49 | 12 12 10 50 | -------------------------------------------------------------------------------- /cuSPARSE/axpby/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - 
`cusparseAxpby` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseAxpby` for performing *sparse vector - dense vector scaling and sum*. 6 | 7 | [cusparseAxpby Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-axpby) 8 | 9 |
10 | 11 | `Y = alpha * X + beta * Y` 12 | 13 | ![](axpby.png) 14 |
15 | 16 | ## Building 17 | 18 | * Command line 19 | ```bash 20 | nvcc -I<cuda_toolkit_path>/include axpby_example.c -o axpby_example -lcusparse 21 | ``` 22 | 23 | * Linux 24 | ```bash 25 | make 26 | ``` 27 | 28 | * Windows/Linux 29 | ```bash 30 | mkdir build 31 | cd build 32 | cmake .. 33 | make 34 | ``` 35 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 36 | 37 | ## Support 38 | 39 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 40 | * **Supported OSes:** Linux, Windows, QNX, Android 41 | * **Supported CPU Architectures**: x86_64, arm64 42 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 43 | * **Language**: `C99` 44 | 45 | ## Prerequisites 46 | 47 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 
48 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 49 | -------------------------------------------------------------------------------- /cuSPARSE/axpby/axpby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/axpby/axpby.png -------------------------------------------------------------------------------- /cuSPARSE/bicgstab/BiCGStab.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/bicgstab/BiCGStab.pdf -------------------------------------------------------------------------------- /cuSPARSE/bicgstab/BiCGStab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/bicgstab/BiCGStab.png -------------------------------------------------------------------------------- /cuSPARSE/cg/cg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/cg/cg.pdf -------------------------------------------------------------------------------- /cuSPARSE/cg/cg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/cg/cg.png -------------------------------------------------------------------------------- /cuSPARSE/coosort/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE APIs - `cusparseXcoosort` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseXcoosortByRow` to perform sorting of COO 
format. 6 | 7 | [cusparseXcoosort Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#coosort) 8 | 9 | ## Building 10 | 11 | * Command line 12 | ```bash 13 | nvcc -I<cuda_toolkit_path>/include coosort_example.c -o coosort_example -lcusparse 14 | ``` 15 | 16 | * Linux 17 | ```bash 18 | make 19 | ``` 20 | 21 | * Windows/Linux 22 | ```bash 23 | mkdir build 24 | cd build 25 | cmake .. 26 | make 27 | ``` 28 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 29 | 30 | ## Support 31 | 32 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 33 | * **Supported OSes:** Linux, Windows, QNX, Android 34 | * **Supported CPU Architectures**: x86_64, arm64 35 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 36 | * **Language**: `C99` 37 | 38 | ## Prerequisites 39 | 40 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 
41 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 42 | -------------------------------------------------------------------------------- /cuSPARSE/dense2sparse_blockedell/dense2sparse_blockedell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/dense2sparse_blockedell/dense2sparse_blockedell.png -------------------------------------------------------------------------------- /cuSPARSE/dense2sparse_csr/dense2sparse_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/dense2sparse_csr/dense2sparse_csr.png -------------------------------------------------------------------------------- /cuSPARSE/gather/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `cusparseGather` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseGather` for performing *sparse vector - dense vector element gathering*. 6 | 7 | [cusparseGather Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-gather) 8 | 9 |
10 | 11 | ![](gather.png) 12 |
13 | 14 | ## Building 15 | 16 | * Command line 17 | ```bash 18 | nvcc -I<cuda_toolkit_path>/include gather_example.c -o gather_example -lcusparse 19 | ``` 20 | 21 | * Linux 22 | ```bash 23 | make 24 | ``` 25 | 26 | * Windows/Linux 27 | ```bash 28 | mkdir build 29 | cd build 30 | cmake .. 31 | make 32 | ``` 33 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 34 | 35 | ## Support 36 | 37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 38 | * **Supported OSes:** Linux, Windows, QNX, Android 39 | * **Supported CPU Architectures**: x86_64, arm64 40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 41 | * **Language**: `C99` 42 | 43 | ## Prerequisites 44 | 45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 
46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 47 | -------------------------------------------------------------------------------- /cuSPARSE/gather/gather.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/gather/gather.png -------------------------------------------------------------------------------- /cuSPARSE/graph_capture/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `CUDA Graph Capture` 2 | 3 | ## Description 4 | 5 | The sample demonstrates how to optimize *sparse vector - dense vector dot product* (`cusparseSpVV`) by exploiting *CUDA Graph Capture functionality*. 6 | 7 | [cuSPARSE Optimization Notes](https://docs.nvidia.com/cuda/cusparse/index.html#optimization-notes) 8 | 9 | ## Building 10 | 11 | * Command line 12 | ```bash 13 | nvcc -I<cuda_toolkit_path>/include graph_capture_example.c -o graph_capture_example -lcusparse 14 | ``` 15 | 16 | * Linux 17 | ```bash 18 | make 19 | ``` 20 | 21 | * Windows/Linux 22 | ```bash 23 | mkdir build 24 | cd build 25 | cmake .. 26 | make 27 | ``` 28 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 
29 | 30 | ## Support 31 | 32 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 33 | * **Supported OSes:** Linux, Windows, QNX, Android 34 | * **Supported CPU Architectures**: x86_64, arm64 35 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 36 | * **Language**: `C99` 37 | 38 | ## Prerequisites 39 | 40 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 41 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 42 | -------------------------------------------------------------------------------- /cuSPARSE/rot/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `cusparseRot` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseRot` for performing *Givens rotation*. 6 | 7 | [cusparseRot Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-rot) 8 | 9 |
10 | 11 | ![](rot.png) 12 |
13 | 14 | ## Building 15 | 16 | * Command line 17 | ```bash 18 | nvcc -I<cuda_toolkit_path>/include rot_example.c -o rot_example -lcusparse 19 | ``` 20 | 21 | * Linux 22 | ```bash 23 | make 24 | ``` 25 | 26 | * Windows/Linux 27 | ```bash 28 | mkdir build 29 | cd build 30 | cmake .. 31 | make 32 | ``` 33 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 34 | 35 | ## Support 36 | 37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 38 | * **Supported OSes:** Linux, Windows, QNX, Android 39 | * **Supported CPU Architectures**: x86_64, arm64 40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 41 | * **Language**: `C99` 42 | 43 | ## Prerequisites 44 | 45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 47 | -------------------------------------------------------------------------------- /cuSPARSE/rot/rot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/rot/rot.png -------------------------------------------------------------------------------- /cuSPARSE/scatter/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `cusparseScatter` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseScatter` for performing *sparse vector - dense vector element scattering*. 6 | 7 | [cusparseScatter Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-scatter) 8 | 9 |
10 | 11 | ![](scatter.png) 12 |
13 | 14 | ## Building 15 | 16 | * Command line 17 | ```bash 18 | nvcc -I<cuda_toolkit_path>/include scatter_example.c -o scatter_example -lcusparse 19 | ``` 20 | 21 | * Linux 22 | ```bash 23 | make 24 | ``` 25 | 26 | * Windows/Linux 27 | ```bash 28 | mkdir build 29 | cd build 30 | cmake .. 31 | make 32 | ``` 33 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 34 | 35 | ## Support 36 | 37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 38 | * **Supported OSes:** Linux, Windows, QNX, Android 39 | * **Supported CPU Architectures**: x86_64, arm64 40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 41 | * **Language**: `C99` 42 | 43 | ## Prerequisites 44 | 45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 47 | -------------------------------------------------------------------------------- /cuSPARSE/scatter/scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/scatter/scatter.png -------------------------------------------------------------------------------- /cuSPARSE/sddmm_bsr/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `cusparseSDDMM BSR` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseSDDMM` for performing *dense matrix - dense matrix multiplication into sparse matrix*, where the sparse matrix is represented in BSR (Block Sparse Row) storage format.
6 | 7 | [cusparseSDDMM Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-sddmm) 8 | 9 |
10 | 11 | `C = (alpha * A * B) ° spy(C) + beta * C` 12 | 13 | ![](sddmm_bsr.png) 14 |
15 | 16 | ## Building 17 | 18 | * Linux 19 | ```bash 20 | make 21 | ``` 22 | 23 | * Windows/Linux 24 | ```bash 25 | mkdir build 26 | cd build 27 | cmake .. 28 | make 29 | ``` 30 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 31 | 32 | ## Support 33 | 34 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6 35 | * **Supported OSes:** Linux, Windows, QNX, Android 36 | * **Supported CPU Architectures**: x86_64, arm64 37 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 38 | * **Language**: `C99` 39 | 40 | ## Prerequisites 41 | 42 | * [CUDA 12.1 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 43 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 44 | -------------------------------------------------------------------------------- /cuSPARSE/sddmm_bsr/sddmm_bsr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_bsr/sddmm_bsr.png -------------------------------------------------------------------------------- /cuSPARSE/sddmm_csr/sddmm_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_csr/sddmm_csr.png -------------------------------------------------------------------------------- /cuSPARSE/sddmm_csr_batched/sddmm_csr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_csr_batched/sddmm_csr.png -------------------------------------------------------------------------------- /cuSPARSE/sparse2dense_csr/sparse2dense_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sparse2dense_csr/sparse2dense_csr.png -------------------------------------------------------------------------------- /cuSPARSE/spgemm/spgemm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm/spgemm.png -------------------------------------------------------------------------------- /cuSPARSE/spgemm_mem/spgemm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm_mem/spgemm.png -------------------------------------------------------------------------------- /cuSPARSE/spgemm_reuse/spgemm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm_reuse/spgemm.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_blockedell/spmm_blockedell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_blockedell/spmm_blockedell.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_coo/spmm_coo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_coo/spmm_coo.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_coo_batched/spmm_coo_batched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_coo_batched/spmm_coo_batched.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_csr/spmm_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr/spmm_csr.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_csr_batched/spmm_csr_batched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr_batched/spmm_csr_batched.png -------------------------------------------------------------------------------- /cuSPARSE/spmm_csr_op/spmm_csr_op.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr_op/spmm_csr_op.png -------------------------------------------------------------------------------- /cuSPARSE/spmv_coo/spmv_coo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_coo/spmv_coo.png 
-------------------------------------------------------------------------------- /cuSPARSE/spmv_csr/spmv_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_csr/spmv_csr.png -------------------------------------------------------------------------------- /cuSPARSE/spmv_sell/spmv_sell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_sell/spmv_sell.png -------------------------------------------------------------------------------- /cuSPARSE/spsm_coo/spsm_coo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsm_coo/spsm_coo.png -------------------------------------------------------------------------------- /cuSPARSE/spsm_csr/spsm_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsm_csr/spsm_csr.png -------------------------------------------------------------------------------- /cuSPARSE/spsv_coo/spsv_coo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_coo/spsv_coo.png -------------------------------------------------------------------------------- /cuSPARSE/spsv_csr/spsv_csr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_csr/spsv_csr.png 
-------------------------------------------------------------------------------- /cuSPARSE/spsv_sell/spsv_sell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_sell/spsv_sell.png -------------------------------------------------------------------------------- /cuSPARSE/spvv/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSE Generic APIs - `cusparseSpVV` 2 | 3 | ## Description 4 | 5 | This sample demonstrates the usage of `cusparseSpVV` for performing *sparse vector - dense vector dot product*. 6 | 7 | [cusparseSpVV Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-spvv) 8 | 9 |
10 | 11 | `result = X * Y` or `result = X^H * Y` 12 | 13 | ![](spvv.png) 14 |
15 | 16 | ## Building 17 | 18 | * Command line 19 | ```bash 20 | nvcc -I<cuda_toolkit_path>/include spvv_example.c -o spvv_example -lcusparse 21 | ``` 22 | 23 | * Linux 24 | ```bash 25 | make 26 | ``` 27 | 28 | * Windows/Linux 29 | ```bash 30 | mkdir build 31 | cd build 32 | cmake .. 33 | make 34 | ``` 35 | On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build. 36 | 37 | ## Support 38 | 39 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0 40 | * **Supported OSes:** Linux, Windows, QNX, Android 41 | * **Supported CPU Architectures**: x86_64, arm64 42 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc 43 | * **Language**: `C99` 44 | 45 | ## Prerequisites 46 | 47 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 48 | * [CMake 3.9](https://cmake.org/download/) or above on Windows 49 | -------------------------------------------------------------------------------- /cuSPARSE/spvv/spvv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spvv/spvv.png -------------------------------------------------------------------------------- /cuSPARSELt/README.md: -------------------------------------------------------------------------------- 1 | # cuSPARSELt Library 2 | 3 | ## Description 4 | 5 | This folder demonstrates cuSPARSELt Generic APIs usage.
6 | 7 | [cuSPARSELt Documentation](https://docs.nvidia.com/cuda/cusparselt/index.html) 8 | 9 | ## cuSPARSELt Samples 10 | 11 | * [Structured Matrix-Matrix Multiplication - Basic Concepts](matmul/) 12 | 13 | The sample demonstrates how to exploit *Sparse Tensor Cores* for performing Structured Matrix-Matrix Multiplication 14 | 15 | * [Batched GEMM, Activation Function, and Bias](matmul_advanced/) 16 | 17 | The sample extends the previous code to demonstrate how to perform batched GEMM computation, Split-K, and how to set up the activation function and bias 18 | -------------------------------------------------------------------------------- /cuTENSOR/Makefile: -------------------------------------------------------------------------------- 1 | CXX_FLAGS=-std=c++11 -I${CUTENSOR_ROOT}/include -L${CUTENSOR_ROOT}/lib -lcutensor -lcudart 2 | 3 | all: 4 | nvcc einsum.cu -o einsum ${CXX_FLAGS} 5 | nvcc contraction.cu -o contraction ${CXX_FLAGS} 6 | nvcc contraction_jit.cu -o contraction_jit ${CXX_FLAGS} 7 | nvcc elementwise_binary.cu -o elementwise_binary ${CXX_FLAGS} 8 | nvcc elementwise_permute.cu -o elementwise_permute ${CXX_FLAGS} 9 | nvcc elementwise_trinary.cu -o elementwise_trinary ${CXX_FLAGS} 10 | nvcc reduction.cu -o reduction ${CXX_FLAGS} 11 | 12 | run: 13 | ./einsum 14 | ./contraction 15 | ./contraction_jit 16 | ./elementwise_binary 17 | ./elementwise_permute 18 | ./elementwise_trinary 19 | ./reduction 20 | 21 | clean: 22 | rm -f einsum contraction contraction_jit elementwise_binary elementwise_permute elementwise_trinary reduction 23 | -------------------------------------------------------------------------------- /cuTENSOR/README.md: -------------------------------------------------------------------------------- 1 | # cuTENSOR - Samples 2 | 3 | * [Documentation](https://docs.nvidia.com/cuda/cutensor/index.html) 4 | 5 | # Install 6 | 7 | ## Linux 8 | 9 | You can use make or cmake to compile the cuTENSOR samples.
10 | 11 | With make 12 | 13 | ``` 14 | export CUTENSOR_ROOT=<path_to_cutensor_root> 15 | make -j8 16 | ``` 17 | 18 | With cmake 19 | 20 | ``` 21 | mkdir build && cd build 22 | cmake .. -DCUTENSOR_ROOT=<path_to_cutensor_root> 23 | make -j8 24 | ``` 25 | 26 | ## Windows 27 | 28 | We recommend using cmake with Ninja generator to compile: 29 | 30 | ``` 31 | mkdir build && cd build 32 | cmake .. -DCUTENSOR_ROOT=<path_to_cutensor_root> -G Ninja 33 | ninja 34 | ``` 35 | 36 | To run the examples, make sure the library files are located in a directory included in your %PATH% 37 | -------------------------------------------------------------------------------- /cuTENSOR/python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.h 2 | -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/NVLogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/NVLogo.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/NVLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/NVLogo.png -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/img9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/img9.png -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/img9wm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/img9wm.png -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/cat_baseline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat_baseline.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/cat_grayscale.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat_grayscale.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img1.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img2.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img3.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img4.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img5.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img6.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img7.jpg 
-------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img8.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize-WaterMark/input_images/img9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img9.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/cat_baseline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat_baseline.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/cat_grayscale.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat_grayscale.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img1.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img2.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img3.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img4.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img5.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img6.jpg 
-------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img7.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img8.jpg -------------------------------------------------------------------------------- /nvJPEG/Image-Resize/input_images/img9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img9.jpg -------------------------------------------------------------------------------- /nvJPEG/README.md: -------------------------------------------------------------------------------- 1 | # nvJPEG Library API examples 2 | 3 | ## Description 4 | 5 | This folder demonstrates nvJPEG library API usage. 
6 | 7 | ## Key Concepts 8 | 9 | Image Encoding and Decoding from NVJPEG Library 10 | 11 | ## Examples 12 | 13 | [JPEG Image Decoder](nvJPEG-Decoder/) 14 | 15 | [JPEG Image Decoder MultipleInstances](nvJPEG-Decoder-MultipleInstances/) 16 | 17 | [JPEG Image Decoder Backend and ROI](nvJPEG-Decoder-Backend-ROI/) 18 | 19 | [Image Resize](Image-Resize/) 20 | 21 | [Image Resize Watermarking](Image-Resize-WaterMark/) 22 | 23 | ## Supported SM Architectures 24 | 25 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) 26 | 27 | ## Supported OSes 28 | 29 | Linux Windows 30 | 31 | ## Supported CPU Architecture 32 | 33 | x86_64 34 | 35 | ## CUDA APIs involved 36 | [NVJPEG](https://docs.nvidia.com/cuda/nvjpeg/index.html) 37 | 38 | 39 | # Prerequisites 40 | - A Linux system with recent NVIDIA drivers. 41 | - Install the [CUDA 11.0 toolkit and above](https://developer.nvidia.com/cuda-downloads). 42 | 43 | -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | # 10 | 11 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 12 | 13 | project(nvJPEGROIDecode LANGUAGES CXX CUDA) 14 | 15 | # ---[ Project specification. 16 | SET(PROJECT_NAME nvJPEGROIDecode) 17 | PROJECT(${PROJECT_NAME} LANGUAGES CUDA CXX) # NOTE(review): repeats the project() call on line 13 with the same name and languages 18 | 19 | if(NOT DEFINED CMAKE_CUDA_STANDARD) 20 | set(CMAKE_CUDA_STANDARD 11) 21 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 22 | endif() 23 | 24 | set(CMAKE_CXX_STANDARD 11) 25 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 26 | set(CMAKE_CXX_EXTENSIONS OFF) 27 | 28 | 29 | include_directories( 30 | SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 31 | ) 32 | 33 | 34 | SET(EXAMPLES_DESCRIPTOR_SOURCES "nvJPEGROIDecode.cpp") 35 | 36 | add_executable(nvJPEGROIDecode ${EXAMPLES_DESCRIPTOR_SOURCES}) 37 | 38 | find_library(NVJPEG_LIB 39 | NAMES nvjpeg 40 | PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 41 | 42 | find_library(CUDART_LIB 43 | NAMES cudart 44 | PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) 45 | 46 | target_link_libraries(nvJPEGROIDecode PUBLIC ${NVJPEG_LIB} ${CUDART_LIB} pthread) # NOTE(review): links bare "pthread"; confirm this resolves on non-POSIX toolchains 47 | -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/img9_roi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/img9_roi.png -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_baseline.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_baseline.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_grayscale.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_grayscale.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img1.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img2.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img3.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img4.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img5.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img6.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img7.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img8.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img9.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img9.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-MultipleInstances/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | #
10 |
11 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
12 |
13 | # Builds the nvJPEGDecMultipleInstances sample: one host source file linked
14 | # against nvJPEG, the CUDA runtime and the platform thread library.
15 | # project() is called exactly once; it also defines PROJECT_NAME, so no
16 | # manual SET(PROJECT_NAME ...) followed by a second PROJECT() call is needed.
17 | project(nvJPEGDecMultipleInstances LANGUAGES CXX CUDA)
18 |
19 | # ---[ Language standards.
20 | # CUDA defaults to C++11 only when the caller has not already chosen a standard.
21 | if(NOT DEFINED CMAKE_CUDA_STANDARD)
22 |   set(CMAKE_CUDA_STANDARD 11)
23 |   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
24 | endif()
25 |
26 | set(CMAKE_CXX_STANDARD 11)
27 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
28 | set(CMAKE_CXX_EXTENSIONS OFF)
29 |
30 | # ---[ Include paths.
31 | # CUDA toolkit headers are added as SYSTEM include directories.
32 | include_directories(
33 |   SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
34 | )
35 |
36 | # ---[ Sample executable.
37 | set(EXAMPLES_DESCRIPTOR_SOURCES "nvJPEGDecMultipleInstances.cpp")
38 |
39 | add_executable(nvJPEGDecMultipleInstances ${EXAMPLES_DESCRIPTOR_SOURCES})
40 |
41 | # ---[ Dependencies.
42 | find_library(NVJPEG_LIB
43 |   NAMES nvjpeg
44 |   PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
45 |
46 | find_library(CUDART_LIB
47 |   NAMES cudart
48 |   PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
49 |
50 | # Report a missing library by name at configure time. The REQUIRED option of
51 | # find_library() needs CMake 3.18, newer than the 3.13 minimum declared above,
52 | # so the results are checked by hand.
53 | if(NOT NVJPEG_LIB)
54 |   message(FATAL_ERROR "nvjpeg library not found; searched ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
55 | endif()
56 |
57 | if(NOT CUDART_LIB)
58 |   message(FATAL_ERROR "cudart library not found; searched ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
59 | endif()
60 |
61 | # Let CMake pick the thread library for the platform instead of hard-coding
62 | # the "pthread" link item.
63 | find_package(Threads REQUIRED)
64 |
65 | target_link_libraries(nvJPEGDecMultipleInstances PUBLIC
66 |   ${NVJPEG_LIB}
67 |   ${CUDART_LIB}
68 |   Threads::Threads)
69 |
70 |
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat_baseline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat_baseline.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat_grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat_grayscale.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder/input_images/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img1.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder/input_images/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img2.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder/input_images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img3.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder/input_images/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img4.jpg -------------------------------------------------------------------------------- /nvJPEG/nvJPEG-Decoder/input_images/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img5.jpg -------------------------------------------------------------------------------- 
/nvJPEG/nvJPEG-Decoder/input_images/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img6.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img7.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img8.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img9.jpg
--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Encoder-MultipleInstances/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
9 | #
10 |
11 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
12 |
13 | # Opt-in aarch64 cross build (off by default). The target platform and the
14 | # aarch64-linux-gnu compilers are selected here, ahead of project().
15 | option(CROSS_COMPILE_AARCH64 "Cross compile for ARM64" OFF)
16 | if(CROSS_COMPILE_AARCH64)
17 |     set(CMAKE_SYSTEM_NAME Linux)
18 |     set(CMAKE_SYSTEM_PROCESSOR aarch64)
19 |     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
20 |     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
21 |     set(CMAKE_CUDA_HOST_COMPILER aarch64-linux-gnu-g++)
22 | endif()
23 |
24 | project(nvJPEGEncMultipleInstances LANGUAGES CXX CUDA)
25 |
26 | # The sample is a single executable named after the project, built from one
27 | # host source file with C++17 requested for both C++ and CUDA.
28 | add_executable(${PROJECT_NAME} "nvJPEGEncMultipleInstances.cpp")
29 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17 cuda_std_17)
30 |
31 | # Imported targets: the thread library plus nvJPEG and the static CUDA runtime
32 | # from a CUDA Toolkit of at least version 12.9.
33 | find_package(Threads REQUIRED)
34 | find_package(CUDAToolkit 12.9 REQUIRED)
35 |
36 | target_link_libraries(${PROJECT_NAME}
37 |     PUBLIC
38 |         CUDA::nvjpeg
39 |         CUDA::cudart_static
40 |         Threads::Threads)
--------------------------------------------------------------------------------
/nvJPEG2000/README.md:
--------------------------------------------------------------------------------
1 | # nvJPEG2000 Library API examples
2 |
3 | ## Description
4 |
5 | This folder demonstrates nvJPEG2000 library API usage.
6 | 7 | ## Key Concepts 8 | 9 | Image decoding with the nvJPEG2000 library 10 | 11 | ## Examples 12 | 13 | - [JPEG2000 Image Decoder](nvJPEG2000-Decoder/) 14 | - [JPEG2000 Image Decoder Pipelined](nvJPEG2000-Decoder-Pipelined/) 15 | - [JPEG2000 Image Decoder Tile Partial](nvjpeg2000-Decoder-Tile-Partial/) 16 | 17 | ## Supported SM Architectures 18 | 19 | [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) 20 | 21 | ## Supported OSes 22 | 23 | Linux, Windows 24 | 25 | ## Supported CPU Architecture 26 | 27 | x86_64 28 | 29 | ## CUDA APIs involved 30 | [nvJPEG2000](https://docs.nvidia.com/cuda/nvjpeg2000/index.html) 31 | 32 | 33 | # Prerequisites 34 | - A Linux system with recent NVIDIA drivers. 35 | - Install the [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads).
36 | 37 | -------------------------------------------------------------------------------- /nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossless/2k_lossless.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossless/2k_lossless.jp2 -------------------------------------------------------------------------------- /nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossy/2k_lossy.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossy/2k_lossy.jp2 -------------------------------------------------------------------------------- /nvJPEG2000/nvJPEG2000-Decoder/images/4k_image_lossy/4k_lossy.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/4k_image_lossy/4k_lossy.jp2 -------------------------------------------------------------------------------- /nvJPEG2000/nvJPEG2000-Encoder/images/TestImage640x480.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Encoder/images/TestImage640x480.bmp -------------------------------------------------------------------------------- /nvTIFF/README.md: -------------------------------------------------------------------------------- 1 | # nvTIFF Library API examples 2 | 3 | ## Description 4 | 5 | This folder demonstrates nvTIFF library API usage. 
6 | 7 | ## Key Concepts 8 | 9 | TIFF Image Decoding and Encoding from nvTIFF Library 10 | 11 | ## Examples 12 | 13 | [TIFF Image Decoder Encoder](nvTIFF-Decode-Encode/) 14 | 15 | [GeoTIFF Image Decoder](nvTIFF-GeoTIFF-Decode/) 16 | 17 | [TIFF Image Decoding with ROI](nvTIFF-Decode-Image-ROI/) 18 | 19 | 20 | ## Supported SM Architectures 21 | 22 | [SM 6.0 +](https://developer.nvidia.com/cuda-gpus) 23 | 24 | ## Supported OSes 25 | 26 | Linux, Windows 27 | 28 | ## Supported CPU Architecture 29 | 30 | x86_64, arm64-sbsa, aarch64-jetson 31 | 32 | ## CUDA APIs involved 33 | [nvTIFF](https://docs.nvidia.com/cuda/nvtiff/index.html) 34 | 35 | 36 | # Prerequisites 37 | - A Linux system with recent NVIDIA drivers. 38 | - Install the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads). 39 | - [nvCOMP](https://developer.nvidia.com/nvcomp-download) for Deflate decompression support 40 | 41 | -------------------------------------------------------------------------------- /nvTIFF/nvTIFF-Decode-Encode/images/bali_notiles.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvTIFF/nvTIFF-Decode-Encode/images/bali_notiles.tif -------------------------------------------------------------------------------- /nvTIFF/nvTIFF-GeoTIFF-Decode/images/bali_notiles.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvTIFF/nvTIFF-GeoTIFF-Decode/images/bali_notiles.tif --------------------------------------------------------------------------------