├── LICENSE.TXT
├── MathDx
    ├── README.md
    ├── cuBLASDx
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── arch_runner.hpp
    │   ├── batched_gemm_fp64.cu
    │   ├── block_io.hpp
    │   ├── blockdim_gemm_fp16.cu
    │   ├── check_error.cu
    │   ├── check_error.hpp
    │   ├── common.hpp
    │   ├── common_nvrtc.hpp
    │   ├── device_gemm_performance.cu
    │   ├── flops.h
    │   ├── fused_gemm_performance.cu
    │   ├── gemm_fft.cu
    │   ├── gemm_fft_fp16.cu
    │   ├── gemm_fft_performance.cu
    │   ├── gemm_fusion.cu
    │   ├── introduction_example.cu
    │   ├── multiblock_gemm.cu
    │   ├── nvrtc_gemm.cpp
    │   ├── reduce.hpp
    │   ├── reference.hpp
    │   ├── reference
    │   │   ├── cublas_reference.hpp
    │   │   ├── naive_reference.cu
    │   │   └── naive_reference.hpp
    │   ├── scaled_dot_prod_attn.cu
    │   ├── scaled_dot_prod_attn_batched.cu
    │   ├── simple_gemm_aat.cu
    │   ├── simple_gemm_cfp16.cu
    │   ├── simple_gemm_custom_layout.cu
    │   ├── simple_gemm_fp32.cu
    │   ├── simple_gemm_fp32_decoupled.cu
    │   ├── simple_gemm_fp8.cu
    │   ├── simple_gemm_int8_int8_int32.cu
    │   ├── simple_gemm_leading_dimensions.cu
    │   ├── simple_gemm_mixed_precision.cu
    │   ├── simple_gemm_std_complex_fp32.cu
    │   ├── simple_gemm_transform.cu
    │   ├── single_gemm_performance.cu
    │   └── single_gemm_performance.hpp
    ├── cuFFTDx
    │   ├── 00_introduction_example
    │   │   └── 00_introduction_example.cu
    │   ├── 01_simple_fft_thread
    │   │   ├── 00_simple_fft_thread.cu
    │   │   ├── 01_simple_fft_thread_fp16.cu
    │   │   ├── 02_simple_fft_thread_lto.cu
    │   │   └── 02_simple_fft_thread_lto_cases.csv
    │   ├── 02_simple_fft_block
    │   │   ├── 00_simple_fft_block.cu
    │   │   ├── 01_simple_fft_block_shared.cu
    │   │   ├── 02_simple_fft_block_std_complex.cu
    │   │   ├── 03_simple_fft_block_half2.cu
    │   │   ├── 04_simple_fft_block_fp16.cu
    │   │   ├── 05_simple_fft_block_c2r.cu
    │   │   ├── 06_simple_fft_block_r2c.cu
    │   │   ├── 07_simple_fft_block_c2r_fp16.cu
    │   │   ├── 08_simple_fft_block_r2c_fp16.cu
    │   │   ├── 09_simple_fft_block_cub_io.cu
    │   │   ├── 10_simple_fft_block_c2r_lto.cu
    │   │   └── 10_simple_fft_block_c2r_lto_cases.csv
    │   ├── 03_block_fft_performance
    │   │   ├── 00_block_fft_performance.cu
    │   │   ├── 01_block_fft_performance_many.cu
    │   │   ├── 02_block_fft_lto_ptx_performance.cu
    │   │   ├── 02_block_fft_lto_ptx_performance_cases.csv
    │   │   └── block_fft_performance.hpp
    │   ├── 04_nvrtc_fft
    │   │   ├── 00_nvrtc_fft_thread.cu
    │   │   ├── 01_nvrtc_fft_block.cu
    │   │   ├── 02_nvrtc_fft_thread_lto.cu
    │   │   └── 03_nvrtc_fft_block_lto.cu
    │   ├── 05_fft_Xd
    │   │   ├── 00_fft_2d.cu
    │   │   ├── 01_fft_2d_single_kernel.cu
    │   │   ├── 02_fft_2d_r2c_c2r.cu
    │   │   ├── 03_fft_3d.cu
    │   │   ├── 04_fft_3d_box_single_block.cu
    │   │   └── 05_fft_3d_cube_single_block.cu
    │   ├── 06_convolution
    │   │   ├── 00_convolution.cu
    │   │   ├── 01_convolution_padded.cu
    │   │   ├── 02_convolution_performance.cu
    │   │   └── 03_convolution_r2c_c2r.cu
    │   ├── 07_convolution_3d
    │   │   ├── 00_convolution_3d.cu
    │   │   ├── 01_convolution_3d_c2r.cu
    │   │   ├── 02_convolution_3d_r2c.cu
    │   │   ├── 03_convolution_3d_padded.cu
    │   │   ├── 04_convolution_3d_padded_r2c.cu
    │   │   ├── index_mapper.hpp
    │   │   ├── io_strided_conv_smem.hpp
    │   │   ├── io_strided_conv_smem_padded.hpp
    │   │   ├── kernels.hpp
    │   │   └── reference.hpp
    │   ├── 08_mixed_precision
    │   │   ├── 00_mixed_precision_fft_1d.cu
    │   │   └── 01_mixed_precision_fft_2d.cu
    │   ├── 09_introduction_lto_example
    │   │   ├── 00_introduction_lto_cases.csv
    │   │   ├── 00_introduction_lto_example.cu
    │   │   ├── CMakeLists.txt
    │   │   └── Makefile
    │   ├── 10_cufft_device_api_example
    │   │   ├── 00_cufft_device_api_example.cu
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   └── cufft_device_api_lto_helper
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── cufft_device_api_lto_helper.cmake
    │   │   │   └── cufft_device_api_lto_helper.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── README_LTO_EA.md
    │   ├── common
    │   │   ├── block_io.hpp
    │   │   ├── block_io_generic_strided.hpp
    │   │   ├── block_io_strided.hpp
    │   │   ├── common.hpp
    │   │   ├── common_nvjitlink.hpp
    │   │   ├── common_nvrtc.hpp
    │   │   ├── fp16_common.hpp
    │   │   ├── mixed_io.hpp
    │   │   ├── padded_io.hpp
    │   │   └── random.hpp
    │   └── lto_helper
    │   │   ├── CMakeLists.txt
    │   │   ├── common_lto.hpp
    │   │   ├── cufftdx_cufft_lto_helper.cpp
    │   │   └── lto_helper.cmake
    ├── cuRANDDx
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── common.hpp
    │   ├── mrg_two_distributions_thread_api.cu
    │   ├── nvrtc_helper.hpp
    │   ├── nvrtc_pcg_thread_api.cpp
    │   ├── philox_thread_api.cu
    │   ├── simple_pcg_thread_api.cu
    │   ├── sobol_thread_api.cu
    │   └── xorwow_init_and_generate_thread_api.cu
    └── cuSolverDx
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── blocked_potrf.cu
    │   ├── common.hpp
    │   ├── common
    │       ├── cudart.hpp
    │       ├── cusolver_reference_cholesky.hpp
    │       ├── cusolver_reference_lu.hpp
    │       ├── device_io.hpp
    │       ├── error_checking.cpp
    │       ├── error_checking.hpp
    │       ├── example_sm_runner.hpp
    │       ├── macros.hpp
    │       ├── measure.hpp
    │       ├── numeric.hpp
    │       ├── print.hpp
    │       └── random.hpp
    │   ├── gesv_batched_wo_pivot.cu
    │   ├── gesv_partial_pivot.cu
    │   ├── getrf_partial_pivot.cu
    │   ├── getrf_wo_pivot.cu
    │   ├── nvrtc_helper.hpp
    │   ├── nvrtc_potrs.cpp
    │   ├── posv_batched.cu
    │   ├── potrf_runtime_ld.cu
    │   └── simple_potrf.cu
├── NPP+
    ├── README.md
    ├── batchedLabelMarkersAndCompression
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── batchedLabelMarkersAndCompression.h
    │   ├── batchedLabelMarkersAndCompressionNPPPlus.cpp
    │   ├── dirent.h
    │   └── images
    │   │   ├── CT_skull_512x512_8u.raw
    │   │   ├── CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw
    │   │   ├── CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw
    │   │   ├── CT_skull_LabelMarkersUF_8Way_512x512_32u.raw
    │   │   ├── Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw
    │   │   ├── Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw
    │   │   ├── Lena_LabelMarkersUF_8Way_512x512_32u.raw
    │   │   ├── PCB2_1024x683_8u.raw
    │   │   ├── PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw
    │   │   ├── PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw
    │   │   ├── PCB2_LabelMarkersUF_8Way_1024x683_32u.raw
    │   │   ├── PCB_1280x720_8u.raw
    │   │   ├── PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw
    │   │   ├── PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw
    │   │   ├── PCB_LabelMarkersUF_8Way_1280x720_32u.raw
    │   │   ├── PCB_METAL_509x335_8u.raw
    │   │   ├── PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw
    │   │   ├── PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw
    │   │   ├── PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw
    │   │   └── lena_512x512_8u.raw
    ├── cannyEdgeDetectorPython
    │   ├── README.md
    │   ├── Teapot.jpg
    │   ├── Teapot_resolutions
    │   │   ├── out_npp_1280x720.png
    │   │   ├── out_npp_1920x1080.png
    │   │   ├── out_npp_2560x1440.png
    │   │   ├── out_npp_320x180.png
    │   │   ├── out_npp_3840x2160.png
    │   │   ├── out_npp_5120x2880.png
    │   │   ├── out_npp_640x360.png
    │   │   ├── out_npp_800x600.png
    │   │   └── performance_results.csv
    │   └── cannyEdgeDetector.py
    ├── distanceTransform
    │   ├── CMakeLists.txt
    │   ├── DistanceTransformTrue_Dolphin1_319x319_16u.jpg
    │   ├── README.md
    │   ├── dolphin1_Input_319x319_8u.jpg
    │   ├── images
    │   │   ├── Dolphin1_313x317_8u.raw
    │   │   └── TestImage3_diamond_64x64_8u.raw
    │   └── unsignedAndSignedDistanceTransformNPPPlus.cpp
    ├── findContour
    │   ├── CMakeLists.txt
    │   ├── CircuitBoard_2048x1024_8u.jpg
    │   ├── CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg
    │   ├── CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg
    │   ├── CircuitBoard_Contours_8Way_2048x1024_8u.jpg
    │   ├── CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg
    │   ├── README.md
    │   ├── contour_info.log
    │   ├── findContourNPPPlus.cpp
    │   └── images
    │   │   └── CircuitBoard_2048x1024_8u.raw
    ├── floodFill
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── floodFillVariousRegionTypesNPPPlus.cpp
    │   └── images
    │   │   ├── CT_skull_512x512_8u_Gray.raw
    │   │   ├── Corn_614x461_8u_Gray.raw
    │   │   ├── DistanceSampler_512x512_8u.raw
    │   │   ├── DistanceSampler_512x512_Inverted_8u.raw
    │   │   ├── RainbowChart_RGB_C3_1024x445_8u.raw
    │   │   ├── RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw
    │   │   ├── RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw
    │   │   ├── RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw
    │   │   ├── Rocks_512x512_8u_Gray.raw
    │   │   ├── SeabedSampler_RGB_C3_675x1024_8u.raw
    │   │   ├── SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw
    │   │   ├── SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw
    │   │   ├── SignedCircle_256x206_64f.raw
    │   │   ├── SignedCircle_256x206_Inverted_64f.raw
    │   │   └── SignedLith_554x554_32f.raw
    └── watershedSegmentation
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── images
    │       ├── CT_skull_512x512_8u_Gray.raw
    │       ├── CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw
    │       ├── CT_skull_SegmentBoundaries_8Way_512x512_8u.raw
    │       ├── CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw
    │       ├── CT_skull_Segments_8Way_512x512_8u.raw
    │       ├── Corn_614x461_8u_Gray.raw
    │       ├── Corn_CompressedSegmentLabels_8Way_614x461_32u.raw
    │       ├── Corn_SegmentBoundaries_8Way_614x461_8u.raw
    │       ├── Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw
    │       ├── Corn_Segments_8Way_614x461_8u.raw
    │       ├── DistanceSampler_512x512_8u.raw
    │       ├── DistanceSampler_512x512_Inverted_8u.raw
    │       ├── RainbowChart_RGB_C3_1024x445_8u.raw
    │       ├── Rocks_512x512_8u_Gray.raw
    │       ├── Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw
    │       ├── Rocks_SegmentBoundaries_8Way_512x512_8u.raw
    │       ├── Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw
    │       ├── Rocks_Segments_8Way_512x512_8u.raw
    │       ├── SeabedSampler_RGB_C3_675x1024_8u.raw
    │       ├── SignedCircle_256x206_64f.raw
    │       ├── SignedCircle_256x206_Inverted_64f.raw
    │       └── SignedLith_554x554_32f.raw
    │   ├── watershedSegmentationNPPPlus.cpp
    │   └── watershedSegmentationNPPPlus.h
├── NPP
    ├── README.md
    ├── batchedLabelMarkersAndCompression
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── batchedLabelMarkersAndCompression.cpp
    │   ├── batchedLabelMarkersAndCompression.h
    │   ├── dirent.h
    │   └── images
    │   │   ├── CT_skull_512x512_8u.raw
    │   │   ├── CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw
    │   │   ├── CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw
    │   │   ├── CT_skull_LabelMarkersUF_8Way_512x512_32u.raw
    │   │   ├── Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw
    │   │   ├── Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw
    │   │   ├── Lena_LabelMarkersUF_8Way_512x512_32u.raw
    │   │   ├── PCB2_1024x683_8u.raw
    │   │   ├── PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw
    │   │   ├── PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw
    │   │   ├── PCB2_LabelMarkersUF_8Way_1024x683_32u.raw
    │   │   ├── PCB_1280x720_8u.raw
    │   │   ├── PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw
    │   │   ├── PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw
    │   │   ├── PCB_LabelMarkersUF_8Way_1280x720_32u.raw
    │   │   ├── PCB_METAL_509x335_8u.raw
    │   │   ├── PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw
    │   │   ├── PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw
    │   │   ├── PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw
    │   │   └── lena_512x512_8u.raw
    ├── distanceTransform
    │   ├── CMakeLists.txt
    │   ├── DistanceTransformTrue_Dolphin1_319x319_16u.jpg
    │   ├── README.md
    │   ├── distanceTransform.cpp
    │   ├── dolphin1_Input_319x319_8u.jpg
    │   └── images
    │   │   ├── Dolphin1_313x317_8u.raw
    │   │   └── TestImage3_diamond_64x64_8u.raw
    ├── findContour
    │   ├── CMakeLists.txt
    │   ├── CircuitBoard_2048x1024_8u.jpg
    │   ├── CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg
    │   ├── CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg
    │   ├── CircuitBoard_Contours_8Way_2048x1024_8u.jpg
    │   ├── CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg
    │   ├── README.md
    │   ├── contour_info.log
    │   ├── findContour.cpp
    │   └── images
    │   │   └── CircuitBoard_2048x1024_8u.raw
    └── watershedSegmentation
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── images
    │       ├── CT_skull_512x512_8u_Gray.raw
    │       ├── CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw
    │       ├── CT_skull_SegmentBoundaries_8Way_512x512_8u.raw
    │       ├── CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw
    │       ├── CT_skull_Segments_8Way_512x512_8u.raw
    │       ├── Lena_512x512_8u_Gray.raw
    │       ├── Lena_CompressedSegmentLabels_8Way_512x512_32u.raw
    │       ├── Lena_SegmentBoundaries_8Way_512x512_8u.raw
    │       ├── Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw
    │       ├── Lena_Segments_8Way_512x512_8u.raw
    │       ├── Rocks_512x512_8u_Gray.raw
    │       ├── Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw
    │       ├── Rocks_SegmentBoundaries_8Way_512x512_8u.raw
    │       ├── Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw
    │       ├── Rocks_Segments_8Way_512x512_8u.raw
    │       ├── coins_500x383_8u_Gray.raw
    │       └── coins_overlay_500x569_8u_Gray.raw
    │   ├── watershedSegmentation.cpp
    │   └── watershedSegmentation.h
├── README.md
├── cuBLAS
    ├── Emulation
    │   ├── bf16x9_gemmEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_GemmEx_example.cu
    │   └── bf16x9_sgemm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_sgemm_example.cu
    ├── Extensions
    │   ├── AxpyEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_AxpyEx_example.cu
    │   ├── Cherk3mEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_Cherk3mEx_example.cu
    │   ├── CherkEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_CherkEx_example.cu
    │   ├── Csyrk3mEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_Csyrk3mEx_example.cu
    │   ├── CsyrkEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_CsyrkEx_example.cu
    │   ├── DotEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   ├── cublas_DotEx_example.cu
    │   │   └── cublas_DotcEx_example.cu
    │   ├── GemmBatchedEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_GemmBatchedEx_example.cu
    │   ├── GemmEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_GemmEx_example.cu
    │   ├── GemmGroupedBatchedEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_GemmGroupedBatchedEx_example.cu
    │   ├── GemmStridedBatchedEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_GemmStridedBatchedEx_example.cu
    │   ├── Nrm2Ex
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_Nrm2Ex_example.cu
    │   ├── RotEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_RotEx_example.cu
    │   ├── ScalEx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_ScalEx_example.cu
    │   ├── dgmm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_dgmm_example.cu
    │   ├── geam
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_geam_example.cu
    │   ├── tpttr
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_tpttr_example.cu
    │   └── trttp
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trttp_example.cu
    ├── Level-1
    │   ├── amax
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_amax_example.cu
    │   ├── amin
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_amin_example.cu
    │   ├── asum
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_asum_example.cu
    │   ├── axpy
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_axpy_example.cu
    │   ├── copy
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_copy_example.cu
    │   ├── dot
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   ├── cublas_dot_example.cu
    │   │   └── cublas_dotc_example.cu
    │   ├── nrm2
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_nrm2_example.cu
    │   ├── rot
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_rot_example.cu
    │   ├── rotg
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_rotg_example.cu
    │   ├── rotm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_rotm_example.cu
    │   ├── rotmg
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_rotmg_example.cu
    │   ├── scal
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_scal_example.cu
    │   └── swap
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_swap_example.cu
    ├── Level-2
    │   ├── gbmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gbmv_example.cu
    │   ├── gemv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemv_example.cu
    │   ├── ger
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_ger_example.cu
    │   ├── hbmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hbmv_example.cu
    │   ├── hemv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hemv_example.cu
    │   ├── her
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_her_example.cu
    │   ├── her2
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_her2_example.cu
    │   ├── hpmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hpmv_example.cu
    │   ├── hpr
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hpr_example.cu
    │   ├── hpr2
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hpr2_example.cu
    │   ├── sbmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_sbmv_example.cu
    │   ├── spmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_spmv_example.cu
    │   ├── spr
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_spr_example.cu
    │   ├── spr2
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_spr2_example.cu
    │   ├── symv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_symv_example.cu
    │   ├── syr
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_syr_example.cu
    │   ├── syr2
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_syr2_example.cu
    │   ├── tbmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_tbmv_example.cu
    │   ├── tbsv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_tbsv_example.cu
    │   ├── tpmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_tpmv_example.cu
    │   ├── tpsv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_tpsv_example.cu
    │   ├── trmv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trmv_example.cu
    │   └── trsv
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trsv_example.cu
    ├── Level-3
    │   ├── gemm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemm_example.cu
    │   ├── gemm3m
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemm3m_example.cu
    │   ├── gemmBatched
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemmBatched_example.cu
    │   ├── gemmGroupedBatched
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemmGroupedBatched_example.cu
    │   ├── gemmStridedBatched
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_gemmStridedBatched_example.cu
    │   ├── hemm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_hemm_example.cu
    │   ├── her2k
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_her2k_example.cu
    │   ├── herk
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_herk_example.cu
    │   ├── herkx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_herkx_example.cu
    │   ├── symm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_symm_example.cu
    │   ├── syr2k
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_syr2k_example.cu
    │   ├── syrk
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_syrk_example.cu
    │   ├── syrkx
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_syrkx_example.cu
    │   ├── trmm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trmm_example.cu
    │   ├── trsm
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trsm_example.cu
    │   └── trsmBatched
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   └── cublas_trsmBatched_example.cu
    ├── README.md
    ├── cmake
    │   └── cublas_example.cmake
    └── utils
    │   └── cublas_utils.h
├── cuBLASLt
    ├── CMakeLists.txt
    ├── Common
    │   ├── LtMatmulCustomFind.h
    │   ├── helpers.cpp
    │   └── helpers.h
    ├── LtBlk128x128Fp8Matmul
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtBlk128x128Fp8Matmul.cu
    │   └── sample_cublasLt_LtBlk128x128Fp8Matmul.h
    ├── LtDgemmPresetAlgo
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtDgemmPresetAlgo.cu
    │   └── sample_cublasLt_LtDgemmPresetAlgo.h
    ├── LtFp8CustomFind
    │   ├── CMakeLists.txt
    │   └── main.cpp
    ├── LtFp8Matmul
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtFp8Matmul.cu
    │   └── sample_cublasLt_LtFp8Matmul.h
    ├── LtHSHgemmPointerArrayBatchSimple
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtHSHgemmPointerArrayBatchSimple.cu
    │   └── sample_cublasLt_LtHSHgemmPointerArrayBatchSimple.h
    ├── LtHSHgemmStridedBatchSimple
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtHSHgemmStridedBatchSimple.cu
    │   └── sample_cublasLt_LtHSHgemmStridedBatchSimple.h
    ├── LtIgemmTensor
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtIgemmTensor.cu
    │   └── sample_cublasLt_LtIgemmTensor.h
    ├── LtMxfp8Matmul
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtMxfp8Matmul.cu
    │   └── sample_cublasLt_LtMxfp8Matmul.h
    ├── LtNvfp4Matmul
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtNvfp4Matmul.cu
    │   └── sample_cublasLt_LtNvfp4Matmul.h
    ├── LtPlanarComplex
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtPlanarComplex.cu
    │   └── sample_cublasLt_LtPlanarComplex.h
    ├── LtSgemm
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtSgemm.cu
    │   └── sample_cublasLt_LtSgemm.h
    ├── LtSgemmCustomFind
    │   ├── CMakeLists.txt
    │   └── main.cpp
    ├── LtSgemmSimpleAutoTuning
    │   ├── CMakeLists.txt
    │   ├── main.cpp
    │   ├── sample_cublasLt_LtSgemmSimpleAutoTuning.cu
    │   └── sample_cublasLt_LtSgemmSimpleAutoTuning.h
    └── README.md
├── cuBLASMp
    ├── CMakeLists.txt
    ├── README.md
    ├── helpers.h
    ├── matrix_generator.hxx
    ├── pgeadd.cu
    ├── pgemm.cu
    ├── pmatmul.cu
    ├── pmatmul_ar.cu
    ├── psyrk.cu
    ├── ptradd.cu
    └── ptrsm.cu
├── cuDSS
    ├── README.md
    ├── get_set
    │   ├── CMakeLists.txt
    │   └── get_set.cpp
    ├── memory_handler
    │   ├── CMakeLists.txt
    │   └── memory_handler.cpp
    ├── simple
    │   ├── CMakeLists.txt
    │   └── simple.cpp
    ├── simple_batch
    │   ├── CMakeLists.txt
    │   └── simple_batch.cpp
    ├── simple_complex
    │   ├── CMakeLists.txt
    │   └── simple_complex.cpp
    ├── simple_hybrid_execution_mode
    │   ├── CMakeLists.txt
    │   └── simple_hybrid_execution_mode.cpp
    ├── simple_hybrid_memory_mode
    │   ├── CMakeLists.txt
    │   └── simple_hybrid_memory_mode.cpp
    ├── simple_mgmn_mode
    │   ├── CMakeLists.txt
    │   └── simple_mgmn_mode.cpp
    └── simple_multithreaded_mode
    │   ├── CMakeLists.txt
    │   └── simple_multithreaded_mode.cpp
├── cuFFT
    ├── 1d_c2c
    │   ├── .gitignore
    │   ├── 1d_c2c_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 1d_mgpu_c2c
    │   ├── .gitignore
    │   ├── 1d_mgpu_c2c_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 1d_r2c_c2r
    │   ├── .gitignore
    │   ├── 1d_r2c_c2r_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 2d_c2r_r2c
    │   ├── .gitignore
    │   ├── 2d_c2r_r2c_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 3d_c2c
    │   ├── .gitignore
    │   ├── 3d_c2c_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 3d_mgpu_c2c
    │   ├── .gitignore
    │   ├── 3d_mgpu_c2c_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── 3d_mgpu_r2c_c2r
    │   ├── .gitignore
    │   ├── 3d_mgpu_r2c_c2r_example.cpp
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   └── README.md
    ├── README.md
    ├── lto_callback_window_1d
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── src
    │   │   ├── common.cpp
    │   │   ├── common.h
    │   │   ├── nvrtc_helper.h
    │   │   ├── r2c_c2r_legacy_callback_example.cu
    │   │   ├── r2c_c2r_lto_callback_device.cu
    │   │   ├── r2c_c2r_lto_callback_example.cpp
    │   │   ├── r2c_c2r_lto_nvrtc_callback_example.cpp
    │   │   ├── r2c_c2r_reference.cu
    │   │   └── r2c_c2r_reference.h
    ├── lto_ea
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── src
    │   │   ├── common.cpp
    │   │   ├── common.h
    │   │   ├── nvrtc_helper.h
    │   │   ├── r2c_c2r_callback_example.cu
    │   │   ├── r2c_c2r_lto_callback_device.cu
    │   │   ├── r2c_c2r_lto_callback_example.cpp
    │   │   ├── r2c_c2r_lto_nvrtc_callback_example.cpp
    │   │   ├── r2c_c2r_reference.cu
    │   │   └── r2c_c2r_reference.h
    └── utils
    │   └── cufft_utils.h
├── cuFFTMp
    ├── Fortran_samples
    │   ├── Fortran_wrappers_nvhpc
    │   │   ├── cufft.mod
    │   │   ├── cufftxt.mod
    │   │   ├── libattachcommWrapper.a
    │   │   └── libnvhpcwrapcufftxt.a
    │   ├── c2c
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_c2c.f90
    │   ├── c2c_no_descriptors
    │   │   ├── Makefile
    │   │   └── cufftmp_c2c_no_descriptors.f90
    │   ├── c2c_pencils
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_c2c_pencils.f90
    │   ├── common.mk
    │   ├── r2c_c2r
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_r2c.f90
    │   ├── r2c_c2r_no_descriptors
    │   │   ├── Makefile
    │   │   └── cufftmp_r2c_c2r_no_descriptors.f90
    │   ├── r2c_c2r_pencils
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_r2c_c2r_pencils.f90
    │   ├── r2c_c2r_shared_scratch
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_r2c_workarea.f90
    │   └── reshape
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   └── cufftmp_reshape.f90
    ├── JAX_FFT
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── misc
    │   │   ├── strong.png
    │   │   ├── strong_eos.png
    │   │   └── weak.png
    │   ├── pyproject.toml
    │   ├── setup.py
    │   ├── src
    │   │   ├── cufftmp_jax
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── NOTICE
    │   │   │   ├── __init__.py
    │   │   │   ├── cufftmp_jax.py
    │   │   │   └── src
    │   │   │   │   ├── gpu_ops.cpp
    │   │   │   │   ├── kernel_helpers.h
    │   │   │   │   ├── kernels.cu
    │   │   │   │   ├── kernels.h
    │   │   │   │   └── pybind11_kernel_helpers.h
    │   │   ├── fft_common
    │   │   │   ├── __init__.py
    │   │   │   └── utils.py
    │   │   └── xfft
    │   │   │   ├── __init__.py
    │   │   │   └── xfft.py
    │   └── tests
    │   │   ├── fft_test.py
    │   │   └── helpers.py
    ├── README.md
    ├── extra_bootstraps
    │   ├── Makefile
    │   └── README.md
    └── samples
    │   ├── c2c
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_c2c.cu
    │   ├── c2c_no_descriptors
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_c2c_no_descriptors.cu
    │   ├── c2c_no_descriptors_cufftMpMakePlan
    │       ├── README.md
    │       └── cufftmp_c2c_no_descriptors_cufftMpMakePlan.cu
    │   ├── c2c_pencils
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_c2c_pencils.cu
    │   ├── common.mk
    │   ├── common
    │       ├── README.md
    │       ├── error_checks.hpp
    │       ├── generate_random.hpp
    │       └── scaling.cuh
    │   ├── iterators
    │       └── box_iterator.hpp
    │   ├── r2c_c2r
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_r2c_c2r.cu
    │   ├── r2c_c2r_no_descriptors
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_r2c_c2r_no_descriptors.cu
    │   ├── r2c_c2r_pencils
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_r2c_c2r_pencils.cu
    │   ├── r2c_c2r_pencils_cufftMpMakePlan
    │       ├── README.md
    │       └── cufftmp_r2c_c2r_pencils_cufftMpMakePlan.cu
    │   ├── r2c_c2r_shared_scratch
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_r2c_c2r_shared_scratch.cu
    │   ├── r2c_c2r_slabs_GROMACS
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_r2c_c2r_slabs_GROMACS.cu
    │   └── reshape
    │       ├── Makefile
    │       ├── README.md
    │       └── cufftmp_reshape.cu
├── cuPQC
    ├── Makefile
    ├── README.md
    ├── example_ml_dsa.cu
    ├── example_ml_kem.cu
    ├── example_sha2.cu
    └── example_sha3.cu
├── cuRAND
    ├── .gitignore
    ├── Host
    │   ├── mrg32k3a
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_mrg32k3a_lognormal_example.cpp
    │   │   ├── curand_mrg32k3a_normal_example.cpp
    │   │   ├── curand_mrg32k3a_poisson_example.cpp
    │   │   └── curand_mrg32k3a_uniform_example.cpp
    │   ├── mt19937
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_mt19937_lognormal_example.cpp
    │   │   ├── curand_mt19937_normal_example.cpp
    │   │   ├── curand_mt19937_poisson_example.cpp
    │   │   └── curand_mt19937_uniform_example.cpp
    │   ├── mtgp32
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_mtgp32_lognormal_example.cpp
    │   │   ├── curand_mtgp32_normal_example.cpp
    │   │   ├── curand_mtgp32_poisson_example.cpp
    │   │   └── curand_mtgp32_uniform_example.cpp
    │   ├── philox
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_philox_lognormal_example.cpp
    │   │   ├── curand_philox_normal_example.cpp
    │   │   ├── curand_philox_poisson_example.cpp
    │   │   └── curand_philox_uniform_example.cpp
    │   ├── scrambled_sobol32
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_scrambled_sobol32_lognormal_example.cpp
    │   │   ├── curand_scrambled_sobol32_normal_example.cpp
    │   │   ├── curand_scrambled_sobol32_poisson_example.cpp
    │   │   └── curand_scrambled_sobol32_uniform_example.cpp
    │   ├── scrambled_sobol64
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_scrambled_sobol64_lognormal_example.cpp
    │   │   ├── curand_scrambled_sobol64_normal_example.cpp
    │   │   ├── curand_scrambled_sobol64_poisson_example.cpp
    │   │   └── curand_scrambled_sobol64_uniform_example.cpp
    │   ├── sobol32
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_sobol32_lognormal_example.cpp
    │   │   ├── curand_sobol32_normal_example.cpp
    │   │   ├── curand_sobol32_poisson_example.cpp
    │   │   └── curand_sobol32_uniform_example.cpp
    │   ├── sobol64
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_sobol64_lognormal_example.cpp
    │   │   ├── curand_sobol64_normal_example.cpp
    │   │   ├── curand_sobol64_poisson_example.cpp
    │   │   └── curand_sobol64_uniform_example.cpp
    │   └── xorwow
    │   │   ├── .gitignore
    │   │   ├── CMakeLists.txt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── curand_xorwow_lognormal_example.cpp
    │   │   ├── curand_xorwow_normal_example.cpp
    │   │   ├── curand_xorwow_poisson_example.cpp
    │   │   └── curand_xorwow_uniform_example.cpp
    ├── README.md
    ├── cmake
    │   └── curand_example.cmake
    └── utils
    │   └── curand_utils.h
├── cuSOLVER
    ├── MgGetrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_MgGetrf_example.cu
    ├── MgPotrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cusolver_MgPotrf_example1.cu
    │   └── cusolver_MgPotrf_example2.cu
    ├── MgSyevd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cusolver_MgSyevd_example1.cu
    │   ├── cusolver_MgSyevd_example2.cu
    │   └── cusolver_MgSyevd_example3.cu
    ├── README.md
    ├── Xgeev
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cusolver_Xgeev_example1.cu
    │   ├── cusolver_Xgeev_example2.cu
    │   └── cusolver_Xgeev_example3.cu
    ├── Xgeqrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xgeqrf_example.cu
    ├── Xgesvd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xgesvd_example.cu
    ├── Xgesvdp
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xgesvdp_example.cu
    ├── Xgesvdr
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xgesvdr_example.cu
    ├── Xgetrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xgetrf_example.cu
    ├── Xpotrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xpotrf_example.cu
    ├── Xsyevd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xsyevd_example.cu
    ├── Xsyevdx
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xsyevdx_example.cu
    ├── Xtrtri
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_Xtrtri_example.cu
    ├── cmake
    │   └── cusolver_example.cmake
    ├── csrqr
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cusolver_csrqr_example1.cu
    │   └── cusolver_csrqr_example2.cu
    ├── gesv
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cusolver_irs_expert_cuda-10.2.cu
    │   ├── cusolver_irs_expert_cuda-11.cu
    │   └── cusolver_irs_lapack.cu
    ├── gesvd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_gesvd_example.cu
    ├── gesvdaStridedBatched
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_gesvdaStridedBatched_example.cu
    ├── gesvdj
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_gesvdj_example.cu
    ├── gesvdjBatched
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_gesvdjBatched_example.cu
    ├── getrf
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_getrf_example.cu
    ├── orgqr
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_orgqr_example.cu
    ├── ormqr
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_ormqr_example.cu
    ├── potrfBatched
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_potrfBatched_example.cu
    ├── syevd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_syevd_example.cu
    ├── syevdx
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_syevdx_example.cu
    ├── syevj
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_syevj_example.cu
    ├── syevjBatched
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_syevjBatched_example.cu
    ├── sygvd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_sygvd_example.cu
    ├── sygvdx
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_sygvdx_example.cu
    ├── sygvj
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── cusolver_sygvj_example.cu
    └── utils
    │   ├── cusolverMg_utils.h
    │   └── cusolver_utils.h
├── cuSOLVERMp
    ├── .gitignore
    ├── CMakeLists.txt
    ├── Makefile
    ├── README.md
    ├── helpers.h
    ├── mp_gels.c
    ├── mp_geqrf.c
    ├── mp_getrf_getrs.c
    ├── mp_potrf_potrs.c
    ├── mp_syevd.c
    └── mp_sygvd.c
├── cuSOLVERSp2cuDSS
    ├── CMakeLists.txt
    ├── README.md
    ├── csreigvsi2cuDSS_double.cpp
    ├── cuSolverRf2cuDSS.hpp
    ├── cuSolverRf2cuDSS_double.cpp
    ├── cuSolverSp2cuDSS.hpp
    ├── cuSolverSp2cuDSS_dcomplex.cpp
    ├── cuSolverSp2cuDSS_double.cpp
    ├── cuSolverSp2cuDSS_float.cpp
    ├── cuSolverSp2cuDSS_scomplex.cpp
    ├── test_complex.mtx
    ├── test_real.mtx
    └── utils.hpp
├── cuSPARSE
    ├── README.md
    ├── axpby
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── axpby.png
    │   └── axpby_example.c
    ├── bicgstab
    │   ├── BiCGStab.pdf
    │   ├── BiCGStab.png
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── bicgstab_example.c
    ├── cg
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── cg.pdf
    │   ├── cg.png
    │   └── cg_example.c
    ├── compression
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── compression_example.cpp
    ├── coosort
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── coosort_example.c
    ├── dense2sparse_blockedell
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── dense2sparse_blockedell.png
    │   └── dense2sparse_blockedell_example.c
    ├── dense2sparse_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── dense2sparse_csr.png
    │   └── dense2sparse_csr_example.c
    ├── gather
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── gather.png
    │   └── gather_example.c
    ├── gpsvInterleavedBatch
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── gpsvInterleavedBatch_example.c
    ├── graph_capture
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── graph_capture_example.c
    ├── rot
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── rot.png
    │   └── rot_example.c
    ├── scatter
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── scatter.png
    │   └── scatter_example.c
    ├── sddmm_bsr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── sddmm_bsr.png
    │   └── sddmm_bsr_example.c
    ├── sddmm_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── sddmm_csr.png
    │   └── sddmm_csr_example.c
    ├── sddmm_csr_batched
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── sddmm_csr.png
    │   └── sddmm_csr_batched_example.c
    ├── sparse2dense_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── sparse2dense_csr.png
    │   └── sparse2dense_csr_example.c
    ├── spgemm
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spgemm.png
    │   └── spgemm_example.c
    ├── spgemm_mem
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spgemm.png
    │   └── spgemm_mem_example.c
    ├── spgemm_reuse
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spgemm.png
    │   └── spgemm_reuse_example.c
    ├── spmm_blockedell
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_blockedell.png
    │   └── spmm_blockedell_example.cpp
    ├── spmm_coo
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_coo.png
    │   └── spmm_coo_example.c
    ├── spmm_coo_batched
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_coo_batched.png
    │   └── spmm_coo_batched_example.c
    ├── spmm_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_csr.png
    │   └── spmm_csr_example.c
    ├── spmm_csr_batched
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_csr_batched.png
    │   └── spmm_csr_batched_example.c
    ├── spmm_csr_op
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmm_csr_op.png
    │   └── spmm_csr_op_example.c
    ├── spmv_coo
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmv_coo.png
    │   └── spmv_coo_example.c
    ├── spmv_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmv_csr.png
    │   └── spmv_csr_example.c
    ├── spmv_sell
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spmv_sell.png
    │   └── spmv_sell_example.c
    ├── spsm_coo
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spsm_coo.png
    │   └── spsm_coo_example.c
    ├── spsm_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spsm_csr.png
    │   └── spsm_csr_example.c
    ├── spsv_coo
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spsv_coo.png
    │   └── spsv_coo_example.c
    ├── spsv_csr
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spsv_csr.png
    │   └── spsv_csr_example.c
    ├── spsv_sell
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spsv_sell.png
    │   └── spsv_sell_example.c
    └── spvv
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   ├── spvv.png
    │   └── spvv_example.c
├── cuSPARSELt
    ├── README.md
    ├── matmul
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── matmul_example.cpp
    └── matmul_advanced
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── README.md
    │   └── matmul_advanced_example.cpp
├── cuTENSOR
    ├── CMakeLists.txt
    ├── Makefile
    ├── README.md
    ├── contraction.cu
    ├── contraction_jit.cu
    ├── contraction_plan_cache.cu
    ├── einsum.cu
    ├── elementwise_binary.cu
    ├── elementwise_permute.cu
    ├── elementwise_permute_padding.cu
    ├── elementwise_trinary.cu
    ├── python
    │   ├── MANIFEST.in
    │   ├── README.md
    │   ├── cutensor
    │   │   ├── __init__.py
    │   │   ├── c_extensions.py
    │   │   ├── c_extensions_utils.py
    │   │   ├── common.py
    │   │   ├── package_info.py
    │   │   ├── tensorflow
    │   │   │   ├── __init__.py
    │   │   │   ├── einsum.py
    │   │   │   ├── einsum_kernel.cc
    │   │   │   ├── einsum_module.cc
    │   │   │   ├── einsum_ops.cc
    │   │   │   └── einsum_test.py
    │   │   └── torch
    │   │   │   ├── __init__.py
    │   │   │   ├── einsum.cc
    │   │   │   ├── einsum.py
    │   │   │   └── einsum_test.py
    │   ├── einsum.h
    │   └── setup.py
    └── reduction.cu
├── cuTENSORMg
    ├── CMakeLists.txt
    ├── blog_post.cu
    └── contraction_multi_gpu.cu
├── nvCOMP
    ├── README.md
    ├── benchmarks
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── benchmark_ans_chunked.cu
    │   ├── benchmark_bitcomp_chunked.cu
    │   ├── benchmark_cascaded_chunked.cu
    │   ├── benchmark_common.h
    │   ├── benchmark_deflate_chunked.cu
    │   ├── benchmark_gdeflate_chunked.cu
    │   ├── benchmark_hlif.cpp
    │   ├── benchmark_hlif.hpp
    │   ├── benchmark_lz4_chunked.cu
    │   ├── benchmark_snappy_chunked.cu
    │   ├── benchmark_template_chunked.cuh
    │   ├── benchmark_zstd_chunked.cu
    │   └── text_to_binary.py
    └── examples
    │   ├── BatchData.h
    │   ├── BatchDataCPU.h
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── deflate_cpu_compression.cu
    │   ├── deflate_cpu_decompression.cu
    │   ├── gdeflate_cpu_compression.cu
    │   ├── gdeflate_cpu_decompression.cu
    │   ├── gzip_gpu_decompression.cu
    │   ├── high_level_quickstart_example.cpp
    │   ├── low_level_quickstart_example.cpp
    │   ├── lz4_cpu_compression.cu
    │   ├── lz4_cpu_decompression.cu
    │   ├── nvcomp_gds.cu
    │   ├── python
    │       └── nvcomp_basic.ipynb
    │   ├── snappy_cpu_compression.cu
    │   ├── snappy_cpu_decompression.cu
    │   ├── util.h
    │   ├── zstd_cpu_compression.cu
    │   └── zstd_cpu_decompression.cu
├── nvJPEG
    ├── Image-Resize-WaterMark
    │   ├── CMakeLists.txt
    │   ├── NVLogo.jpg
    │   ├── NVLogo.png
    │   ├── README.md
    │   ├── imageResizeWatermark.cpp
    │   ├── imageResizeWatermark.h
    │   ├── img9.png
    │   ├── img9wm.png
    │   └── input_images
    │   │   ├── cat.jpg
    │   │   ├── cat_baseline.jpg
    │   │   ├── cat_grayscale.jpg
    │   │   ├── img1.jpg
    │   │   ├── img2.jpg
    │   │   ├── img3.jpg
    │   │   ├── img4.jpg
    │   │   ├── img5.jpg
    │   │   ├── img6.jpg
    │   │   ├── img7.jpg
    │   │   ├── img8.jpg
    │   │   └── img9.jpg
    ├── Image-Resize
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── imageResize.cpp
    │   ├── imageResize.h
    │   └── input_images
    │   │   ├── cat.jpg
    │   │   ├── cat_baseline.jpg
    │   │   ├── cat_grayscale.jpg
    │   │   ├── img1.jpg
    │   │   ├── img2.jpg
    │   │   ├── img3.jpg
    │   │   ├── img4.jpg
    │   │   ├── img5.jpg
    │   │   ├── img6.jpg
    │   │   ├── img7.jpg
    │   │   ├── img8.jpg
    │   │   └── img9.jpg
    ├── README.md
    ├── nvJPEG-Decoder-Backend-ROI
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── img9_roi.png
    │   ├── input_images
    │   │   ├── cat.jpg
    │   │   ├── cat_baseline.jpg
    │   │   ├── cat_grayscale.jpg
    │   │   ├── img1.jpg
    │   │   ├── img2.jpg
    │   │   ├── img3.jpg
    │   │   ├── img4.jpg
    │   │   ├── img5.jpg
    │   │   ├── img6.jpg
    │   │   ├── img7.jpg
    │   │   ├── img8.jpg
    │   │   └── img9.jpg
    │   ├── nvJPEGROIDecode.cpp
    │   ├── nvJPEGROIDecode.h
    │   └── threadpool.h
    ├── nvJPEG-Decoder-MultipleInstances
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── nvJPEGDecMultipleInstances.cpp
    │   ├── nvJPEGDecMultipleInstances.h
    │   └── threadpool.h
    ├── nvJPEG-Decoder
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── input_images
    │   │   ├── cat.jpg
    │   │   ├── cat_baseline.jpg
    │   │   ├── cat_grayscale.jpg
    │   │   ├── img1.jpg
    │   │   ├── img2.jpg
    │   │   ├── img3.jpg
    │   │   ├── img4.jpg
    │   │   ├── img5.jpg
    │   │   ├── img6.jpg
    │   │   ├── img7.jpg
    │   │   ├── img8.jpg
    │   │   └── img9.jpg
    │   ├── nvjpegDecoder.cpp
    │   └── nvjpegDecoder.h
    └── nvJPEG-Encoder-MultipleInstances
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── nvJPEGEncMultipleInstances.cpp
├── nvJPEG2000
    ├── README.md
    ├── nvJPEG2000-Decoder-Pipelined
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── nvjpeg2k_dec_pipelined.cpp
    │   └── nvjpeg2k_dec_pipelined.h
    ├── nvJPEG2000-Decoder-Tile-Partial
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── nvj2k_DecodeTilePartial.cpp
    │   └── nvj2k_DecodeTilePartial.h
    ├── nvJPEG2000-Decoder
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── images
    │   │   ├── 2k_image_lossless
    │   │   │   └── 2k_lossless.jp2
    │   │   ├── 2k_image_lossy
    │   │   │   └── 2k_lossy.jp2
    │   │   └── 4k_image_lossy
    │   │   │   └── 4k_lossy.jp2
    │   ├── nvjpeg2000DecodeSample.cpp
    │   └── nvjpeg2000DecodeSample.h
    └── nvJPEG2000-Encoder
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── images
    │       └── TestImage640x480.bmp
    │   ├── nvjpeg2k_encode.cpp
    │   └── nvjpeg2k_encode.h
└── nvTIFF
    ├── README.md
    ├── nvTIFF-Decode-Encode
        ├── CMakeLists.txt
        ├── README.md
        ├── getopt.h
        ├── images
        │   └── bali_notiles.tif
        └── nvtiff_example.cpp
    ├── nvTIFF-Decode-Image-ROI
        ├── CMakeLists.txt
        ├── README.md
        └── nvtiff_decode_image_roi.cpp
    └── nvTIFF-GeoTIFF-Decode
        ├── CMakeLists.txt
        ├── README.md
        ├── getopt.h
        ├── images
            └── bali_notiles.tif
        └── nvtiff_geotiff_decode.cpp


/LICENSE.TXT:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2022 NVIDIA CORPORATION AND AFFILIATES.  All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification, are permitted
 4 | provided that the following conditions are met:
 5 |     * Redistributions of source code must retain the above copyright notice, this list of
 6 |       conditions and the following disclaimer.
 7 |     * Redistributions in binary form must reproduce the above copyright notice, this list of
 8 |       conditions and the following disclaimer in the documentation and/or other materials
 9 |       provided with the distribution.
10 |     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
11 |       to endorse or promote products derived from this software without specific prior written
12 |       permission.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
15 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
18 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
19 | OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
20 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
21 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 | 


--------------------------------------------------------------------------------
/MathDx/cuBLASDx/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/MathDx/cuBLASDx/reference/naive_reference.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP
 2 | #define CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP
 3 | 
 4 | #include <type_traits>
 5 | #include "../common.hpp"
 6 | 
 7 | namespace example {
 8 |     template<typename ValueType> // ValueType types alpha/beta and parameterizes the A, B, C device_vector arguments
 9 |     void reference_gemm_naive_device(const unsigned int                 m, // declaration only: no function body appears in this header
10 |                                      const unsigned int                 n,
11 |                                      const unsigned int                 k,
12 |                                      const ValueType                    alpha,
13 |                                      example::device_vector<ValueType>& A,
14 |                                      const unsigned int                 lda, // presumably the leading dimension of A - TODO confirm
15 |                                      cublasdx::arrangement              arr_a, // each matrix gets its own arrangement argument
16 |                                      example::device_vector<ValueType>& B,
17 |                                      const unsigned int                 ldb,
18 |                                      cublasdx::arrangement              arr_b,
19 |                                      const ValueType                    beta,
20 |                                      example::device_vector<ValueType>& C, // NOTE(review): presumably receives alpha*A*B + beta*C - confirm against the definition
21 |                                      const unsigned int                 ldc,
22 |                                      cublasdx::arrangement              arr_c);
23 | } // namespace example
24 | 
25 | #endif // CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP
26 | 


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/01_simple_fft_thread/02_simple_fft_thread_lto_cases.csv:
--------------------------------------------------------------------------------
1 | exec_op,size,type,direction,precision
2 | Thread,8,fft_type::c2c,fft_direction::forward,double


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/02_simple_fft_block/10_simple_fft_block_c2r_lto_cases.csv:
--------------------------------------------------------------------------------
1 | exec_op,size,type,precision,elements_per_thread,real_mode
2 | Block,128,fft_type::c2r,float,8,real_mode::normal


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/03_block_fft_performance/00_block_fft_performance.cu:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <vector>
 3 | #include <chrono>
 4 | #include <cmath>
 5 | 
 6 | #include "block_fft_performance.hpp"
 7 | 
 8 | template<unsigned int Arch> // Arch is forwarded to cufftdx::SM<Arch> in fft_base
 9 | void block_fft_performance() { // benchmarks one fixed block-FFT configuration on a stream created and destroyed here
10 |     using namespace cufftdx;
11 | 
12 |     using fft_base = decltype(Block() + Type<fft_type::c2c>() + Direction<fft_direction::forward>() + // block-level, c2c, forward
13 |                               Precision<float>() + SM<Arch>()); // float precision, architecture Arch
14 | 
15 |     static constexpr unsigned int elements_per_thread = 8;
16 |     static constexpr unsigned int fft_size            = 512;
17 |     static constexpr unsigned int ffts_per_block      = 1;
18 | 
19 |     cudaStream_t stream;
20 |     CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); // explicit ';' so this statement does not depend on how the macro expands
21 |     benchmark_block_fft<fft_base, fft_size, elements_per_thread, ffts_per_block>(stream, true); // NOTE(review): meaning of 'true' is set by benchmark_block_fft, not visible here
22 |     CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); // destroys the stream created above
23 | }
24 | 
25 | template<unsigned int Arch> // Arch is passed straight through to block_fft_performance<Arch>
26 | struct block_fft_performance_functor { // class-template wrapper so the benchmark can be given to example::sm_runner as a template argument
27 |     void operator()() { return block_fft_performance<Arch>(); }
28 | };
29 | 
30 | int main(int, char**) { // command-line arguments are unnamed and unused
31 |     return example::sm_runner<block_fft_performance_functor>(); // the runner's return value becomes the process exit status
32 | }
33 | 


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/03_block_fft_performance/02_block_fft_lto_ptx_performance_cases.csv:
--------------------------------------------------------------------------------
 1 | size,direction,type,precision,exec_op
 2 | 16,fft_direction::forward,fft_type::c2c,float,Block
 3 | 32,fft_direction::forward,fft_type::c2c,float,Block
 4 | 64,fft_direction::forward,fft_type::c2c,float,Block
 5 | 128,fft_direction::forward,fft_type::c2c,float,Block
 6 | 256,fft_direction::forward,fft_type::c2c,float,Block
 7 | 512,fft_direction::forward,fft_type::c2c,float,Block
 8 | 1024,fft_direction::forward,fft_type::c2c,float,Block
 9 | 2048,fft_direction::forward,fft_type::c2c,float,Block
10 | 4096,fft_direction::forward,fft_type::c2c,float,Block
11 | 8192,fft_direction::forward,fft_type::c2c,float,Block
12 | 16384,fft_direction::forward,fft_type::c2c,float,Block
13 | 544,fft_direction::forward,fft_type::c2c,float,Block
14 | 608,fft_direction::forward,fft_type::c2c,float,Block
15 | 675,fft_direction::forward,fft_type::c2c,float,Block
16 | 686,fft_direction::forward,fft_type::c2c,float,Block
17 | 800,fft_direction::forward,fft_type::c2c,float,Block
18 | 


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/09_introduction_lto_example/00_introduction_lto_cases.csv:
--------------------------------------------------------------------------------
1 | size,direction,exec_op
2 | 128,fft_direction::forward,Block


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/10_cufft_device_api_example/cufft_device_api_lto_helper/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.18)
 2 | 
 3 | # Define project
 4 | project(cufft_device_api_lto_helper LANGUAGES CXX)
 5 | 
 6 | set(CMAKE_CXX_STANDARD 17)
 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 8 | 
 9 | # Possible variables that can be set externally
10 | # cufft_ROOT: points to the cuFFT library root directory
11 | 
12 | find_package(cufft 11.5.0 EXACT REQUIRED CONFIG # EXACT: only version 11.5.0 is accepted
13 |     PATHS # extra locations searched for the cufft package configuration
14 |         "${PROJECT_SOURCE_DIR}/../../../../../cufft"
15 |         "/opt/cufft"
16 | )
17 | 
18 | # Define the helper executable
19 | add_executable(cufft_device_api_lto_helper cufft_device_api_lto_helper.cpp)
20 | target_link_libraries(cufft_device_api_lto_helper
21 |     PRIVATE # an executable has no link consumers; same keyword as lto_helper/CMakeLists.txt
22 |         cufft::cufft_static
23 | )
24 | 


--------------------------------------------------------------------------------
/MathDx/cuFFTDx/lto_helper/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.18)
 2 | 
 3 | # Define project
 4 | project(lto_helper LANGUAGES CXX)
 5 | 
 6 | set(CMAKE_CXX_STANDARD 17)
 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 8 | 
 9 | # Possible variables that can be set externally
10 | # cufft_ROOT: points to the cuFFT library root directory
11 | 
12 | find_package(cufft 11.5.0 EXACT REQUIRED CONFIG # EXACT: only version 11.5.0 is accepted
13 |     PATHS # extra locations searched for the cufft package configuration
14 |         "${PROJECT_SOURCE_DIR}/../../../../cufft"
15 |         "/opt/cufft"
16 | )
17 | 
18 | # Define the helper executable
19 | add_executable(cufftdx_cufft_lto_helper cufftdx_cufft_lto_helper.cpp)
20 | target_link_libraries(cufftdx_cufft_lto_helper
21 |     PRIVATE
22 |         cufft::cufft_static # links the target named cufft_static from the cufft package
23 | )
24 | 


--------------------------------------------------------------------------------
/MathDx/cuSolverDx/common.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CUSOLVERDX_EXAMPLE_COMMON_HPP_
 2 | #define CUSOLVERDX_EXAMPLE_COMMON_HPP_
 3 | 
 4 | #include <type_traits>
 5 | #include <vector>
 6 | #include <random>
 7 | 
 8 | #ifndef CUSOLVERDX_EXAMPLE_NVRTC
 9 | #    include <cuda/std/complex>
10 | #    include <cusolverdx.hpp>
11 | #endif
12 | 
13 | #include "common/macros.hpp"
14 | #include "common/cudart.hpp"
15 | #include "common/error_checking.hpp"
16 | #include "common/measure.hpp"
17 | #include "common/numeric.hpp"
18 | #include "common/random.hpp"
19 | #include "common/example_sm_runner.hpp"
20 | #include "common/device_io.hpp"
21 | #include "common/print.hpp"
22 | #include "common/cusolver_reference_cholesky.hpp"
23 | #include "common/cusolver_reference_lu.hpp"
24 | 
25 | // Workaround for an nvcc bug in CUDA 12.2-12.4 (fixed in 12.5). NOTE(review): the check below still includes 12.5 - confirm.
26 | #ifdef __NVCC__
27 | #    if (__CUDACC_VER_MAJOR__ == 12 && (__CUDACC_VER_MINOR__ >= 2 && __CUDACC_VER_MINOR__ <= 5))
28 | #        define CUSOLVERDX_EXAMPLE_DETAIL_NVCC_12_2_BUG_WORKAROUND 1
29 | #    endif
30 | #endif
31 | 
32 | namespace example {
33 |     // Alias helpers, used when CUSOLVERDX_EXAMPLE_DETAIL_NVCC_12_2_BUG_WORKAROUND is defined
34 |     template<typename T>
35 |     using a_data_type_t = typename T::a_data_type; // shorthand for the nested type T::a_data_type
36 | 
37 |     template<typename T>
38 |     using a_cuda_data_type_t = typename T::a_cuda_data_type; // shorthand for the nested type T::a_cuda_data_type
39 | } // namespace example
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/MathDx/cuSolverDx/common/error_checking.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP
 2 | #define CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP
 3 | 
 4 | #include <cmath>
 5 | #include <iostream>
 6 | 
 7 | #include <type_traits>
 8 | 
 9 | #include <cusolverdx.hpp>
10 | 
11 | #include "numeric.hpp"
12 | 
13 | namespace common {
14 |     // Declaration only; the definition is not in this header. NOTE(review): presumably an error measure of data vs. reference - confirm.
15 |     template<typename ResultType, typename ReferenceType>
16 |     double check_error(const ResultType* data, const ReferenceType* reference, const std::size_t n, bool print = false, bool verbose = false);
17 | 
18 |     // Returns true when tot_rel_err is finite and does not exceed the tolerance selected by T:
19 |     // 1e-3 when T is float, double, or cusolverdx::complex of either; 1e-2 for every other T.
20 |     // A rejected error value is printed to std::cout before false is returned.
21 |     template<typename T>
22 |     bool is_error_acceptable(double tot_rel_err) {
23 |         // All four comparisons must be joined with &&: the previous || between the real and
24 |         // complex groups was true for every T, which made the 1e-3 tolerance unreachable.
25 |         constexpr bool is_non_float_non_double_a_b_c =
26 |             !std::is_same_v<T, float> && !std::is_same_v<T, double> &&
27 |             !std::is_same_v<T, cusolverdx::complex<float>> && !std::is_same_v<T, cusolverdx::complex<double>>;
28 | 
29 |         const double tolerance = is_non_float_non_double_a_b_c ? 1e-2 : 1e-3;
30 |         if (tot_rel_err > tolerance) {
31 |             std::cout << tot_rel_err << std::endl;
32 |             return false;
33 |         }
34 |         return std::isfinite(tot_rel_err); // NaN never compares greater, so it (and -inf) is rejected here
35 |     }
36 | } // namespace common
37 | 
38 | 
39 | #endif // CUSOLVERDX_EXAMPLE_COMMON_ERROR_CHECKING_HPP
40 | 


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP+/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot.jpg


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1280x720.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1280x720.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1920x1080.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_1920x1080.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_2560x1440.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_2560x1440.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_320x180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_320x180.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_3840x2160.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_3840x2160.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_5120x2880.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_5120x2880.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_640x360.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_640x360.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_800x600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/out_npp_800x600.png


--------------------------------------------------------------------------------
/NPP+/cannyEdgeDetectorPython/Teapot_resolutions/performance_results.csv:
--------------------------------------------------------------------------------
 1 | Resolution,Megapixels,NPP Time (ms)
 2 | 320x180,0.0576,0.04460153608769178
 3 | 640x360,0.2304,0.04880320030450821
 4 | 800x600,0.48,0.054283583376556636
 5 | 1280x720,0.9216,0.06511971176043153
 6 | 1920x1080,2.0736,0.1064842866435647
 7 | 2560x1440,3.6864,0.15593324881792067
 8 | 3840x2160,8.2944,0.30110825645923617
 9 | 5120x2880,14.7456,0.5023832985758782
10 | 


--------------------------------------------------------------------------------
/NPP+/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg


--------------------------------------------------------------------------------
/NPP+/distanceTransform/dolphin1_Input_319x319_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/dolphin1_Input_319x319_8u.jpg


--------------------------------------------------------------------------------
/NPP+/distanceTransform/images/Dolphin1_313x317_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/images/Dolphin1_313x317_8u.raw


--------------------------------------------------------------------------------
/NPP+/distanceTransform/images/TestImage3_diamond_64x64_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/distanceTransform/images/TestImage3_diamond_64x64_8u.raw


--------------------------------------------------------------------------------
/NPP+/findContour/CircuitBoard_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP+/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg


--------------------------------------------------------------------------------
/NPP+/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP+/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP+/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg


--------------------------------------------------------------------------------
/NPP+/findContour/images/CircuitBoard_2048x1024_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/findContour/images/CircuitBoard_2048x1024_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/CT_skull_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/CT_skull_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/Corn_614x461_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/Corn_614x461_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/DistanceSampler_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/DistanceSampler_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/DistanceSampler_512x512_Inverted_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/DistanceSampler_512x512_Inverted_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/RainbowChart_RGB_C3_1024x445_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_1024x445_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_1024x445_Dev_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_1024x445_Dev_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/RainbowChart_RGB_C3_Fill_8Way_Gradient_Boundary_1024x445_Dev_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/Rocks_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/Rocks_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SeabedSampler_RGB_C3_675x1024_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_675x1024_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_675x1024_Dev_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SeabedSampler_RGB_C3_Fill_8Way_Range_Boundary_675x1024_Dev_8u.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SignedCircle_256x206_64f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedCircle_256x206_64f.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SignedCircle_256x206_Inverted_64f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedCircle_256x206_Inverted_64f.raw


--------------------------------------------------------------------------------
/NPP+/floodFill/images/SignedLith_554x554_32f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/floodFill/images/SignedLith_554x554_32f.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Corn_614x461_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_614x461_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Corn_CompressedSegmentLabels_8Way_614x461_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_CompressedSegmentLabels_8Way_614x461_32u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Corn_SegmentBoundaries_8Way_614x461_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_SegmentBoundaries_8Way_614x461_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_SegmentsWithContrastingBoundaries_8Way_614x461_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Corn_Segments_8Way_614x461_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Corn_Segments_8Way_614x461_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/DistanceSampler_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/DistanceSampler_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/DistanceSampler_512x512_Inverted_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/DistanceSampler_512x512_Inverted_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/RainbowChart_RGB_C3_1024x445_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/RainbowChart_RGB_C3_1024x445_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/SeabedSampler_RGB_C3_675x1024_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SeabedSampler_RGB_C3_675x1024_8u.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/SignedCircle_256x206_64f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedCircle_256x206_64f.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/SignedCircle_256x206_Inverted_64f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedCircle_256x206_Inverted_64f.raw


--------------------------------------------------------------------------------
/NPP+/watershedSegmentation/images/SignedLith_554x554_32f.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP+/watershedSegmentation/images/SignedLith_554x554_32f.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/CT_skull_LabelMarkersUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/Lena_LabelMarkersUF_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_1024x683_8u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB2_LabelMarkersUF_8Way_1024x683_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_1280x720_8u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_LabelMarkersUF_8Way_1280x720_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_509x335_8u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw


--------------------------------------------------------------------------------
/NPP/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/batchedLabelMarkersAndCompression/images/lena_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/DistanceTransformTrue_Dolphin1_319x319_16u.jpg


--------------------------------------------------------------------------------
/NPP/distanceTransform/dolphin1_Input_319x319_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/dolphin1_Input_319x319_8u.jpg


--------------------------------------------------------------------------------
/NPP/distanceTransform/images/Dolphin1_313x317_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/images/Dolphin1_313x317_8u.raw


--------------------------------------------------------------------------------
/NPP/distanceTransform/images/TestImage3_diamond_64x64_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/distanceTransform/images/TestImage3_diamond_64x64_8u.raw


--------------------------------------------------------------------------------
/NPP/findContour/CircuitBoard_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_CompressedMarkerLabelsUF_8Way_2048x1024_32u.jpg


--------------------------------------------------------------------------------
/NPP/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_ContoursReconstructed_8Way_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_Contours_8Way_2048x1024_8u.jpg


--------------------------------------------------------------------------------
/NPP/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/CircuitBoard_LabelMarkersUF_8Way_2048x1024_32u.jpg


--------------------------------------------------------------------------------
/NPP/findContour/images/CircuitBoard_2048x1024_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/findContour/images/CircuitBoard_2048x1024_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_SegmentBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/CT_skull_Segments_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Lena_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Lena_CompressedSegmentLabels_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_CompressedSegmentLabels_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Lena_SegmentBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_SegmentBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Lena_Segments_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Lena_Segments_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_512x512_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_SegmentBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/Rocks_Segments_8Way_512x512_8u.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/coins_500x383_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/coins_500x383_8u_Gray.raw


--------------------------------------------------------------------------------
/NPP/watershedSegmentation/images/coins_overlay_500x569_8u_Gray.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/NPP/watershedSegmentation/images/coins_overlay_500x569_8u_Gray.raw


--------------------------------------------------------------------------------
/cuBLAS/Emulation/bf16x9_gemmEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Emulation/bf16x9_sgemm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/AxpyEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/Cherk3mEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/CherkEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/Csyrk3mEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/CsyrkEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/DotEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/GemmBatchedEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/GemmEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/GemmGroupedBatchedEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/GemmStridedBatchedEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/Nrm2Ex/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/RotEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/ScalEx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/dgmm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/geam/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/tpttr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/tpttr/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Extension APIs - `cublas<t>tpttr`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the use of the cuBLAS `tpttr` function to convert a matrix from the triangular packed format to the triangular format.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 |
 9 |     | 3.0 | 4.0 |
10 | ```
11 | 
12 | See documentation for further details.
13 | 
14 | ## Supported SM Architectures
15 | 
16 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
17 | 
18 | ## Supported OSes
19 | 
20 | Linux  
21 | Windows
22 | 
23 | ## Supported CPU Architecture
24 | 
25 | x86_64  
26 | ppc64le  
27 | arm64-sbsa
28 | 
29 | ## CUDA APIs involved
30 | - [cublas\<t>tpttr() API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-tpttr)
31 | 
32 | # Building (make)
33 | 
34 | # Prerequisites
35 | - A Linux/Windows system with recent NVIDIA drivers.
36 | - [CMake](https://cmake.org/download) version 3.18 minimum
37 | 
38 | ## Build command on Linux
39 | ```
40 | $ mkdir build
41 | $ cd build
42 | $ cmake ..
43 | $ make
44 | ```
45 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command.
46 | 
47 | ## Build command on Windows
48 | ```
49 | $ mkdir build
50 | $ cd build
51 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
52 | $ Open cublas_examples.sln project in Visual Studio and build
53 | ```
54 | 
55 | # Usage
56 | ```
57 | $  ./cublas_tpttr_example
58 | ```
59 | 
60 | Sample example output:
61 | 
62 | ```
63 | AP
64 | 1.00 2.00 
65 | 3.00 4.00 
66 | =====
67 | A
68 | 1.00 3.00 
69 | 0.00 2.00 
70 | =====
71 | ```
72 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/trttp/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Extensions/trttp/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Extension APIs - `cublas<t>trttp`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the use of the cuBLAS `trttp` function to convert a matrix from the triangular format to the triangular packed format.
 6 | 
 7 | ```
 8 | A = | 1.0 | 0.0 |
 9 |     | 2.0 | 3.0 |
10 | ```
11 | 
12 | See documentation for further details.
13 | 
14 | ## Supported SM Architectures
15 | 
16 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
17 | 
18 | ## Supported OSes
19 | 
20 | Linux  
21 | Windows
22 | 
23 | ## Supported CPU Architecture
24 | 
25 | x86_64  
26 | ppc64le  
27 | arm64-sbsa
28 | 
29 | ## CUDA APIs involved
30 | - [cublas\<t>trttp() API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-trttp)
31 | 
32 | # Building (make)
33 | 
34 | # Prerequisites
35 | - A Linux/Windows system with recent NVIDIA drivers.
36 | - [CMake](https://cmake.org/download) version 3.18 minimum
37 | 
38 | ## Build command on Linux
39 | ```
40 | $ mkdir build
41 | $ cd build
42 | $ cmake ..
43 | $ make
44 | ```
45 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command.
46 | 
47 | ## Build command on Windows
48 | ```
49 | $ mkdir build
50 | $ cd build
51 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
52 | $ Open cublas_examples.sln project in Visual Studio and build
53 | ```
54 | 
55 | # Usage
56 | ```
57 | $  ./cublas_trttp_example
58 | ```
59 | 
60 | Sample example output:
61 | 
62 | ```
63 | A
64 | 1.00 3.00 
65 | 2.00 4.00 
66 | =====
67 | AP
68 | 1.00 4.00 
69 | 3.00 0.00 
70 | =====
71 | ```
72 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/amax/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/amax/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-1 APIs - `cublas<t>amax`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the use of the cuBLAS `amax` function to find the (smallest) index of the element of the maximum magnitude.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 |
 9 | ```
10 | 
11 | See documentation for further details.
12 | 
13 | ## Supported SM Architectures
14 | 
15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
16 | 
17 | ## Supported OSes
18 | 
19 | Linux  
20 | Windows
21 | 
22 | ## Supported CPU Architecture
23 | 
24 | x86_64  
25 | ppc64le  
26 | arm64-sbsa
27 | 
28 | ## CUDA APIs involved
29 | - [cublasI\<t>amax API](https://docs.nvidia.com/cuda/cublas/index.html#cublasi-t-amax)
30 | 
31 | # Building (make)
32 | 
33 | # Prerequisites
34 | - A Linux/Windows system with recent NVIDIA drivers.
35 | - [CMake](https://cmake.org/download) version 3.18 minimum
36 | 
37 | ## Build command on Linux
38 | ```
39 | $ mkdir build
40 | $ cd build
41 | $ cmake ..
42 | $ make
43 | ```
44 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the cmake command.
45 | 
46 | ## Build command on Windows
47 | ```
48 | $ mkdir build
49 | $ cd build
50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
51 | $ Open cublas_examples.sln project in Visual Studio and build
52 | ```
53 | 
54 | # Usage
55 | ```
56 | $  ./cublas_amax_example
57 | ```
58 | 
59 | Sample example output:
60 | 
61 | ```
62 | A
63 | 1.00 2.00 3.00 4.00 
64 | =====
65 | result
66 | 4
67 | =====
68 | ```
69 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/amin/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/amin/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-1 APIs - `cublas<t>amin`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `amin` function to find the (smallest) index of the element with the minimum magnitude. The returned index is 1-based.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 |
 9 | ```
10 | 
11 | See documentation for further details.
12 | 
13 | ## Supported SM Architectures
14 | 
15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
16 | 
17 | ## Supported OSes
18 | 
19 | Linux  
20 | Windows
21 | 
22 | ## Supported CPU Architecture
23 | 
24 | x86_64  
25 | ppc64le  
26 | arm64-sbsa
27 | 
28 | ## CUDA APIs involved
29 | - [cublasI\<t>amin API](https://docs.nvidia.com/cuda/cublas/index.html#cublasi-t-amin)
30 | 
31 | # Building (make)
32 | 
33 | # Prerequisites
34 | - A Linux/Windows system with recent NVIDIA drivers.
35 | - [CMake](https://cmake.org/download) version 3.18 minimum
36 | 
37 | ## Build command on Linux
38 | ```
39 | $ mkdir build
40 | $ cd build
41 | $ cmake ..
42 | $ make
43 | ```
44 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
45 | 
46 | ## Build command on Windows
47 | ```
48 | $ mkdir build
49 | $ cd build
50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
51 | $ Open cublas_examples.sln project in Visual Studio and build
52 | ```
53 | 
54 | # Usage
55 | ```
56 | $  ./cublas_amin_example
57 | ```
58 | 
59 | Sample example output:
60 | 
61 | ```
62 | A
63 | 1.00 2.00 3.00 4.00 
64 | =====
65 | result
66 | 1
67 | =====
68 | ```
69 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/asum/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/asum/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-1 APIs - `cublas<t>asum`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `asum` function to compute the sum of the absolute values of the elements of vector _x_ (shown as `A` below).
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 |
 9 | ```
10 | 
11 | See documentation for further details.
12 | 
13 | ## Supported SM Architectures
14 | 
15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
16 | 
17 | ## Supported OSes
18 | 
19 | Linux  
20 | Windows
21 | 
22 | ## Supported CPU Architecture
23 | 
24 | x86_64  
25 | ppc64le  
26 | arm64-sbsa
27 | 
28 | ## CUDA APIs involved
29 | - [cublas\<t>asum API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-asum)
30 | 
31 | # Building (make)
32 | 
33 | # Prerequisites
34 | - A Linux/Windows system with recent NVIDIA drivers.
35 | - [CMake](https://cmake.org/download) version 3.18 minimum
36 | 
37 | ## Build command on Linux
38 | ```
39 | $ mkdir build
40 | $ cd build
41 | $ cmake ..
42 | $ make
43 | ```
44 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
45 | 
46 | ## Build command on Windows
47 | ```
48 | $ mkdir build
49 | $ cd build
50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
51 | $ Open cublas_examples.sln project in Visual Studio and build
52 | ```
53 | 
54 | # Usage
55 | ```
56 | $  ./cublas_asum_example
57 | ```
58 | 
59 | Sample example output:
60 | 
61 | ```
62 | A
63 | 1.00 2.00 3.00 4.00 
64 | =====
65 | result
66 | 10.00
67 | =====
68 | ```
69 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/axpy/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/copy/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/copy/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-1 APIs - `cublas<t>copy`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `copy` function to copy the vector _x_ (`A` below) into the vector _y_ (`B` in the sample output).
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 |
 9 | ```
10 | 
11 | See documentation for further details.
12 | 
13 | ## Supported SM Architectures
14 | 
15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
16 | 
17 | ## Supported OSes
18 | 
19 | Linux  
20 | Windows
21 | 
22 | ## Supported CPU Architecture
23 | 
24 | x86_64  
25 | ppc64le  
26 | arm64-sbsa
27 | 
28 | ## CUDA APIs involved
29 | - [cublas\<t>copy API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-copy)
30 | 
31 | # Building (make)
32 | 
33 | # Prerequisites
34 | - A Linux/Windows system with recent NVIDIA drivers.
35 | - [CMake](https://cmake.org/download) version 3.18 minimum
36 | 
37 | ## Build command on Linux
38 | ```
39 | $ mkdir build
40 | $ cd build
41 | $ cmake ..
42 | $ make
43 | ```
44 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
45 | 
46 | ## Build command on Windows
47 | ```
48 | $ mkdir build
49 | $ cd build
50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
51 | $ Open cublas_examples.sln project in Visual Studio and build
52 | ```
53 | 
54 | # Usage
55 | ```
56 | $  ./cublas_copy_example
57 | ```
58 | 
59 | Sample example output:
60 | 
61 | ```
62 | A
63 | 1.00 2.00 3.00 4.00 
64 | =====
65 | B
66 | 0.00 0.00 0.00 0.00 
67 | =====
68 | B
69 | 1.00 2.00 3.00 4.00 
70 | =====
71 | 
72 | ```
73 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/dot/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/nrm2/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/nrm2/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-1 APIs - `cublas<t>nrm2`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `nrm2` function to compute the Euclidean norm of a vector.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 3.0 | 4.0 |
 9 | ``` 
10 | 
11 | See documentation for further details.
12 | 
13 | ## Supported SM Architectures
14 | 
15 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
16 | 
17 | ## Supported OSes
18 | 
19 | Linux  
20 | Windows
21 | 
22 | ## Supported CPU Architecture
23 | 
24 | x86_64  
25 | ppc64le  
26 | arm64-sbsa
27 | 
28 | ## CUDA APIs involved
29 | - [cublas\<t>nrm2 API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-nrm2)
30 | 
31 | # Building (make)
32 | 
33 | # Prerequisites
34 | - A Linux/Windows system with recent NVIDIA drivers.
35 | - [CMake](https://cmake.org/download) version 3.18 minimum
36 | 
37 | ## Build command on Linux
38 | ```
39 | $ mkdir build
40 | $ cd build
41 | $ cmake ..
42 | $ make
43 | ```
44 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
45 | 
46 | ## Build command on Windows
47 | ```
48 | $ mkdir build
49 | $ cd build
50 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
51 | $ Open cublas_examples.sln project in Visual Studio and build
52 | ```
53 | 
54 | # Usage
55 | ```
56 | $  ./cublas_nrm2_example
57 | ```
58 | 
59 | Sample example output:
60 | 
61 | ```
62 | A
63 | 1.00 2.00 3.00 4.00
64 | =====
65 | Result
66 | 5.48
67 | =====
68 | ```
69 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/rot/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/rotg/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/rotm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/rotmg/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/scal/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-1/swap/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/gbmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/gbmv/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-2 APIs - `cublas<t>gbmv`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `gbmv` function to compute a banded matrix-vector multiplication.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 
 9 |     | 3.0 | 4.0 |
10 |     
11 | x = | 5.0 | 6.0 |
12 | ```
13 | 
14 | See documentation for further details.
15 | 
16 | ## Supported SM Architectures
17 | 
18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
19 | 
20 | ## Supported OSes
21 | 
22 | Linux  
23 | Windows
24 | 
25 | ## Supported CPU Architecture
26 | 
27 | x86_64  
28 | ppc64le  
29 | arm64-sbsa
30 | 
31 | ## CUDA APIs involved
32 | - [cublas\<t>gbmv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gbmv)
33 | 
34 | # Building (make)
35 | 
36 | # Prerequisites
37 | - A Linux/Windows system with recent NVIDIA drivers.
38 | - [CMake](https://cmake.org/download) version 3.18 minimum
39 | 
40 | ## Build command on Linux
41 | ```
42 | $ mkdir build
43 | $ cd build
44 | $ cmake ..
45 | $ make
46 | ```
47 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
48 | 
49 | ## Build command on Windows
50 | ```
51 | $ mkdir build
52 | $ cd build
53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
54 | $ Open cublas_examples.sln project in Visual Studio and build
55 | ```
56 | 
57 | # Usage
58 | ```
59 | $  ./cublas_gbmv_example
60 | ```
61 | 
62 | Sample example output:
63 | 
64 | ```
65 | A
66 | 1.00 2.00 
67 | 3.00 4.00 
68 | =====
69 | x
70 | 5.00 6.00 
71 | =====
72 | y
73 | 27.00 24.00 
74 | =====
75 | ```
76 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/gemv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/gemv/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-2 APIs - `cublas<t>gemv`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `gemv` function to compute a matrix-vector multiplication.
 6 | 
 7 | ```
 8 | A = | 1.0 | 2.0 | 
 9 |     | 3.0 | 4.0 |
10 | 
11 | x = | 5.0 | 6.0 |
12 | ```
13 | 
14 | See documentation for further details.
15 | 
16 | ## Supported SM Architectures
17 | 
18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
19 | 
20 | ## Supported OSes
21 | 
22 | Linux  
23 | Windows
24 | 
25 | ## Supported CPU Architecture
26 | 
27 | x86_64  
28 | ppc64le  
29 | arm64-sbsa
30 | 
31 | ## CUDA APIs involved
32 | - [cublas\<t>gemv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gemv)
33 | 
34 | # Building (make)
35 | 
36 | # Prerequisites
37 | - A Linux/Windows system with recent NVIDIA drivers.
38 | - [CMake](https://cmake.org/download) version 3.18 minimum
39 | 
40 | ## Build command on Linux
41 | ```
42 | $ mkdir build
43 | $ cd build
44 | $ cmake ..
45 | $ make
46 | ```
47 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
48 | 
49 | ## Build command on Windows
50 | ```
51 | $ mkdir build
52 | $ cd build
53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
54 | $ Open cublas_examples.sln project in Visual Studio and build
55 | ```
56 | 
57 | # Usage
58 | ```
59 | $  ./cublas_gemv_example
60 | ```
61 | 
62 | Sample example output:
63 | 
64 | ```
65 | A
66 | 1.00 2.00 
67 | 3.00 4.00 
68 | =====
69 | x
70 | 5.00 6.00 
71 | =====
72 | y
73 | 17.00 39.00 
74 | =====
75 | ```
76 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/ger/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/hbmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/hemv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/her/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/her2/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/hpmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/hpr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/hpr2/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/sbmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/spmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/spr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/spr2/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/symv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/symv/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-2 APIs - `cublas<t>symv`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `symv` function to compute a symmetric matrix-vector multiplication.
 6 | 
 7 | ```
 8 | A = | 1.0 | 3.0 | 
 9 |     | 3.0 | 4.0 |
10 |     
11 | x = | 5.0 | 6.0 |
12 | ```
13 | 
14 | See documentation for further details.
15 | 
16 | ## Supported SM Architectures
17 | 
18 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
19 | 
20 | ## Supported OSes
21 | 
22 | Linux  
23 | Windows
24 | 
25 | ## Supported CPU Architecture
26 | 
27 | x86_64  
28 | ppc64le  
29 | arm64-sbsa
30 | 
31 | ## CUDA APIs involved
32 | - [cublas\<t>symv API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-symv)
33 | 
34 | # Building (make)
35 | 
36 | # Prerequisites
37 | - A Linux/Windows system with recent NVIDIA drivers.
38 | - [CMake](https://cmake.org/download) version 3.18 minimum
39 | 
40 | ## Build command on Linux
41 | ```
42 | $ mkdir build
43 | $ cd build
44 | $ cmake ..
45 | $ make
46 | ```
47 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
48 | 
49 | ## Build command on Windows
50 | ```
51 | $ mkdir build
52 | $ cd build
53 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
54 | $ Open cublas_examples.sln project in Visual Studio and build
55 | ```
56 | 
57 | # Usage
58 | ```
59 | $  ./cublas_symv_example
60 | ```
61 | 
62 | Sample example output:
63 | 
64 | ```
65 | A
66 | 1.00 3.00 
67 | 3.00 4.00 
68 | =====
69 | x
70 | 5.00 6.00 
71 | =====
72 | y
73 | 23.00 39.00 
74 | =====
75 | ```
76 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/syr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/syr2/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/tbmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/tbsv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/tpmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/tpsv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/trmv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-2/trsv/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/gemm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/gemm3m/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/gemmBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/gemmGroupedBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/gemmStridedBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/hemm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/her2k/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/herk/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/herkx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/symm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/syr2k/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/syrk/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/syrk/README.md:
--------------------------------------------------------------------------------
 1 | # cuBLAS Level-3 APIs - `cublas<t>syrk`
 2 | 
 3 | ## Description
 4 | 
 5 | This code demonstrates the usage of the cuBLAS `syrk` function to compute a symmetric rank-k update.
 6 | 
 7 | ```
 8 | A = | 1.0 | 3.0 |
 9 |     | 3.0 | 4.0 |
10 | 
11 | B = | 5.0 | 7.0 |
12 |     | 7.0 | 8.0 |
13 | ```
14 | 
15 | See documentation for further details.
16 | 
17 | ## Supported SM Architectures
18 | 
19 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
20 | 
21 | ## Supported OSes
22 | 
23 | Linux  
24 | Windows
25 | 
26 | ## Supported CPU Architecture
27 | 
28 | x86_64  
29 | ppc64le  
30 | arm64-sbsa
31 | 
32 | ## CUDA APIs involved
33 | - [cublas\<t>syrk API](https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-syrk)
34 | 
35 | # Building (make)
36 | 
37 | # Prerequisites
38 | - A Linux/Windows system with recent NVIDIA drivers.
39 | - [CMake](https://cmake.org/download) version 3.18 minimum
40 | 
41 | ## Build command on Linux
42 | ```
43 | $ mkdir build
44 | $ cd build
45 | $ cmake ..
46 | $ make
47 | ```
48 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, add the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
49 | 
50 | ## Build command on Windows
51 | ```
52 | $ mkdir build
53 | $ cd build
54 | $ cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
55 | $ Open cublas_examples.sln project in Visual Studio and build
56 | ```
57 | 
58 | # Usage
59 | ```
60 | $  ./cublas_syrk_example
61 | ```
62 | 
63 | Sample example output:
64 | 
65 | ```
66 | A
67 | 1.00 3.00 
68 | 3.00 4.00 
69 | =====
70 | C
71 | 10.00 15.00 
72 | 0.00 25.00 
73 | =====
74 | ```
75 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/syrkx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/trmm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/trsm/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLAS/Level-3/trsmBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuBLASLt/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.10.0)  # CMake 3.10 or newer is required
12 | 
13 | project(cublasLtSamples)  # umbrella project for the cuBLASLt samples added below
14 | 
15 | add_subdirectory(LtSgemm)  # each entry below pulls in one sample directory's own CMakeLists.txt
16 | add_subdirectory(LtFp8Matmul)
17 | add_subdirectory(LtBlk128x128Fp8Matmul)
18 | add_subdirectory(LtMxfp8Matmul)
19 | add_subdirectory(LtNvfp4Matmul)
20 | add_subdirectory(LtDgemmPresetAlgo)
21 | add_subdirectory(LtIgemmTensor)
22 | add_subdirectory(LtHSHgemmStridedBatchSimple)
23 | add_subdirectory(LtHSHgemmPointerArrayBatchSimple)
24 | add_subdirectory(LtSgemmCustomFind)
25 | add_subdirectory(LtFp8CustomFind)
26 | add_subdirectory(LtPlanarComplex)
27 | add_subdirectory(LtSgemmSimpleAutoTuning)


--------------------------------------------------------------------------------
/cuBLASLt/LtBlk128x128Fp8Matmul/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | #
10 | 
11 | cmake_minimum_required(VERSION 3.10.0)  # CMake 3.10 or newer is required
12 | 
13 | project(sample_cublasLt_LtBlk128x128Fp8Matmul LANGUAGES CXX CUDA)  # enables both the C++ and CUDA languages
14 | 
15 | set(CMAKE_CXX_STANDARD 11)  # compile C++ sources as C++11
16 | set(CMAKE_CXX_STANDARD_REQUIRED ON)  # treat C++11 as a hard requirement instead of a preference
17 | 
18 | add_executable(${PROJECT_NAME}  # the executable target is named after the project
19 |     main.cpp
20 |     ${PROJECT_NAME}.cu  # i.e. sample_cublasLt_LtBlk128x128Fp8Matmul.cu
21 | )
22 | 
23 | target_include_directories(${PROJECT_NAME} PRIVATE
24 |     ${CMAKE_CURRENT_SOURCE_DIR}/../Common  # sibling Common directory (presumably shared sample helpers - confirm)
25 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}  # include directories of the CUDA toolkit in use
26 | )
27 | 
28 | set(CUDART_LIBRARY cudart)  # default to bare library names; the search below is skipped on Windows
29 | set(CUBLASLT_LIBRARY cublasLt)
30 | 
31 | if(NOT WIN32)  # on non-Windows hosts, search the CUDA compiler's implicit link directories
32 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})  # NOTE(review): result variable was already set above - confirm the search result actually replaces it
33 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
34 | endif()
35 | 
36 | target_link_libraries(${PROJECT_NAME}  # link against the CUDA runtime and cuBLASLt
37 |     ${CUDART_LIBRARY}
38 |     ${CUBLASLT_LIBRARY}
39 | )


--------------------------------------------------------------------------------
/cuBLASLt/LtDgemmPresetAlgo/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.10.0)  # CMake 3.10 or newer is required
12 | 
13 | project(sample_cublasLt_LtDgemmPresetAlgo LANGUAGES CXX CUDA)  # enables both the C++ and CUDA languages
14 | 
15 | set(CMAKE_CXX_STANDARD 11)  # compile C++ sources as C++11
16 | set(CMAKE_CXX_STANDARD_REQUIRED ON)  # treat C++11 as a hard requirement instead of a preference
17 | 
18 | add_executable(${PROJECT_NAME}  # the executable target is named after the project
19 |     main.cpp
20 |     ${PROJECT_NAME}.cu  # i.e. sample_cublasLt_LtDgemmPresetAlgo.cu
21 | )
22 | 
23 | target_include_directories(${PROJECT_NAME} PRIVATE
24 |     ${CMAKE_CURRENT_SOURCE_DIR}/../Common  # sibling Common directory (presumably shared sample helpers - confirm)
25 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}  # include directories of the CUDA toolkit in use
26 | )
27 | 
28 | set(CUDART_LIBRARY cudart)  # default to bare library names; the search below is skipped on Windows
29 | set(CUBLASLT_LIBRARY cublasLt)
30 | 
31 | if(NOT WIN32)  # on non-Windows hosts, search the CUDA compiler's implicit link directories
32 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})  # NOTE(review): result variable was already set above - confirm the search result actually replaces it
33 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
34 | endif()
35 | 
36 | target_link_libraries(${PROJECT_NAME}  # link against the CUDA runtime and cuBLASLt
37 |     ${CUDART_LIBRARY}
38 |     ${CUBLASLT_LIBRARY}
39 | )


--------------------------------------------------------------------------------
/cuBLASLt/LtFp8CustomFind/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.10.0)  # CMake 3.10 or newer is required
12 | 
13 | project(sample_cublasLt_LtFp8CustomFind LANGUAGES CXX CUDA)  # enables C++ and CUDA; NOTE(review): no .cu source is listed below, but the CMAKE_CUDA_* variables used later come from the CUDA language
14 | 
15 | set(CMAKE_CXX_STANDARD 11)  # compile C++ sources as C++11
16 | set(CMAKE_CXX_STANDARD_REQUIRED ON)  # treat C++11 as a hard requirement instead of a preference
17 | 
18 | add_executable(${PROJECT_NAME}  # the executable target is named after the project
19 |     main.cpp
20 |     ../Common/helpers.cpp  # helper source compiled in from the sibling Common directory
21 | )
22 | 
23 | target_include_directories(${PROJECT_NAME} PRIVATE
24 |     ${CMAKE_CURRENT_SOURCE_DIR}/../Common  # sibling Common directory, which also provides helpers.cpp above
25 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}  # include directories of the CUDA toolkit in use
26 | )
27 | 
28 | set(CUDART_LIBRARY cudart)  # default to bare library names; the search below is skipped on Windows
29 | set(CUBLASLT_LIBRARY cublasLt)
30 | 
31 | if(NOT WIN32)  # on non-Windows hosts, search the CUDA compiler's implicit link directories
32 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})  # NOTE(review): result variable was already set above - confirm the search result actually replaces it
33 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
34 | endif()
35 | 
36 | target_link_libraries(${PROJECT_NAME}  # link against the CUDA runtime and cuBLASLt
37 |     ${CUDART_LIBRARY}
38 |     ${CUBLASLT_LIBRARY}
39 | )


--------------------------------------------------------------------------------
/cuBLASLt/LtFp8Matmul/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.10.0)  # CMake 3.10 or newer is required
12 | 
13 | project(sample_cublasLt_LtFp8Matmul LANGUAGES CXX CUDA)  # enables both the C++ and CUDA languages
14 | 
15 | set(CMAKE_CXX_STANDARD 11)  # compile C++ sources as C++11
16 | set(CMAKE_CXX_STANDARD_REQUIRED ON)  # treat C++11 as a hard requirement instead of a preference
17 | 
18 | add_executable(${PROJECT_NAME}  # the executable target is named after the project
19 |     main.cpp
20 |     ${PROJECT_NAME}.cu  # i.e. sample_cublasLt_LtFp8Matmul.cu
21 | )
22 | 
23 | target_include_directories(${PROJECT_NAME} PRIVATE
24 |     ${CMAKE_CURRENT_SOURCE_DIR}/../Common  # sibling Common directory (presumably shared sample helpers - confirm)
25 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}  # include directories of the CUDA toolkit in use
26 | )
27 | 
28 | set(CUDART_LIBRARY cudart)  # default to bare library names; the search below is skipped on Windows
29 | set(CUBLASLT_LIBRARY cublasLt)
30 | 
31 | if(NOT WIN32)  # on non-Windows hosts, search the CUDA compiler's implicit link directories
32 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})  # NOTE(review): result variable was already set above - confirm the search result actually replaces it
33 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
34 | endif()
35 | 
36 | target_link_libraries(${PROJECT_NAME}  # link against the CUDA runtime and cuBLASLt
37 |     ${CUDART_LIBRARY}
38 |     ${CUBLASLT_LIBRARY}
39 | )


--------------------------------------------------------------------------------
/cuBLASLt/LtHSHgemmPointerArrayBatchSimple/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtHSHgemmPointerArrayBatchSimple LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtHSHgemmStridedBatchSimple/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtHSHgemmStridedBatchSimple LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtIgemmTensor/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtIgemmTensor LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtMxfp8Matmul/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | #
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtMxfp8Matmul LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtNvfp4Matmul/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | #
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtNvfp4Matmul LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtPlanarComplex/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtPlanarComplex LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtSgemm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtSgemm LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtSgemmCustomFind/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtSgemmCustomFind LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the shared helpers source.
21 | add_executable(${PROJECT_NAME} main.cpp ../Common/helpers.cpp)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuBLASLt/LtSgemmSimpleAutoTuning/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # Oldest CMake release this sample is written against.
12 | cmake_minimum_required(VERSION 3.10.0)
13 | 
14 | project(sample_cublasLt_LtSgemmSimpleAutoTuning LANGUAGES CXX CUDA)
15 | 
16 | # Host code is compiled as C++11, with no fallback to an older standard.
17 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
18 | set(CMAKE_CXX_STANDARD 11)
19 | 
20 | # The executable is named after the project: host driver plus the sample's .cu file.
21 | add_executable(${PROJECT_NAME} main.cpp ${PROJECT_NAME}.cu)
22 | 
23 | # Headers shared between the samples live in ../Common.
24 | target_include_directories(${PROJECT_NAME}
25 |     PRIVATE
26 |         ${CMAKE_CURRENT_SOURCE_DIR}/../Common
27 |         ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
28 | )
29 | 
30 | # Link by bare library name by default; on non-Windows hosts also look the libraries up
31 | # in the CUDA compiler's implicit link directories.
32 | # NOTE(review): both variables are already set when find_library runs, so the lookup may
33 | # return without searching - confirm against the find_library documentation.
34 | set(CUBLASLT_LIBRARY cublasLt)
35 | set(CUDART_LIBRARY cudart)
36 | if(NOT WIN32)
37 |     find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
38 |     find_library(CUBLASLT_LIBRARY cublasLt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
39 | endif()
40 | 
41 | target_link_libraries(${PROJECT_NAME} ${CUDART_LIBRARY} ${CUBLASLT_LIBRARY})


--------------------------------------------------------------------------------
/cuFFT/1d_c2c/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/1d_c2c/README.md:
--------------------------------------------------------------------------------
 1 | # cuFFT 1D FFT C2C example
 2 | 
 3 | ## Description
 4 | 
 5 | In this example a one-dimensional complex-to-complex transform is applied to the input data. Afterwards an inverse transform is performed on the computed frequency domain representation.
 6 | 
 7 | ## Supported SM Architectures
 8 | 
 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
10 | 
11 | ## Supported OSes
12 | 
13 | Linux  
14 | Windows
15 | 
16 | ## Supported CPU Architecture
17 | 
18 | x86_64  
19 | ppc64le  
20 | arm64-sbsa
21 | 
22 | ## CUDA APIs involved
23 | - [cufftExecC2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2c-cufftexecz2z)
24 | 
25 | 
26 | # Building (make)
27 | 
28 | # Prerequisites
29 | - A Linux/Windows system with recent NVIDIA drivers.
30 | - [CMake](https://cmake.org/download) version 3.18 minimum
31 | 
32 | ## Build command on Linux
33 | ```
34 | $ mkdir build
35 | $ cd build
36 | $ cmake ..
37 | $ make
38 | ```
39 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, pass the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
40 | 
41 | # Usage 1
42 | ```
43 | $  ./bin/1d_c2c_example
44 | ```
45 | 


--------------------------------------------------------------------------------
/cuFFT/1d_mgpu_c2c/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/1d_r2c_c2r/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/1d_r2c_c2r/README.md:
--------------------------------------------------------------------------------
 1 | # cuFFT 1D FFT R2C/C2R example
 2 | 
 3 | ## Description
 4 | 
 5 | In this example a one-dimensional real-to-complex transform is applied to the input data.
 6 | 
 7 | ## Supported SM Architectures
 8 | 
 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
10 | 
11 | ## Supported OSes
12 | 
13 | Linux  
14 | Windows
15 | 
16 | ## Supported CPU Architecture
17 | 
18 | x86_64  
19 | ppc64le  
20 | arm64-sbsa
21 | 
22 | ## CUDA APIs involved
23 | - [cufftExecR2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecr2c-cufftexecd2z)
24 | 
25 | # Building (make)
26 | 
27 | # Prerequisites
28 | - A Linux/Windows system with recent NVIDIA drivers.
29 | - [CMake](https://cmake.org/download) version 3.18 minimum
30 | 
31 | ## Build command on Linux
32 | ```
33 | $ mkdir build
34 | $ cd build
35 | $ cmake ..
36 | $ make
37 | ```
38 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, pass the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
39 | 
40 | # Usage 1
41 | ```
42 | $  ./bin/1d_r2c_c2r_example
43 | ```
44 | 


--------------------------------------------------------------------------------
/cuFFT/2d_c2r_r2c/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/2d_c2r_r2c/README.md:
--------------------------------------------------------------------------------
 1 | # cuFFT 2D FFT C2R/R2C example
 2 | 
 3 | ## Description
 4 | 
 5 | In this example a two-dimensional complex-to-real transform is applied to the input data arranged according to the requirements of the default FFTW padding mode.
 6 | 
 7 | ## Supported SM Architectures
 8 | 
 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
10 | 
11 | ## Supported OSes
12 | 
13 | Linux  
14 | Windows
15 | 
16 | ## Supported CPU Architecture
17 | 
18 | x86_64  
19 | ppc64le  
20 | arm64-sbsa
21 | 
22 | ## CUDA APIs involved
23 | - [cufftExecC2R API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2r-cufftexecz2d)
24 | 
25 | # Building (make)
26 | 
27 | # Prerequisites
28 | - A Linux/Windows system with recent NVIDIA drivers.
29 | - [CMake](https://cmake.org/download) version 3.18 minimum
30 | 
31 | ## Build command on Linux
32 | ```
33 | $ mkdir build
34 | $ cd build
35 | $ cmake ..
36 | $ make
37 | ```
38 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, pass the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
39 | 
40 | # Usage 1
41 | ```
42 | $  ./bin/2d_c2r_r2c_example
43 | ```
44 | 


--------------------------------------------------------------------------------
/cuFFT/3d_c2c/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/3d_c2c/README.md:
--------------------------------------------------------------------------------
 1 | # cuFFT 3D FFT C2C example
 2 | 
 3 | ## Description
 4 | 
 5 | In this example a three-dimensional complex-to-complex transform is applied to the input data. Afterwards an inverse transform is performed on the computed frequency domain representation.
 6 | 
 7 | ## Supported SM Architectures
 8 | 
 9 | All GPUs supported by CUDA Toolkit (https://developer.nvidia.com/cuda-gpus)  
10 | 
11 | ## Supported OSes
12 | 
13 | Linux  
14 | Windows
15 | 
16 | ## Supported CPU Architecture
17 | 
18 | x86_64  
19 | ppc64le  
20 | arm64-sbsa
21 | 
22 | ## CUDA APIs involved
23 | - [cufftExecC2C API](https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2c-cufftexecz2z)
24 | 
25 | 
26 | # Building (make)
27 | 
28 | # Prerequisites
29 | - A Linux/Windows system with recent NVIDIA drivers.
30 | - [CMake](https://cmake.org/download) version 3.18 minimum
31 | 
32 | ## Build command on Linux
33 | ```
34 | $ mkdir build
35 | $ cd build
36 | $ cmake ..
37 | $ make
38 | ```
39 | Make sure that CMake finds the expected CUDA Toolkit. If it does not, pass the argument `-DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc` to the `cmake` command.
40 | 
41 | # Usage 1
42 | ```
43 | $  ./bin/3d_c2c_example
44 | ```
45 | 


--------------------------------------------------------------------------------
/cuFFT/3d_mgpu_c2c/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFT/3d_mgpu_r2c_c2r/.gitignore:
--------------------------------------------------------------------------------
1 | build/


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libattachcommWrapper.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libattachcommWrapper.a


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libnvhpcwrapcufftxt.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/Fortran_samples/Fortran_wrappers_nvhpc/libnvhpcwrapcufftxt.a


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/c2c/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_c2c Fortran sample in two flavours: plain and OpenACC.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_c2c
 6 | exe_acc = cufftmp_c2c_acc
 7 | 
 8 | all : $(exe) $(exe_acc)
 9 | 
10 | # Link each executable from its single object file; the OpenACC one also links with -acc.
11 | $(exe) : $(exe).o
12 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
13 | 
14 | $(exe_acc) : $(exe_acc).o
15 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER) -acc
16 | 
17 | %.o : %.f90
18 | 	$(f90) -c $< -o $@ $(FLAGS)
19 | 
20 | # The OpenACC object is compiled from the same source, with ACC defined and -acc enabled.
21 | %_acc.o : %.f90
22 | 	$(f90) -c $< -o $@ $(FLAGS) -DACC -acc
23 | 
24 | # None of these targets creates a file of the same name; marking all of them
25 | # phony keeps a stray file named "all" or "run" from disabling the target.
26 | .PHONY: all clean run
27 | 
28 | clean:
29 | 	@rm -rf *.mod *.o  $(exe) $(exe_acc)
30 | 
31 | # Launch both executables on 2 MPI ranks with the NVSHMEM and cuFFT library
32 | # directories prepended to the loader search path.
33 | run: $(exe) $(exe_acc)
34 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
35 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe_acc)
36 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/c2c_no_descriptors/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_c2c_no_descriptors Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_c2c_no_descriptors
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Compile any Fortran source with the shared flags.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS)
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 2 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/c2c_pencils/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_c2c_pencils Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_c2c_pencils
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Compile any Fortran source with the shared flags.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS)
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 4 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 4 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/common.mk:
--------------------------------------------------------------------------------
 1 | # Shared settings for the cuFFTMp Fortran samples; pulled in by the sample Makefiles
 2 | # via `include ../common.mk`.
 3 | 
 4 | # MPI Fortran compiler wrapper used for both compiling and linking.
 5 | f90 := mpif90
 6 | 
 7 | # Library/include locations; `?=` only assigns when the variable is not already defined,
 8 | # so they can be overridden from the environment or the make command line.
 9 | CUFFT_INC   ?= ../../cufft/include
10 | CUFFT_LIB   ?= ../../cufft/lib
11 | NVSHMEM_LIB ?= ../../cufft/lib
12 | 
13 | WRAPPERS_DIR = ../Fortran_wrappers_nvhpc
14 | 
15 | FLAGS  = -O3 -Mfree -fast -Mextend -Mpreprocess -Minform=warn
16 | FLAGS += -I./ -I$(WRAPPERS_DIR)/ -I$(CUFFT_INC)/
17 | # To build for specific GPU architectures, add a -gpu flag, e.g. -gpu=cc70,cc80,cc90.
18 | # See https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#compute-capability
19 | # Also see https://docs.nvidia.com/cuda/cufftmp/usage/requirements.html for supported architectures
20 | FLAGS += -Minfo=accel -cuda -cudalib=cufftmp
21 | 
22 | # `:=` expands immediately, so the *_LIB variables above must be set before this line.
23 | LINKER := -L$(HPCSDK_ROOT)/compilers/lib -lnvhpcwrapcufft -lnvhpcwrapcufftmp \
24 |           -L$(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device \
25 |           -L$(CUFFT_LIB)
26 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/r2c_c2r/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_r2c Fortran sample in two flavours: plain and OpenACC.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_r2c
 6 | exe_acc = cufftmp_r2c_acc
 7 | 
 8 | all : $(exe) $(exe_acc)
 9 | 
10 | # Link each executable from its single object file; the OpenACC one also links with -acc.
11 | $(exe) : $(exe).o
12 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
13 | 
14 | $(exe_acc) : $(exe_acc).o
15 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER) -acc
16 | 
17 | %.o : %.f90
18 | 	$(f90) -c $< -o $@ $(FLAGS)
19 | 
20 | # The OpenACC object is compiled from the same source, with ACC defined and -acc enabled.
21 | %_acc.o : %.f90
22 | 	$(f90) -c $< -o $@ $(FLAGS) -DACC -acc
23 | 
24 | # None of these targets creates a file of the same name; marking all of them
25 | # phony keeps a stray file named "all" or "run" from disabling the target.
26 | .PHONY: all clean run
27 | 
28 | clean:
29 | 	@rm -rf *.mod *.o  $(exe) $(exe_acc)
30 | 
31 | # Launch both executables on 2 MPI ranks with the NVSHMEM and cuFFT library
32 | # directories prepended to the loader search path.
33 | run: $(exe) $(exe_acc)
34 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
35 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe_acc)
36 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/r2c_c2r_no_descriptors/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_r2c_c2r_no_descriptors Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_r2c_c2r_no_descriptors
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Compile any Fortran source with the shared flags.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS)
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 2 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/r2c_c2r_pencils/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_r2c_c2r_pencils Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_r2c_c2r_pencils
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Compile any Fortran source with the shared flags.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS)
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 4 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 4 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/r2c_c2r_shared_scratch/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_r2c_workarea Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_r2c_workarea
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Every Fortran source in this directory is compiled with SHARED_WORKAREA defined.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS) -DSHARED_WORKAREA
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 2 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/r2c_c2r_shared_scratch/README.md:
--------------------------------------------------------------------------------
 1 | # Fortran R2C_C2R Sample with workarea sharing
 2 | 
 3 | This sample shows how to compute a distributed R2C-C2R transform using a scratch work area shared between the two plans.
 4 | It is otherwise identical to the other, simpler R2C-C2R sample.
 5 | 
 6 | Requirement:
 7 | - HPC SDK 21.9 and up
 8 | - `mpif90` and `mpicc` should be in your `$PATH`
 9 | 
10 | To build and run:
11 | ```
12 | export CUFFT_LIB=/path/to/cufftMp/lib/
13 | export CUFFT_INC=/path/to/cufftMp/include/
14 | 
15 | cd r2c_c2r_shared_scratch
16 | make run
17 | [...]
18 |  Hello from rank             0  gpu id            0 size            2
19 |  Hello from rank             1  gpu id            1 size            2
20 |  local_rshape          :          258          256          128
21 |  local_permuted_cshape :          129          128          256
22 |  shape of u is           258          256          128
23 |  shape of u_permuted is           129          128          256
24 | [...]
25 |         after C2R 0  max_norm is                1.00000000  max_diff is      0.00000107
26 |    Relative Linf on rank 0           is                0.00000107
27 |         after C2R 1  max_norm is                1.00000000  max_diff is      0.00000107
28 |    Relative Linf on rank 1           is                0.00000107
29 |  >>>> PASSED on rank             0
30 |  >>>> PASSED on rank             1
31 | ```
32 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/reshape/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds and runs the cufftmp_reshape Fortran sample.
 2 | # f90, FLAGS and LINKER are provided by the shared ../common.mk.
 3 | include ../common.mk
 4 | 
 5 | exe = cufftmp_reshape
 6 | 
 7 | all : $(exe)
 8 | 
 9 | # Link the sample from its single object file.
10 | $(exe) : $(exe).o
11 | 	$(f90) -o $@ $+ $(FLAGS) $(LINKER)
12 | 
13 | # Compile any Fortran source with the shared flags.
14 | %.o : %.f90
15 | 	$(f90) -c $< -o $@ $(FLAGS)
16 | 
17 | # None of these targets creates a file of the same name; marking all of them
18 | # phony keeps a stray file named "all" or "run" from disabling the target.
19 | .PHONY: all clean run
20 | 
21 | clean:
22 | 	@rm -rf *.mod *.o  $(exe)
23 | 
24 | # Launch on 2 MPI ranks with the NVSHMEM and cuFFT library directories
25 | # prepended to the loader search path.
26 | run: $(exe)
27 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -n 2 $(exe)
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/Fortran_samples/reshape/README.md:
--------------------------------------------------------------------------------
 1 | # Fortran Standalone Reshape Sample
 2 | 
 3 | This sample shows how to use the reshape API to redistribute data across GPUs.
 4 | 
 5 | Requirement:
 6 | - HPC SDK 21.9 and up
 7 | - `mpif90` and `mpicc` should be in your `$PATH`
 8 | 
 9 | To build and run:
10 | ```
11 | export CUFFT_LIB=/path/to/cufftMp/lib/
12 | export CUFFT_INC=/path/to/cufftMp/include/
13 | cd reshape
14 | make run
15 | 
16 |  Hello from rank             0  gpu id            0 size            2
17 |            Input data on rank 0:  0  1  4  5  8  9 12 13
18 |  Expected output data on rank 0:  0  1  2  3  4  5  6  7
19 |  Hello from rank             1  gpu id            1 size            2
20 |            Input data on rank 1:  2  3  6  7 10 11 14 15
21 |  Expected output data on rank 1:  8  9 10 11 12 13 14 15
22 |           Output data on rank 0:  0  1  2  3  4  5  6  7
23 |           Output data on rank 1:  8  9 10 11 12 13 14 15
24 |  >>>> PASSED on rank             1
25 |  >>>> PASSED on rank             0
26 | ```


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/Dockerfile:
--------------------------------------------------------------------------------
 1 | # We use JAX-toolbox from https://github.com/NVIDIA/JAX-Toolbox
 2 | FROM ghcr.io/nvidia/jax:jax-2024-10-24
 3 | RUN apt-get update && apt-get install openmpi-bin -y
 4 | 
 5 | COPY . /fft_jax
 6 | RUN rm -rf /fft_jax/build
 7 | RUN pip install -e /fft_jax
 8 | 
 9 | ENV LD_LIBRARY_PATH=/fft_jax/nvshmem/lib:/fft_jax/cufftmp/lib:$LD_LIBRARY_PATH
10 | 
11 | ENV NVSHMEM_DISABLE_NCCL=1
12 | ENV NVSHMEM_DISABLE_GDRCOPY=1
13 | ENV NVSHMEM_BOOTSTRAP=MPI
14 | 
15 | # Infiniband service level is beneficial for performance for large FFTs on many GPUs.
16 | # see *Note* in https://docs.nvidia.com/hpc-sdk/cufftmp/usage/performances.html#performance-considerations
17 | # The IB service level for both NVSHMEM (for cuFFTMp) and NCCL (for JAX FFT) are declared here.
18 | ENV NVSHMEM_IB_SL=1
19 | ENV NCCL_IB_SL=1
20 | 
21 | ENV OMPI_ALLOW_RUN_AS_ROOT=1
22 | ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/misc/strong.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/strong.png


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/misc/strong_eos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/strong_eos.png


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/misc/weak.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuFFTMp/JAX_FFT/misc/weak.png


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4", "pybind11>=2.6", "cmake"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.12)
 2 | project(cufftmp_jax LANGUAGES CXX CUDA)
 3 | 
 4 | find_package(Python COMPONENTS Interpreter Development REQUIRED)
 5 | find_package(pybind11 CONFIG REQUIRED)
 6 | 
 7 | include_directories(${CMAKE_CURRENT_LIST_DIR}/src)
 8 | 
 9 | message(STATUS "Using ${NVSHMEM_HOME} for NVSHMEM_HOME and ${CUFFTMP_HOME} for CUFFTMP_HOME")
10 | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUFFTMP_HOME}/include ${NVSHMEM_HOME}/include)
11 | link_directories(${CUFFTMP_HOME}/lib ${NVSHMEM_HOME}/lib)
12 | 
13 | pybind11_add_module(gpu_ops 
14 |     ${CMAKE_CURRENT_LIST_DIR}/src/kernels.cu 
15 |     ${CMAKE_CURRENT_LIST_DIR}/src/gpu_ops.cpp
16 | )
17 | 
18 | target_link_libraries(gpu_ops 
19 |     PRIVATE 
20 |         cufftMp
21 |         nvshmem_host
22 |         nvshmem_device
23 | )
24 | 
25 | set_target_properties(gpu_ops 
26 |     PROPERTIES
27 |         CUDA_STANDARD 17
28 |         CUDA_RESOLVE_DEVICE_SYMBOLS ON
29 |         POSITION_INDEPENDENT_CODE ON
30 |         CUDA_SEPARABLE_COMPILATION ON
31 | )
32 | 
33 | install(TARGETS gpu_ops DESTINATION cufftmp_jax)
34 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/NOTICE:
--------------------------------------------------------------------------------
 1 | The code in this directory was adapted from https://github.com/dfm/extending-jax
 2 | by Dan Foreman-Mackey and published under the MIT license as stated below.
 3 | 
 4 | --------------------------------------------------------------------------------
 5 | 
 6 | MIT License
 7 | 
 8 | Copyright (c) 2021 Dan Foreman-Mackey
 9 | 
10 | Permission is hereby granted, free of charge, to any person obtaining a copy
11 | of this software and associated documentation files (the "Software"), to deal
12 | in the Software without restriction, including without limitation the rights
13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 | copies of the Software, and to permit persons to whom the Software is
15 | furnished to do so, subject to the following conditions:
16 | 
17 | The above copyright notice and this permission notice shall be included in all
18 | copies or substantial portions of the Software.
19 | 
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 | SOFTWARE.


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from .cufftmp_jax import cufftmp
4 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/src/gpu_ops.cpp:
--------------------------------------------------------------------------------
 1 | #include "kernels.h"
 2 | #include "pybind11_kernel_helpers.h"
 3 | 
 4 | using namespace cufftmp_jax;
 5 | 
 6 | /**
 7 |  * Boilerplate used to
 8 |  * (1) Expose the gpu_cufftmp function to Python (to launch our custom op)
 9 |  * (2) Expose the cufftmpDescriptor (to pass parameters from Python to C++)
10 |  */
11 | 
12 | namespace {
13 | 
14 | pybind11::dict Registrations() {
15 |     pybind11::dict dict;
16 |     dict["gpu_cufftmp"] = EncapsulateFunction(gpu_cufftmp);
17 |     return dict;
18 | }
19 | 
20 | PYBIND11_MODULE(gpu_ops, m) {
21 |     m.def("registrations", &Registrations);
22 |     m.def("build_cufftmp_descriptor",
23 |         [](std::int64_t x, std::int64_t y, std::int64_t z, int dist, int dir) { 
24 |             return PackDescriptor(cufftmpDescriptor{x, y, z, dist, dir}); 
25 |         }
26 |     );
27 | }
28 | 
29 | }  // namespace
30 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/src/kernel_helpers.h:
--------------------------------------------------------------------------------
 1 | #ifndef _CUFFTMP_JAX_KERNEL_HELPERS_H_
 2 | #define _CUFFTMP_JAX_KERNEL_HELPERS_H_
 3 | 
 4 | #include <cstdint>
 5 | #include <cstring>
 6 | #include <stdexcept>
 7 | #include <string>
 8 | #include <type_traits>
 9 | /**
10 |  * Boilerplate to copy descriptors from Python to C++
11 |  */
12 | 
13 | namespace cufftmp_jax {
14 | 
15 | // Local equivalent of std::bit_cast; see https://en.cppreference.com/w/cpp/numeric/bit_cast
16 | template <class To, class From>
17 | typename std::enable_if<sizeof(To) == sizeof(From) && 
18 |                         std::is_trivially_copyable<From>::value &&
19 |                         std::is_trivially_copyable<To>::value,
20 |                         To>::type
21 | bit_cast(const From& src) noexcept {
22 |     static_assert(
23 |         std::is_trivially_constructible<To>::value,
24 |         "This implementation additionally requires destination type to be trivially constructible"
25 |     );
26 |     // Copy the object representation of src into a fresh To (std::memcpy is declared in <cstring>).
27 |     To dst;
28 |     std::memcpy(&dst, &src, sizeof(To));
29 |     return dst;
30 | }
31 | 
32 | template <typename T>
33 | std::string PackDescriptorAsString(const T& descriptor) {
34 |     return std::string(bit_cast<const char*>(&descriptor), sizeof(T));
35 | }
36 | 
37 | template <typename T>
38 | const T* UnpackDescriptor(const char* opaque, std::size_t opaque_len) {
39 |     if (opaque_len != sizeof(T)) {
40 |         throw std::runtime_error("Invalid opaque object size with opaque_len = " + std::to_string(opaque_len) + ", and sizeof(T) = " + std::to_string(sizeof(T)));
41 |     }
42 |     return bit_cast<const T*>(opaque);
43 | }
44 | 
45 | }  // namespace cufftmp_jax
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/src/kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef _CUFFTMP_JAX_KERNELS_H_
 2 | #define _CUFFTMP_JAX_KERNELS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #include <cstddef>
 7 | #include <cstdint>
 8 | 
 9 | namespace cufftmp_jax {
10 | 
11 | /**
12 |  * Description of an FFT
13 |  * - global_x, global_y, global_z are the global size of the tensor to transform
14 |  * - distribution is 0 for a CUFFT_XT_FORMAT_INPLACE (== Slabs_X) and
15 |  *   1 for a CUFFT_XT_FORMAT_INPLACE_SHUFFLED (== Slabs_Y) data distribution
16 |  * - direction is 0 for a CUFFT_FORWARD transform, 1 for CUFFT_INVERSE
17 |  */
18 | 
19 | struct cufftmpDescriptor {
20 |     std::int64_t global_x;
21 |     std::int64_t global_y;
22 |     std::int64_t global_z;
23 |     int distribution;
24 |     int direction;
25 | };
26 | 
27 | /**
28 |  * Generic signature for a custom op with CUDA
29 |  */
30 | void gpu_cufftmp(cudaStream_t stream, void** buffers, const char* opaque, std::size_t opaque_len);
31 | 
32 | }  // namespace cufftmp_jax
33 | 
34 | #endif


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/cufftmp_jax/src/pybind11_kernel_helpers.h:
--------------------------------------------------------------------------------
 1 | #ifndef _CUFFTMP_JAX_PYBIND11_KERNEL_HELPERS_H_
 2 | #define _CUFFTMP_JAX_PYBIND11_KERNEL_HELPERS_H_
 3 | 
 4 | #include <pybind11/pybind11.h>
 5 | 
 6 | #include "kernel_helpers.h"
 7 | 
 8 | /**
 9 |  * pybind11 boilerplate
10 |  */
11 | 
12 | namespace cufftmp_jax {
13 | 
14 | template <typename T>
15 | pybind11::bytes PackDescriptor(const T& descriptor) {
16 |     return pybind11::bytes(PackDescriptorAsString(descriptor));
17 | }
18 | 
19 | template <typename T>
20 | pybind11::capsule EncapsulateFunction(T* fn) {
21 |     return pybind11::capsule(bit_cast<void*>(fn), "xla._CUSTOM_CALL_TARGET");
22 | }
23 | 
24 | }  // namespace cufftmp_jax
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/fft_common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from .utils import Dist, Dir
4 | 


--------------------------------------------------------------------------------
/cuFFTMp/JAX_FFT/src/xfft/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from .xfft import xfft
4 | 


--------------------------------------------------------------------------------
/cuFFTMp/extra_bootstraps/Makefile:
--------------------------------------------------------------------------------
 1 | # Downloads the NVSHMEM source archive, builds the MPI bootstrap plugin and moves it into $(DEST).
 2 | # DEST is deliberately unquoted: make keeps quotes literally in target names, so a quoted
 3 | # default would never match the file created by the recipe and the rule would always re-run.
 4 | DEST ?= myMPI
 5 | MPICC ?= mpicc
 6 | NVSHMEM_VER ?= 2.8.0
 7 | NVSHMEM_VER_RC ?= 2.8.0-3
 8 | 
 9 | .PHONY: all mpi_bootstrap clean
10 | 
11 | all: mpi_bootstrap
12 | 
13 | ${DEST}/nvshmem_bootstrap_mpi.so:
14 | 	mkdir -p ${DEST}
15 | 	wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VER}/source/nvshmem_src_${NVSHMEM_VER_RC}.txz
16 | 	tar Jxvf nvshmem_src_${NVSHMEM_VER_RC}.txz
17 | 	$(MAKE) -C nvshmem_src_${NVSHMEM_VER_RC} $(shell pwd)/nvshmem_src_${NVSHMEM_VER_RC}/build/lib/nvshmem_bootstrap_mpi.so.${NVSHMEM_VER}
18 | 	echo "Bootstrap built!"
19 | 	mv $(shell pwd)/nvshmem_src_${NVSHMEM_VER_RC}/build/lib/nvshmem_bootstrap_mpi* ${DEST}
20 | 	ldd ${DEST}/nvshmem_bootstrap_mpi.so
21 | 	rm -rf ./nvshmem_src_${NVSHMEM_VER_RC}.txz ./nvshmem_src_${NVSHMEM_VER_RC}
22 | 
23 | mpi_bootstrap: ${DEST}/nvshmem_bootstrap_mpi.so
24 | 
25 | clean:
26 | 	rm -rf ./nvshmem_src_${NVSHMEM_VER_RC}.txz ./nvshmem_src_${NVSHMEM_VER_RC} ./${DEST}
27 | 


--------------------------------------------------------------------------------
/cuFFTMp/extra_bootstraps/README.md:
--------------------------------------------------------------------------------
 1 | # Building bootstraps for other versions of MPI
 2 | 
 3 | cuFFTMp uses NVSHMEM. In order to interoperate with MPI, a bootstrap plugin is required. NVSHMEM ships with a bootstrap compatible with HPC-X.
 4 | However, you can easily build your own bootstrap, compatible with another MPI implementation. To do so,
 5 | ```
 6 | MPI_HOME=/path/to/mpi/home/ CUDA_HOME=/path/to/cuda/home DEST=myMPI make mpi_bootstrap
 7 | ```
 8 | will download NVSHMEM, build the bootstrap library and place it in the `myMPI` folder.
 9 | 
10 | After this, you can run any sample by
11 | ```
12 | MPI_HOME=/path/to/my/mpi/ NVSHMEM_LIB="../../extra_bootstraps/myMPI" make run
13 | ```
14 | which effectively places `myMPI` in your `LD_LIBRARY_PATH`.
15 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/c2c/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_c2c
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe)
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/c2c_no_descriptors/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_c2c_no_descriptors
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe)
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/c2c_pencils/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_c2c_pencils
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu ../iterators/box_iterator.hpp
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe)
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/c2c_pencils/README.md:
--------------------------------------------------------------------------------
 1 | # C2C using a custom user distribution (pencils)
 2 | ## Sample description
 3 | This sample is similar to [samples/c2c](../c2c/README.md), where it performs
 4 | - C2C forward transform
 5 | - [Scaling/normalization](../common/README.md)
 6 | - C2C backward transform.
 7 |   
 8 | But this sample assumes pencil decomposition layout:
 9 | - the input data is distributed using a pencil decomposition in X and Y, along Z;
10 | - the output data is distributed using a pencil decomposition in X and Z, along Y.
11 | 
12 | This is achieved using a custom user-defined distribution and `cufftXtSetDistribution`.
13 | 
14 | ## Build and run
15 | This example requires 4 GPUs.
16 | 
17 | See [Requirements](../../README.md) and [Quick start for C++ samples](../../README.md) for hardware/software requirements and build instructions.
18 | 
19 | Example code snippet:
20 | ```
21 | $ MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi/ make run
22 | Hello from rank 3/4 using GPU 3
23 | Hello from rank 1/4 using GPU 1
24 | Hello from rank 0/4 using GPU 0
25 | Hello from rank 2/4 using GPU 2
26 | input data, global 3D index [2,0,0], local index 0, rank 2 is (-0.12801,-0.629836)
27 | input data, global 3D index [2,0,1], local index 1, rank 2 is (-0.948148,0.863082)
28 | [...]
29 | output, global 3D index [0,0,2], local index 0, rank 1 is (-8.45704,12.8481)
30 | output, global 3D index [0,0,3], local index 1, rank 1 is (3.18903,28.6322)
31 | [...]
32 | ```


--------------------------------------------------------------------------------
/cuFFTMp/samples/common.mk:
--------------------------------------------------------------------------------
 1 | MPI_HOME    ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/mpi
 2 | NVSHMEM_LIB ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/nvshmem/lib
 3 | NVSHMEM_INC ?= /opt/nvidia/hpc_sdk/Linux_x86_64/${HPCSDK_VERSION}/comm_libs/nvshmem/include
 4 | CUDA_HOME   ?= $(shell dirname $$(command -v nvcc))/..
 5 | CUFFT_LIB   ?= ../../cufft/lib/
 6 | CUFFT_INC   ?= ../../cufft/include/
 7 | ARCH        ?= $(shell uname -m)
 8 | ifeq ($(ARCH), ppc64le)
 9 | MPI         ?= mpi_ibm
10 | else
11 | MPI         ?= mpi
12 | endif
13 | # Also see https://docs.nvidia.com/cuda/cufftmp/usage/requirements.html for supported architectures
14 | CXXFLAGS = -std=c++17 --generate-code arch=compute_70,code=sm_70 --generate-code arch=compute_75,code=sm_75 --generate-code arch=compute_80,code=sm_80 --generate-code arch=compute_86,code=sm_86 --generate-code arch=compute_89,code=sm_89 --generate-code arch=compute_90,code=sm_90 --generate-code arch=compute_100,code=sm_100
15 | INCFLAGS = -I${CUFFT_INC} -I${NVSHMEM_INC} -I${MPI_HOME}/include
16 | LDFLAGS  = -lcuda -L${CUFFT_LIB} -L${NVSHMEM_LIB}  -lcufftMp -lnvshmem_device -lnvshmem_host -L${MPI_HOME}/lib -l${MPI}
17 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/common/README.md:
--------------------------------------------------------------------------------
 1 | # Auxiliary functions for samples
 2 | 
 3 | This folder contains a few auxiliary functions for various samples.
 4 | 
 5 | ## error_checks.hpp
 6 | - error_check: Compute the global L2 norm between reference and test values by using `BoxIterator`.
 7 | - assess_error: Assess the error based on some tolerance (default: `tolerance = 1e-6`). This also produces a print statement on the MPI rank 0.
 8 | 
 9 | ## generate_random.hpp
10 | - Two generate_random functions that generate real or complex values in a `std::vector`
11 | 
12 | ## scaling.cuh
13 | - scaling_kernel: Normalize entries in the box with a constant scaling factor using `BoxIterator`. By default, entries corresponding to the first 10 threads are printed for illustration. This kernel serves as an example of intermediate operations that can be done between two Fourier transforms.


--------------------------------------------------------------------------------
/cuFFTMp/samples/common/generate_random.hpp:
--------------------------------------------------------------------------------
 1 | #include <random>
 2 | #include <vector>
 3 | #include <complex>
 4 | // Fills `data` with complex values; real and imaginary parts come from uniform_real_distribution(-1, 1) over mt19937(seed).
 5 | inline void generate_random(std::vector<std::complex<float>>& data, int seed) {  // inline: defined in a header
 6 |     std::mt19937 gen(seed);
 7 |     std::uniform_real_distribution<float> dist(-1, 1);
 8 |     for(auto& v: data) {
 9 |         float r = dist(gen);
10 |         float i = dist(gen);
11 |         v = {r, i};
12 |     }
13 | }
14 | // Fills `data` with real values drawn from uniform_real_distribution(-1, 1) over mt19937(seed).
15 | inline void generate_random(std::vector<float>& data, int seed) {  // inline: defined in a header
16 |     std::mt19937 gen(seed);
17 |     std::uniform_real_distribution<float> dist(-1, 1);
18 |     for(auto& v: data) {
19 |         v = dist(gen);
20 |     }
21 | }


--------------------------------------------------------------------------------
/cuFFTMp/samples/common/scaling.cuh:
--------------------------------------------------------------------------------
 1 | #include "../iterators/box_iterator.hpp"
 2 | 
 3 | __global__
 4 | void scaling_kernel(BoxIterator<cufftComplex> begin, BoxIterator<cufftComplex> end, int rank, int size, size_t nx, size_t ny, size_t nz, bool printing = true) {
 5 |     const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 6 |     begin += tid;
 7 |     if(begin < end) {
 8 |         // begin.x(), begin.y() and begin.z() are the global 3D coordinate of the data pointed by the iterator
 9 |         // begin->x and begin->y are the real and imaginary part of the corresponding cufftComplex element
10 |         if(tid < 10 && printing) {
11 |             printf("GPU data (after first transform): global 3D index [%d %d %d], local index %d, rank %d is (%f,%f)\n", 
12 |                 (int)begin.x(), (int)begin.y(), (int)begin.z(), (int)begin.i(), rank, begin->x, begin->y);
13 |         }
14 |         *begin = {begin->x / (float)(nx * ny * nz), begin->y / (float)(nx * ny * nz)};
15 |     }
16 | };


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_r2c_c2r
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r_no_descriptors/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_r2c_c2r_no_descriptors
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe) 
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r_pencils/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_r2c_c2r_pencils
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe) 
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r_pencils_cufftMpMakePlan/README.md:
--------------------------------------------------------------------------------
 1 | # R2C_C2R Sample using a custom user distribution (pencils)
 2 | ## Sample description
 3 | This sample is similar to [samples/r2c_c2r_pencils](../r2c_c2r_pencils/README.md), where it performs
 4 | - R2C forward transform
 5 | - [Scaling/normalization](../common/README.md)
 6 | - C2R backward transform.
 7 | 
 8 | But this sample uses the new API `cufftMpMakePlanDecomposition` where the box coordinates for data decomposition across ranks/PEs as well as the communicator are passed directly to the planning function.
 9 | 
10 | ## Build and run
11 | This example requires 4 GPUs.
12 | 
13 | See [Requirements](../../README.md) and [Quick start for C++ samples](../../README.md) for hardware/software requirements and build instructions.
14 | 
15 | Example code snippet:
16 | ```
17 | $ MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi/ make run
18 | Hello from rank 1/4 using GPU 1
19 | Hello from rank 3/4 using GPU 3
20 | Hello from rank 2/4 using GPU 2
21 | Hello from rank 0/4 using GPU 0
22 | Input data, global 3D index [0,2,0], local index 0, rank 1 is -0.165956
23 | [...]
24 | GPU data (after first transform): global 3D index [0 4 3], local index 9, rank 1 is (0.412567,-9.293055)
25 | PASSED with L2 error 1.156259e-07 < 1.000000e-06
26 | [...]
27 | ```
28 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r_shared_scratch/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_r2c_c2r_shared_scratch
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe)
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/r2c_c2r_slabs_GROMACS/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_r2c_c2r_slabs_GROMACS
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 4 $(exe) 
19 | 


--------------------------------------------------------------------------------
/cuFFTMp/samples/reshape/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | 
 3 | exe = cufftmp_reshape
 4 | 
 5 | all: $(exe)
 6 | 
 7 | .PHONY: clean
 8 | 
 9 | clean: 
10 | 	rm -rf $(exe)
11 | 
12 | $(exe): $(exe).cu
13 | 	${CUDA_HOME}/bin/nvcc $< -o $@ ${CXXFLAGS} ${INCFLAGS} ${LDFLAGS}
14 | 
15 | build: $(exe)
16 | 
17 | run: $(exe)
18 | 	LD_LIBRARY_PATH="${NVSHMEM_LIB}:${CUFFT_LIB}:${LD_LIBRARY_PATH}" mpirun -oversubscribe -n 2 $(exe)
19 | 


--------------------------------------------------------------------------------
/cuPQC/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | NVCC_FLAGS=-dlto -arch=native -std=c++17 -O3 
 3 | 
 4 | 
 5 | CUPQC_DIR?=/usr/lib/nvidia/cupqc-pkg-0.3.0/
 6 | CUPQC_INCLUDE_DIR=$(CUPQC_DIR)/include/cupqc
 7 | COMMONDX_INCLUDE_DIR=$(CUPQC_DIR)/include/
 8 | #/commondx
 9 | 
10 | CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`)
11 | CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include
12 | 
13 | SRCS = $(wildcard *.cu)
14 | TARGETS=$(patsubst %.cu,%,$(SRCS))
15 | # Build each example from its .cu source, linking against the cupqc and cuhash libraries.
16 | $(TARGETS): %: %.cu
17 | 	$(NVCC) $(NVCC_FLAGS) -L$(CUPQC_DIR)/lib/ -lcupqc -lcuhash -o $@ $<  -I$(COMMONDX_INCLUDE_DIR) -I$(CUPQC_INCLUDE_DIR) 
18 | .PHONY: all clean
19 | 
20 | all: $(TARGETS) 
21 | 
22 | clean:
23 | 	rm -f $(TARGETS)
24 | 
25 | .DEFAULT_GOAL := all
26 | 


--------------------------------------------------------------------------------
/cuPQC/README.md:
--------------------------------------------------------------------------------
 1 | # cuPQC Library - API Examples
 2 | 
 3 | All examples are shipped within [cuPQC Software Development Kit](https://developer.nvidia.com/cupqc-downloads).
 4 | 
 5 | ## Description
 6 | 
 7 | This folder demonstrates how to use the libraries stored in the cuPQC SDK: cuPQC and cuHash.
 8 | 
 9 | * [cuPQC download page](https://developer.nvidia.com/cupqc-downloads)
10 | * [cuPQC API documentation](https://docs.nvidia.com/cuda/cupqc/index.html)
11 | 
12 | ## Requirements
13 | 
14 | * [cuPQC SDK](https://developer.nvidia.com/cupqc-downloads)
15 | * [See cuPQC SDK requirements](https://docs.nvidia.com/cuda/cupqc/requirements.html)
16 | * Linux system with installed NVIDIA drivers
17 | * NVIDIA GPU of Volta (SM70) or newer architecture
18 | 
19 | ## Build
20 | Download and expand the cuPQC SDK, then use the Makefile located in this directory. Make sure that you set `CUPQC_DIR` to the location of your expanded cuPQC SDK folder.
21 | 
22 | ```
23 | export CUPQC_DIR=<your_path_to_cupqc>
24 | make
25 | // Run
26 | ./example_ml_kem
27 | ./example_ml_dsa
28 | ./example_sha2
29 | ./example_sha3
30 | ```
31 | 
32 | ## Examples
33 | There are ML-KEM and ML-DSA examples in this directory; these demonstrate the usage of the cuPQC library and require `libcupqc.a`. 
34 | There are also SHA2 and SHA3 examples that demonstrate the usage of the cuHash library; these require `libcuhash.a`. 
35 | For detailed descriptions of the examples, please visit the [Examples](https://docs.nvidia.com/cuda/cupqc/examples.html) section of the cuPQC documentation.
36 | 
37 | 


--------------------------------------------------------------------------------
/cuPQC/example_sha3.cu:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <iomanip>
 3 | #include <cuhash.hpp>
 4 | #include <stdio.h>
 5 | 
 6 | using namespace cupqc;
 7 | 
 8 | using SHA3_256_WARP = decltype(SHA3_256() + Warp());
 9 | 
10 | __global__ void hash_sha3_kernel(uint8_t* digest, const uint8_t* msg, size_t inbuf_len)
11 | {
12 |     SHA3_256_WARP hash {};
13 |     hash.reset();
14 |     hash.update(msg, inbuf_len);
15 |     hash.finalize();
16 |     hash.digest(digest, SHA3_256_WARP::digest_size);
17 | }
18 | 
19 | void hash_sha3(std::vector<uint8_t>& digest, std::vector<uint8_t>& msg)
20 | {
21 |     uint8_t* d_msg;
22 |     uint8_t* d_digest;
23 |     cudaMalloc(reinterpret_cast<void**>(&d_msg), msg.size());
24 |     cudaMalloc(reinterpret_cast<void**>(&d_digest), digest.size());
25 | 
26 |     cudaMemcpy(d_msg, msg.data(), msg.size(), cudaMemcpyHostToDevice);
27 | 
28 |     hash_sha3_kernel<<<1, 32>>>(d_digest, d_msg, msg.size());
29 | 
30 |     cudaMemcpy(digest.data(), d_digest, digest.size(), cudaMemcpyDeviceToHost);
31 | 
32 |     cudaFree(d_msg);
33 |     cudaFree(d_digest);
34 | }
35 | 
36 | int main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) {
37 |     const char * msg_str = "The quick brown fox jumps over the lazy dog";
38 |     std::vector<uint8_t> msg(reinterpret_cast<const uint8_t*>(msg_str), reinterpret_cast<const uint8_t*>(msg_str) + strlen(msg_str));
39 |     std::vector<uint8_t> digest(SHA3_256::digest_size, 0);
40 |     hash_sha3(digest, msg);
41 |     printf("SHA3-256: ");
42 |     for (uint8_t num : digest) {
43 |         printf("%02x", num);
44 |     }
45 |     printf("\n");
46 | }
47 | 


--------------------------------------------------------------------------------
/cuRAND/.gitignore:
--------------------------------------------------------------------------------
1 | format/


--------------------------------------------------------------------------------
/cuRAND/Host/mrg32k3a/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/mrg32k3a/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_mrg32k3a_uniform_example \
 7 | 				curand_mrg32k3a_normal_example \
 8 | 				curand_mrg32k3a_lognormal_example\
 9 | 				curand_mrg32k3a_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/mt19937/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/mt19937/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_mt19937_uniform_example \
 7 | 				curand_mt19937_normal_example \
 8 | 				curand_mt19937_lognormal_example\
 9 | 				curand_mt19937_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/mtgp32/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/mtgp32/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_mtgp32_uniform_example \
 7 | 				curand_mtgp32_normal_example \
 8 | 				curand_mtgp32_lognormal_example\
 9 | 				curand_mtgp32_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/philox/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/philox/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_philox_uniform_example \
 7 | 				curand_philox_normal_example \
 8 | 				curand_philox_lognormal_example\
 9 | 				curand_philox_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/scrambled_sobol32/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/scrambled_sobol32/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_scrambled_sobol32_uniform_example \
 7 | 				curand_scrambled_sobol32_normal_example \
 8 | 				curand_scrambled_sobol32_lognormal_example\
 9 | 				curand_scrambled_sobol32_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/scrambled_sobol64/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/scrambled_sobol64/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_scrambled_sobol64_uniform_example \
 7 | 				curand_scrambled_sobol64_normal_example \
 8 | 				curand_scrambled_sobol64_lognormal_example\
 9 | 				curand_scrambled_sobol64_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/sobol32/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/sobol32/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_sobol32_uniform_example \
 7 | 				curand_sobol32_normal_example \
 8 | 				curand_sobol32_lognormal_example\
 9 | 				curand_sobol32_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/sobol64/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/sobol64/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_sobol64_uniform_example \
 7 | 				curand_sobol64_normal_example \
 8 | 				curand_sobol64_lognormal_example \
 9 | 				curand_sobol64_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuRAND/Host/xorwow/.gitignore:
--------------------------------------------------------------------------------
1 | /build


--------------------------------------------------------------------------------
/cuRAND/Host/xorwow/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_TOOLKIT := $(shell dirname $$(command -v nvcc))/..
 2 | INC          := -I$(CUDA_TOOLKIT)/include -I../../utils
 3 | LIBS         := -L$(CUDA_TOOLKIT)/lib64 -lcudart -lcurand
 4 | FLAGS        := -O3 -std=c++11
 5 | # CUDA_TOOLKIT is the parent of the directory holding the nvcc found on PATH. ROUTINES lists the example binaries to build.
 6 | ROUTINES	 := curand_xorwow_uniform_example \
 7 | 				curand_xorwow_normal_example \
 8 | 				curand_xorwow_lognormal_example\
 9 | 				curand_xorwow_poisson_example
10 | # Default target: build every binary listed in ROUTINES.
11 | all: $(ROUTINES)
12 | # Pattern rule: build <name> from <name>.cpp in a single nvcc compile-and-link step.
13 | %: %.cpp
14 | 	nvcc $(FLAGS) $(INC) $^ -o $@ $(LIBS)
15 | # Remove the built example binaries.
16 | clean:
17 | 	rm -f $(ROUTINES)
18 | # NOTE(review): 'test' is declared phony here, but no 'test' rule is defined in this Makefile.
19 | .PHONY: clean all test
20 | 


--------------------------------------------------------------------------------
/cuSOLVER/MgGetrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/MgPotrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/MgSyevd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgeev/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgeqrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgesvd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgesvdp/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgesvdr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xgetrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xpotrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xsyevd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xsyevdx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/Xtrtri/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/csrqr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/gesv/.gitignore:
--------------------------------------------------------------------------------
1 | build


--------------------------------------------------------------------------------
/cuSOLVER/gesvd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/gesvdaStridedBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/gesvdj/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/gesvdjBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/getrf/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/orgqr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/ormqr/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/potrfBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/syevd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/syevdx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/syevj/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/syevjBatched/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/sygvd/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/sygvdx/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVER/sygvj/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/cuSOLVERMp/.gitignore:
--------------------------------------------------------------------------------
1 | mp_getrf_getrs
2 | mp_potrf_potrs
3 | 


--------------------------------------------------------------------------------
/cuSOLVERSp2cuDSS/test_complex.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate complex general
 2 | %-------------------------------------------------------------------------------
 3 | 12 12 46
 4 |      1   1         10 0
 5 |      2   1          1 0
 6 |      3   1          1 0
 7 |      1   2          1 0
 8 |      2   2         10 0
 9 |      4   2          1 0
10 |      5   2          2 0
11 |      1   3          1 0
12 |      3   3         10 0
13 |      5   3          1 0
14 |      6   3          1 0
15 |      2   4          1 0
16 |      4   4         10 0
17 |      7   4          1 0
18 |      8   4          1 0
19 |      2   5          2 0
20 |      3   5          1 0
21 |      5   5         10 0
22 |      8   5          2 0
23 |      9   5          2 0
24 |      3   6          1 0
25 |      6   6         10 0
26 |      9   6          1 0
27 |      4   7          1 0
28 |      7   7         10 0
29 |     10   7          2 0
30 |      4   8          1 0
31 |      5   8          2 0
32 |      8   8         10 0
33 |     10   8          2 0
34 |     11   8          1 0
35 |      5   9          2 0
36 |      6   9          1 0
37 |      9   9         10 0
38 |     11   9          1 0
39 |      7   10         2 0
40 |      8   10         2 0
41 |     10   10        10 0
42 |     12   10         2 0
43 |      8   11         1 0
44 |      9   11         1 0
45 |     11   11        10 0
46 |     12   11         1 0
47 |     10   12         2 0
48 |     11   12         1 0
49 |     12   12        10 0
50 | 


--------------------------------------------------------------------------------
/cuSOLVERSp2cuDSS/test_real.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate real general
 2 | %-------------------------------------------------------------------------------
 3 | 12 12 46
 4 |      1   1         10
 5 |      2   1          1
 6 |      3   1          1
 7 |      1   2          1
 8 |      2   2         10
 9 |      4   2          1
10 |      5   2          2
11 |      1   3          1
12 |      3   3         10
13 |      5   3          1
14 |      6   3          1
15 |      2   4          1
16 |      4   4         10
17 |      7   4          1
18 |      8   4          1
19 |      2   5          2
20 |      3   5          1
21 |      5   5         10
22 |      8   5          2
23 |      9   5          2
24 |      3   6          1
25 |      6   6         10
26 |      9   6          1
27 |      4   7          1
28 |      7   7         10
29 |     10   7          2
30 |      4   8          1
31 |      5   8          2
32 |      8   8         10
33 |     10   8          2
34 |     11   8          1
35 |      5   9          2
36 |      6   9          1
37 |      9   9         10
38 |     11   9          1
39 |      7   10         2
40 |      8   10         2
41 |     10   10        10
42 |     12   10         2
43 |      8   11         1
44 |      9   11         1
45 |     11   11        10
46 |     12   11         1
47 |     10   12         2
48 |     11   12         1
49 |     12   12        10
50 | 


--------------------------------------------------------------------------------
/cuSPARSE/axpby/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseAxpby`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseAxpby` for performing *sparse vector - dense vector scaling and sum*.
 6 | 
 7 | [cusparseAxpby Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-axpby)
 8 | 
 9 | <center>
10 | 
11 | `Y = alpha * X + beta * Y`
12 | 
13 | ![](axpby.png)
14 | </center>
15 | 
16 | ## Building
17 | 
18 | * Command line
19 |     ```bash
20 |     nvcc -I<cuda_toolkit_path>/include axpby_example.c -o axpby_example -lcusparse
21 |     ```
22 | 
23 | * Linux
24 |     ```bash
25 |     make
26 |     ```
27 | 
28 | * Windows/Linux
29 |     ```bash
30 |     mkdir build
31 |     cd build
32 |     cmake ..
33 |     make
34 |     ```
35 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
36 | 
37 | ## Support
38 | 
39 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
40 | * **Supported OSes:** Linux, Windows, QNX, Android
41 | * **Supported CPU Architectures**: x86_64, arm64
42 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
43 | * **Language**: `C99`
44 | 
45 | ## Prerequisites
46 | 
47 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
48 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
49 | 


--------------------------------------------------------------------------------
/cuSPARSE/axpby/axpby.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/axpby/axpby.png


--------------------------------------------------------------------------------
/cuSPARSE/bicgstab/BiCGStab.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/bicgstab/BiCGStab.pdf


--------------------------------------------------------------------------------
/cuSPARSE/bicgstab/BiCGStab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/bicgstab/BiCGStab.png


--------------------------------------------------------------------------------
/cuSPARSE/cg/cg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/cg/cg.pdf


--------------------------------------------------------------------------------
/cuSPARSE/cg/cg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/cg/cg.png


--------------------------------------------------------------------------------
/cuSPARSE/coosort/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE APIs - `cusparseXcoosort`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseXcoosortByRow` to sort a sparse matrix stored in COO format by row.
 6 | 
 7 | [cusparseXcoosort Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#coosort)
 8 | 
 9 | ## Building
10 | 
11 | * Command line
12 |     ```bash
13 |     nvcc -I<cuda_toolkit_path>/include coosort_example.c -o coosort_example -lcusparse
14 |     ```
15 | 
16 | * Linux
17 |     ```bash
18 |     make
19 |     ```
20 | 
21 | * Windows/Linux
22 |     ```bash
23 |     mkdir build
24 |     cd build
25 |     cmake ..
26 |     make
27 |     ```
28 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
29 | 
30 | ## Support
31 | 
32 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
33 | * **Supported OSes:** Linux, Windows, QNX, Android
34 | * **Supported CPU Architectures**: x86_64, arm64
35 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
36 | * **Language**: `C99`
37 | 
38 | ## Prerequisites
39 | 
40 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
41 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
42 | 


--------------------------------------------------------------------------------
/cuSPARSE/dense2sparse_blockedell/dense2sparse_blockedell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/dense2sparse_blockedell/dense2sparse_blockedell.png


--------------------------------------------------------------------------------
/cuSPARSE/dense2sparse_csr/dense2sparse_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/dense2sparse_csr/dense2sparse_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/gather/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseGather`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseGather` for performing *sparse vector - dense vector element gathering*.
 6 | 
 7 | [cusparseGather Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-gather)
 8 | 
 9 | <center>
10 | 
11 | ![](gather.png)
12 | </center>
13 | 
14 | ## Building
15 | 
16 | * Command line
17 |     ```bash
18 |     nvcc -I<cuda_toolkit_path>/include gather_example.c -o gather_example -lcusparse
19 |     ```
20 | 
21 | * Linux
22 |     ```bash
23 |     make
24 |     ```
25 | 
26 | * Windows/Linux
27 |     ```bash
28 |     mkdir build
29 |     cd build
30 |     cmake ..
31 |     make
32 |     ```
33 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
34 | 
35 | ## Support
36 | 
37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
38 | * **Supported OSes:** Linux, Windows, QNX, Android
39 | * **Supported CPU Architectures**: x86_64, arm64
40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
41 | * **Language**: `C99`
42 | 
43 | ## Prerequisites
44 | 
45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
47 | 


--------------------------------------------------------------------------------
/cuSPARSE/gather/gather.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/gather/gather.png


--------------------------------------------------------------------------------
/cuSPARSE/graph_capture/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `CUDA Graph Capture`
 2 | 
 3 | ## Description
 4 | 
 5 | The sample demonstrates how to optimize *sparse vector - dense vector dot product* (`cusparseSpVV`) by exploiting *CUDA Graph Capture functionality*.
 6 | 
 7 | [cuSPARSE Optimization Notes](https://docs.nvidia.com/cuda/cusparse/index.html#optimization-notes)
 8 | 
 9 | ## Building
10 | 
11 | * Command line
12 |     ```bash
13 |     nvcc -I<cuda_toolkit_path>/include graph_capture_example.c -o graph_capture_example -lcusparse
14 |     ```
15 | 
16 | * Linux
17 |     ```bash
18 |     make
19 |     ```
20 | 
21 | * Windows/Linux
22 |     ```bash
23 |     mkdir build
24 |     cd build
25 |     cmake ..
26 |     make
27 |     ```
28 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
29 | 
30 | ## Support
31 | 
32 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
33 | * **Supported OSes:** Linux, Windows, QNX, Android
34 | * **Supported CPU Architectures**: x86_64, arm64
35 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
36 | * **Language**: `C99`
37 | 
38 | ## Prerequisites
39 | 
40 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
41 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
42 | 


--------------------------------------------------------------------------------
/cuSPARSE/rot/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseRot`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseRot` for performing *Givens rotation*.
 6 | 
 7 | [cusparseRot Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-rot)
 8 | 
 9 | <center>
10 | 
11 | ![](rot.png)
12 | </center>
13 | 
14 | ## Building
15 | 
16 | * Command line
17 |     ```bash
18 |     nvcc -I<cuda_toolkit_path>/include rot_example.c -o rot_example -lcusparse
19 |     ```
20 | 
21 | * Linux
22 |     ```bash
23 |     make
24 |     ```
25 | 
26 | * Windows/Linux
27 |     ```bash
28 |     mkdir build
29 |     cd build
30 |     cmake ..
31 |     make
32 |     ```
33 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
34 | 
35 | ## Support
36 | 
37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
38 | * **Supported OSes:** Linux, Windows, QNX, Android
39 | * **Supported CPU Architectures**: x86_64, arm64
40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
41 | * **Language**: `C99`
42 | 
43 | ## Prerequisites
44 | 
45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
47 | 


--------------------------------------------------------------------------------
/cuSPARSE/rot/rot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/rot/rot.png


--------------------------------------------------------------------------------
/cuSPARSE/scatter/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseScatter`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseScatter` for performing *sparse vector - dense vector element scattering*.
 6 | 
 7 | [cusparseScatter Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-scatter)
 8 | 
 9 | <center>
10 | 
11 | ![](scatter.png)
12 | </center>
13 | 
14 | ## Building
15 | 
16 | * Command line
17 |     ```bash
18 |     nvcc -I<cuda_toolkit_path>/include scatter_example.c -o scatter_example -lcusparse
19 |     ```
20 | 
21 | * Linux
22 |     ```bash
23 |     make
24 |     ```
25 | 
26 | * Windows/Linux
27 |     ```bash
28 |     mkdir build
29 |     cd build
30 |     cmake ..
31 |     make
32 |     ```
33 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
34 | 
35 | ## Support
36 | 
37 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
38 | * **Supported OSes:** Linux, Windows, QNX, Android
39 | * **Supported CPU Architectures**: x86_64, arm64
40 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
41 | * **Language**: `C99`
42 | 
43 | ## Prerequisites
44 | 
45 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
46 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
47 | 


--------------------------------------------------------------------------------
/cuSPARSE/scatter/scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/scatter/scatter.png


--------------------------------------------------------------------------------
/cuSPARSE/sddmm_bsr/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseSDDMM BSR`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseSDDMM` for performing *dense matrix - dense matrix multiplication into sparse matrix*, where the sparse matrix is represented in BSR (Block Sparse Row) storage format.
 6 | 
 7 | [cusparseSDDMM Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-sddmm)
 8 | 
 9 | <center>
10 | 
11 | `C = (alpha * A * B) ° spy(C) + beta * C`
12 | 
13 | ![](sddmm_bsr.png)
14 | </center>
15 | 
16 | ## Building
17 | 
18 | * Linux
19 |     ```bash
20 |     make
21 |     ```
22 | 
23 | * Windows/Linux
24 |     ```bash
25 |     mkdir build
26 |     cd build
27 |     cmake ..
28 |     make
29 |     ```
30 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
31 | 
32 | ## Support
33 | 
34 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6
35 | * **Supported OSes:** Linux, Windows, QNX, Android
36 | * **Supported CPU Architectures**: x86_64, arm64
37 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
38 | * **Language**: `C99`
39 | 
40 | ## Prerequisites
41 | 
42 | * [CUDA 12.1 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
43 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
44 | 


--------------------------------------------------------------------------------
/cuSPARSE/sddmm_bsr/sddmm_bsr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_bsr/sddmm_bsr.png


--------------------------------------------------------------------------------
/cuSPARSE/sddmm_csr/sddmm_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_csr/sddmm_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/sddmm_csr_batched/sddmm_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sddmm_csr_batched/sddmm_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/sparse2dense_csr/sparse2dense_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/sparse2dense_csr/sparse2dense_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/spgemm/spgemm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm/spgemm.png


--------------------------------------------------------------------------------
/cuSPARSE/spgemm_mem/spgemm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm_mem/spgemm.png


--------------------------------------------------------------------------------
/cuSPARSE/spgemm_reuse/spgemm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spgemm_reuse/spgemm.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_blockedell/spmm_blockedell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_blockedell/spmm_blockedell.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_coo/spmm_coo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_coo/spmm_coo.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_coo_batched/spmm_coo_batched.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_coo_batched/spmm_coo_batched.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_csr/spmm_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr/spmm_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_csr_batched/spmm_csr_batched.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr_batched/spmm_csr_batched.png


--------------------------------------------------------------------------------
/cuSPARSE/spmm_csr_op/spmm_csr_op.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmm_csr_op/spmm_csr_op.png


--------------------------------------------------------------------------------
/cuSPARSE/spmv_coo/spmv_coo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_coo/spmv_coo.png


--------------------------------------------------------------------------------
/cuSPARSE/spmv_csr/spmv_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_csr/spmv_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/spmv_sell/spmv_sell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spmv_sell/spmv_sell.png


--------------------------------------------------------------------------------
/cuSPARSE/spsm_coo/spsm_coo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsm_coo/spsm_coo.png


--------------------------------------------------------------------------------
/cuSPARSE/spsm_csr/spsm_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsm_csr/spsm_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/spsv_coo/spsv_coo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_coo/spsv_coo.png


--------------------------------------------------------------------------------
/cuSPARSE/spsv_csr/spsv_csr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_csr/spsv_csr.png


--------------------------------------------------------------------------------
/cuSPARSE/spsv_sell/spsv_sell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spsv_sell/spsv_sell.png


--------------------------------------------------------------------------------
/cuSPARSE/spvv/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSE Generic APIs - `cusparseSpVV`
 2 | 
 3 | ## Description
 4 | 
 5 | This sample demonstrates the usage of `cusparseSpVV` for performing *sparse vector - dense vector dot product*.
 6 | 
 7 | [cusparseSpVV Documentation](https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-function-spvv)
 8 | 
 9 | <center>
10 | 
11 | `result = X * Y` or `result = X^H * Y`
12 | 
13 | ![](spvv.png)
14 | </center>
15 | 
16 | ## Building
17 | 
18 | * Command line
19 |     ```bash
20 |     nvcc -I<cuda_toolkit_path>/include spvv_example.c -o spvv_example -lcusparse
21 |     ```
22 | 
23 | * Linux
24 |     ```bash
25 |     make
26 |     ```
27 | 
28 | * Windows/Linux
29 |     ```bash
30 |     mkdir build
31 |     cd build
32 |     cmake ..
33 |     make
34 |     ```
35 |     On Windows, instead of running the last build step, open the Visual Studio Solution that was created and build.
36 | 
37 | ## Support
38 | 
39 | * **Supported SM Architectures:** SM 5.0, SM 5.2, SM 5.3, SM 6.0, SM 6.1, SM 6.2, SM 7.0, SM 7.2, SM 7.5, SM 8.0, SM 8.6, SM 8.9, SM 9.0
40 | * **Supported OSes:** Linux, Windows, QNX, Android
41 | * **Supported CPU Architectures**: x86_64, arm64
42 | * **Supported Compilers**: gcc, clang, Intel icc, Microsoft msvc, Nvidia HPC SDK nvc
43 | * **Language**: `C99`
44 | 
45 | ## Prerequisites
46 | 
47 | * [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)).
48 | * [CMake 3.9](https://cmake.org/download/) or above on Windows
49 | 


--------------------------------------------------------------------------------
/cuSPARSE/spvv/spvv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/cuSPARSE/spvv/spvv.png


--------------------------------------------------------------------------------
/cuSPARSELt/README.md:
--------------------------------------------------------------------------------
 1 | # cuSPARSELt Library
 2 | 
 3 | ## Description
 4 | 
 5 | This folder demonstrates cuSPARSELt Generic APIs usage.
 6 | 
 7 | [cuSPARSELt Documentation](https://docs.nvidia.com/cuda/cusparselt/index.html)
 8 | 
 9 | ## cuSPARSELt Samples
10 | 
11 | * [Structured Matrix-Matrix Multiplication - Basic Concepts](matmul/)
12 | 
13 |     The sample demonstrates how to exploit *Sparse Tensor Cores* for performing Structured Matrix-Matrix Multiplication
14 | 
15 | * [Batched GEMM, Activation Function, and Bias](matmul_advanced/)
16 | 
17 |     The sample extends the previous code to demonstrate how to perform batched GEMM computation, Split-K, and how to set up the activation function and bias
18 | 


--------------------------------------------------------------------------------
/cuTENSOR/Makefile:
--------------------------------------------------------------------------------
 1 | # Flags shared by every sample. CUTENSOR_ROOT must point at a cuTENSOR
 2 | # installation; its include/ and lib/ subdirectories are used below.
 3 | CXX_FLAGS=-std=c++11 -I${CUTENSOR_ROOT}/include -L${CUTENSOR_ROOT}/lib -lcutensor -lcudart
 4 | 
 5 | # These targets name actions, not files, so make must always run them.
 6 | .PHONY: all run clean
 7 | 
 8 | # Build every sample binary.
 9 | all:
10 | 	nvcc einsum.cu -o  einsum ${CXX_FLAGS}
11 | 	nvcc contraction.cu -o  contraction ${CXX_FLAGS}
12 | 	nvcc contraction_jit.cu -o  contraction_jit ${CXX_FLAGS}
13 | 	nvcc elementwise_binary.cu -o  elementwise_binary ${CXX_FLAGS}
14 | 	nvcc elementwise_permute.cu -o  elementwise_permute ${CXX_FLAGS}
15 | 	nvcc elementwise_trinary.cu -o  elementwise_trinary ${CXX_FLAGS}
16 | 	nvcc reduction.cu -o  reduction ${CXX_FLAGS}
17 | 
18 | # Execute every sample built by `all`, in the same order.
19 | run:
20 | 	./einsum
21 | 	./contraction
22 | 	./contraction_jit
23 | 	./elementwise_binary
24 | 	./elementwise_permute
25 | 	./elementwise_trinary
26 | 	./reduction
27 | 
28 | # Remove every binary produced by `all` (including einsum).
29 | clean:
30 | 	rm -f einsum contraction contraction_jit elementwise_binary elementwise_permute elementwise_trinary reduction
31 | 


--------------------------------------------------------------------------------
/cuTENSOR/README.md:
--------------------------------------------------------------------------------
 1 | # cuTENSOR - Samples
 2 | 
 3 | * [Documentation](https://docs.nvidia.com/cuda/cutensor/index.html)
 4 | 
 5 | # Install
 6 | 
 7 | ## Linux 
 8 | 
 9 | You can use make or cmake to compile the cuTENSOR samples.
10 | 
11 | With make
12 | 
13 | ```
14 | export CUTENSOR_ROOT=<path_to_cutensor_root>
15 | make -j8
16 | ```
17 | 
18 | With cmake
19 | 
20 | ```
21 | mkdir build && cd build
22 | cmake .. -DCUTENSOR_ROOT=<path_to_cutensor_root>
23 | make -j8
24 | ```
25 | 
26 | ## Windows
27 | 
28 | We recommend using cmake with Ninja generator to compile:
29 | 
30 | ```
31 | mkdir build && cd build
32 | cmake .. -DCUTENSOR_ROOT=<path_to_cutensor_root> -G Ninja
33 | ninja
34 | ```
35 | 
36 | To run the examples, make sure the library files are located in a directory included in your `%PATH%`.
37 | 


--------------------------------------------------------------------------------
/cuTENSOR/python/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.h
2 | 


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/NVLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/NVLogo.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/NVLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/NVLogo.png


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/img9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/img9.png


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/img9wm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/img9wm.png


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/cat_baseline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat_baseline.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/cat_grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/cat_grayscale.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img1.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img2.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img3.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img4.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img5.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img6.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img7.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img8.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize-WaterMark/input_images/img9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize-WaterMark/input_images/img9.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/cat_baseline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat_baseline.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/cat_grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/cat_grayscale.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img1.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img2.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img3.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img4.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img5.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img6.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img7.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img8.jpg


--------------------------------------------------------------------------------
/nvJPEG/Image-Resize/input_images/img9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/Image-Resize/input_images/img9.jpg


--------------------------------------------------------------------------------
/nvJPEG/README.md:
--------------------------------------------------------------------------------
 1 | # nvJPEG Library API examples
 2 | 
 3 | ## Description
 4 | 
 5 | This folder demonstrates nvJPEG library API usage.
 6 | 
 7 | ## Key Concepts
 8 | 
 9 | Image encoding and decoding with the nvJPEG library
10 | 
11 | ## Examples
12 | 
13 | [JPEG Image Decoder](nvJPEG-Decoder/)
14 | 
15 | [JPEG Image Decoder MultipleInstances](nvJPEG-Decoder-MultipleInstances/)
16 | 
17 | [JPEG Image Decoder Backend and ROI](nvJPEG-Decoder-Backend-ROI/)
18 | 
19 | [Image Resize](Image-Resize/)
20 | 
21 | [Image Resize Watermarking](Image-Resize-WaterMark/)
22 | 
23 | ## Supported SM Architectures
24 | 
25 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
26 | 
27 | ## Supported OSes
28 | 
29 | Linux, Windows
30 | 
31 | ## Supported CPU Architecture
32 | 
33 | x86_64
34 | 
35 | ## CUDA APIs involved
36 | [NVJPEG](https://docs.nvidia.com/cuda/nvjpeg/index.html)
37 | 
38 | 
39 | ## Prerequisites
40 | - A Linux or Windows system with recent NVIDIA drivers.
41 | - Install the [CUDA 11.0 toolkit and above](https://developer.nvidia.com/cuda-downloads).
42 | 
43 | 


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
12 | 
13 | # ---[ Project specification.
14 | # A single project() call declares the project and enables both languages;
15 | # it also defines PROJECT_NAME, so no manual override is needed.
16 | project(nvJPEGROIDecode LANGUAGES CXX CUDA)
17 | 
18 | # Respect a CUDA standard chosen by the caller; default to 11 otherwise.
19 | if(NOT DEFINED CMAKE_CUDA_STANDARD)
20 |     set(CMAKE_CUDA_STANDARD 11)
21 |     set(CMAKE_CUDA_STANDARD_REQUIRED ON)
22 | endif()
23 | 
24 | set(CMAKE_CXX_STANDARD 11)
25 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
26 | set(CMAKE_CXX_EXTENSIONS OFF)
27 | 
28 | # CUDA toolkit headers are added as SYSTEM include directories.
29 | include_directories(
30 |   SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
31 | )
32 | 
33 | set(EXAMPLES_DESCRIPTOR_SOURCES "nvJPEGROIDecode.cpp")
34 | 
35 | add_executable(nvJPEGROIDecode ${EXAMPLES_DESCRIPTOR_SOURCES})
36 | 
37 | # Locate nvJPEG and the CUDA runtime in the CUDA compiler's implicit link directories.
38 | find_library(NVJPEG_LIB
39 |     NAMES nvjpeg
40 |     PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
41 | 
42 | find_library(CUDART_LIB
43 |     NAMES cudart
44 |     PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
45 | 
46 | # Link the platform's thread library through CMake's Threads package instead
47 | # of hard-coding "pthread", which is not a linkable library with MSVC.
48 | find_package(Threads REQUIRED)
49 | 
50 | target_link_libraries(nvJPEGROIDecode PUBLIC ${NVJPEG_LIB} ${CUDART_LIB} Threads::Threads)
51 | 


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/img9_roi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/img9_roi.png


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_baseline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_baseline.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/cat_grayscale.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img1.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img2.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img3.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img4.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img5.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img6.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img7.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img8.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder-Backend-ROI/input_images/img9.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder-MultipleInstances/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
12 | 
13 | # ---[ Project specification.
14 | # project() is called exactly once; it defines PROJECT_NAME and enables the
15 | # C++ and CUDA languages.
16 | project(nvJPEGDecMultipleInstances LANGUAGES CXX CUDA)
17 | 
18 | # Default to CUDA C++11 unless the caller already selected a CUDA standard.
19 | if(NOT DEFINED CMAKE_CUDA_STANDARD)
20 |     set(CMAKE_CUDA_STANDARD 11)
21 |     set(CMAKE_CUDA_STANDARD_REQUIRED ON)
22 | endif()
23 | 
24 | # Host code is built as C++11 with compiler-specific extensions disabled.
25 | set(CMAKE_CXX_STANDARD 11)
26 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
27 | set(CMAKE_CXX_EXTENSIONS OFF)
28 | 
29 | # Add the CUDA toolkit include directories as system include paths.
30 | include_directories(
31 |   SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
32 | )
33 | 
34 | set(EXAMPLES_DESCRIPTOR_SOURCES "nvJPEGDecMultipleInstances.cpp")
35 | 
36 | add_executable(nvJPEGDecMultipleInstances ${EXAMPLES_DESCRIPTOR_SOURCES})
37 | 
38 | # Locate nvJPEG and the CUDA runtime, additionally searching the directories
39 | # the CUDA compiler links against implicitly.
40 | find_library(NVJPEG_LIB
41 |     NAMES nvjpeg
42 |     PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
43 | 
44 | find_library(CUDART_LIB
45 |     NAMES cudart
46 |     PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
47 | 
48 | # Resolve the platform thread library through CMake's Threads package rather
49 | # than hard-coding "pthread", so the correct flag/library is chosen per platform.
50 | find_package(Threads REQUIRED)
51 | 
52 | target_link_libraries(nvJPEGDecMultipleInstances PUBLIC ${NVJPEG_LIB} ${CUDART_LIB} Threads::Threads)
53 | 


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat_baseline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat_baseline.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/cat_grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/cat_grayscale.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img1.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img2.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img3.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img4.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img5.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img6.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img7.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img8.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Decoder/input_images/img9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG/nvJPEG-Decoder/input_images/img9.jpg


--------------------------------------------------------------------------------
/nvJPEG/nvJPEG-Encoder-MultipleInstances/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 | # 
 4 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 5 | # and proprietary rights in and to this software, related documentation
 6 | # and any modifications thereto. Any use, reproduction, disclosure or
 7 | # distribution of this software and related documentation without an express
 8 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 9 | # 
10 | 
11 | # CMake 3.18 is the first release that can satisfy the cuda_std_17 compile
12 | # feature requested below, so it is the effective minimum for this script.
13 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
14 | 
15 | 
16 | # Optional cross-compilation for 64-bit ARM Linux. The compiler variables are
17 | # set before project() so they are in place when the languages are enabled.
18 | option(CROSS_COMPILE_AARCH64 "Cross compile for ARM64" OFF)
19 | 
20 | if(CROSS_COMPILE_AARCH64)
21 |     set(CMAKE_SYSTEM_NAME Linux)
22 |     set(CMAKE_SYSTEM_PROCESSOR aarch64)
23 |     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
24 |     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
25 |     set(CMAKE_CUDA_HOST_COMPILER aarch64-linux-gnu-g++)
26 | endif()
27 | 
28 | project(nvJPEGEncMultipleInstances LANGUAGES CXX CUDA)
29 | 
30 | # Threads and the CUDA toolkit (12.9 or newer) are hard requirements.
31 | find_package(Threads REQUIRED)
32 | find_package(CUDAToolkit 12.9 REQUIRED)
33 | 
34 | add_executable(nvJPEGEncMultipleInstances "nvJPEGEncMultipleInstances.cpp")
35 | 
36 | # Require at least C++17 for both host and CUDA compilation of this target.
37 | target_compile_features(nvJPEGEncMultipleInstances PRIVATE
38 |     cxx_std_17
39 |     cuda_std_17)
40 | 
41 | # Link against the imported targets provided by FindCUDAToolkit and FindThreads.
42 | target_link_libraries(nvJPEGEncMultipleInstances PUBLIC
43 |     CUDA::nvjpeg
44 |     CUDA::cudart_static
45 |     Threads::Threads)


--------------------------------------------------------------------------------
/nvJPEG2000/README.md:
--------------------------------------------------------------------------------
 1 | # nvJPEG2000 Library API examples
 2 | 
 3 | ## Description
 4 | 
 5 | This folder demonstrates nvJPEG2000 library API usage.
 6 | 
 7 | ## Key Concepts
 8 | 
 9 | Image Decoding from NVJPEG2000 Library
10 | 
11 | ## Examples
12 | 
13 | - [JPEG2000 Image Decoder](nvJPEG2000-Decoder/)
14 | - [JPEG2000 Image Decoder Pipelined](nvJPEG2000-Decoder-Pipelined/)
15 | - [JPEG2000 Image Decoder Tile Partial](nvJPEG2000-Decoder-Tile-Partial/)
16 | 
17 | ## Supported SM Architectures
18 | 
19 |   [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
20 | 
21 | ## Supported OSes
22 | 
23 | Linux, Windows
24 | 
25 | ## Supported CPU Architecture
26 | 
27 | x86_64
28 | 
29 | ## CUDA APIs involved
30 | [NVJPEG2000](https://docs.nvidia.com/cuda/nvjpeg2000/index.html)
31 | 
32 | 
33 | ## Prerequisites
34 | - A Linux system with recent NVIDIA drivers.
35 | - Install the [CUDA 11.0 toolkit](https://developer.nvidia.com/cuda-downloads).
36 | 
37 | 


--------------------------------------------------------------------------------
/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossless/2k_lossless.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossless/2k_lossless.jp2


--------------------------------------------------------------------------------
/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossy/2k_lossy.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/2k_image_lossy/2k_lossy.jp2


--------------------------------------------------------------------------------
/nvJPEG2000/nvJPEG2000-Decoder/images/4k_image_lossy/4k_lossy.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Decoder/images/4k_image_lossy/4k_lossy.jp2


--------------------------------------------------------------------------------
/nvJPEG2000/nvJPEG2000-Encoder/images/TestImage640x480.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvJPEG2000/nvJPEG2000-Encoder/images/TestImage640x480.bmp


--------------------------------------------------------------------------------
/nvTIFF/README.md:
--------------------------------------------------------------------------------
 1 | # nvTIFF Library API examples
 2 | 
 3 | ## Description
 4 | 
 5 | This folder demonstrates nvTIFF library API usage.
 6 | 
 7 | ## Key Concepts
 8 | 
 9 | TIFF Image Decoding and Encoding from nvTIFF Library
10 | 
11 | ## Examples
12 | 
13 | [TIFF Image Decoder Encoder](nvTIFF-Decode-Encode/)
14 | 
15 | [GeoTIFF Image Decoder](nvTIFF-GeoTIFF-Decode/)
16 | 
17 | [TIFF Image Decoding with ROI](nvTIFF-Decode-Image-ROI/)
18 | 
19 | 
20 | ## Supported SM Architectures
21 | 
22 |   [SM 6.0 +](https://developer.nvidia.com/cuda-gpus)
23 | 
24 | ## Supported OSes
25 | 
26 | Linux, Windows
27 | 
28 | ## Supported CPU Architecture
29 | 
30 | x86_64, arm64-sbsa, aarch64-jetson
31 | 
32 | ## CUDA APIs involved
33 | [nvTIFF](https://docs.nvidia.com/cuda/nvtiff/index.html)
34 | 
35 | 
36 | ## Prerequisites
37 | - A Linux system with recent NVIDIA drivers.
38 | - Install the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads).
39 | - [nvCOMP](https://developer.nvidia.com/nvcomp-download) for Deflate decompression support 
40 | 
41 | 


--------------------------------------------------------------------------------
/nvTIFF/nvTIFF-Decode-Encode/images/bali_notiles.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvTIFF/nvTIFF-Decode-Encode/images/bali_notiles.tif


--------------------------------------------------------------------------------
/nvTIFF/nvTIFF-GeoTIFF-Decode/images/bali_notiles.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/CUDALibrarySamples/1b438224a85a9580caea2acdc0bcaf632e42e987/nvTIFF/nvTIFF-GeoTIFF-Decode/images/bali_notiles.tif


--------------------------------------------------------------------------------