├── .gitignore ├── res ├── app.ico └── app.rc ├── doc ├── releases │ └── 2_0_1 │ │ ├── SHA256SUMS │ │ └── SHA256SUMS.sig └── gpg_keys │ └── xmrig.asc ├── src ├── CudaCryptonightR_gen.h ├── KawPow │ └── raven │ │ ├── CudaKawPow_gen.h │ │ ├── KawPow_dag.h │ │ └── KawPow.cu ├── cuda_fast_div_heavy.hpp ├── RandomX │ ├── arqma │ │ ├── randomx_arqma.cu │ │ └── configuration.h │ ├── defyx │ │ ├── randomx_defyx.cu │ │ └── configuration.h │ ├── graft │ │ ├── randomx_graft.cu │ │ └── configuration.h │ ├── keva │ │ ├── randomx_keva.cu │ │ └── configuration.h │ ├── yada │ │ ├── randomx_yada.cu │ │ └── configuration.h │ ├── monero │ │ ├── randomx_monero.cu │ │ └── configuration.h │ ├── wownero │ │ ├── randomx_wownero.cu │ │ └── configuration.h │ ├── common.hpp │ ├── randomx.cu │ └── hash.hpp ├── crypto │ ├── cn │ │ ├── c_blake256.h │ │ └── CnAlgo.h │ └── common │ │ ├── Algorithm.cpp │ │ └── Algorithm.h ├── version.h ├── 3rdparty │ └── cub │ │ ├── config.cuh │ │ ├── util_deprecated.cuh │ │ ├── util_namespace.cuh │ │ ├── version.cuh │ │ ├── util_compiler.cuh │ │ ├── util_macro.cuh │ │ ├── grid │ │ └── grid_mapping.cuh │ │ ├── util_cpp_dialect.cuh │ │ ├── util_debug.cuh │ │ ├── thread │ │ └── thread_reduce.cuh │ │ ├── block │ │ └── block_raking_layout.cuh │ │ ├── util_arch.cuh │ │ └── iterator │ │ └── cache_modified_input_iterator.cuh ├── common │ └── utils │ │ └── timestamp.h ├── cuda_fast_int_math_v2.hpp ├── cuda_device.hpp ├── cuda_extra.h ├── xmrig-cuda.h ├── cuda_blake.hpp ├── cryptonight.h ├── cuda_keccak.hpp └── CudaCryptonightR_gen.cpp ├── cmake ├── CUDA-Version.cmake ├── os.cmake ├── cpu.cmake └── flags.cmake ├── .github └── workflows │ └── test.yml ├── README.md ├── CMakeLists.txt └── CHANGELOG.md /.gitignore: -------------------------------------------------------------------------------- 1 | /CMakeLists.txt.user 2 | /build 3 | -------------------------------------------------------------------------------- /res/app.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoneroOcean/xmrig-cuda/HEAD/res/app.ico -------------------------------------------------------------------------------- /doc/releases/2_0_1/SHA256SUMS: -------------------------------------------------------------------------------- 1 | e12d814f584cecbb6cec9c394e49989f53ae61d4079c51c7682d938a03963b96 xmrig-proxy-5.0.0-gcc-win32.zip 2 | 0a5e143c979ef163247439da3049492ecfad49355e34ae371b14a8e08529230e xmrig-proxy-5.0.0-msvc-win64.zip 3 | af7564afbb7e69aea52e6a2a945cb04caa09a993a468d5c1fdd73c3d337da05e xmrig-proxy-5.0.0-xenial-x64.tar.gz 4 | -------------------------------------------------------------------------------- /src/CudaCryptonightR_gen.h: -------------------------------------------------------------------------------- 1 | #ifndef XMRIG_CUDACRYPTONIGHTR_GEN_H 2 | #define XMRIG_CUDACRYPTONIGHTR_GEN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, uint64_t height, int arch_major, int arch_minor, bool background = false); 9 | 10 | #endif // XMRIG_CUDACRYPTONIGHTR_GEN_H 11 | -------------------------------------------------------------------------------- /src/KawPow/raven/CudaKawPow_gen.h: -------------------------------------------------------------------------------- 1 | #ifndef XMRIG_CUDAKAWPOW_GEN_H 2 | #define XMRIG_CUDAKAWPOW_GEN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false); 9 | void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift); 10 | 11 | #endif // XMRIG_CUDAKAWPOW_GEN_H 12 | -------------------------------------------------------------------------------- /doc/releases/2_0_1/SHA256SUMS.sig: 
-------------------------------------------------------------------------------- 1 | -----BEGIN PGP SIGNATURE----- 2 | 3 | iQEzBAABCgAdFiEEmsTOqOZuNaXHzdwbRGpTY4vpRAkFAl3Vl4YACgkQRGpTY4vp 4 | RAlj+Qf/TeSwcQ7HoDeCk7kAVTu25gZDf/gqTyYVNPt8x4pSjc0ofxXNo/q0Yrla 5 | Dy5Ovjy0ZHJVYAC3vYdaDEaTWkZ0DVCytYDHEtsOgaA4jQm5baHGyIjREq1II8sl 6 | QU27VhiOsX39jxrV4bGJvSkgLRpljFSIlbwn8+yP+sCwPMJ4MMEoJCC60agIsZBu 7 | PsJQGVxAJ/n3nk2zvUuz/5DGqFyeOJ2MjqnLcaP6IoJ/PHxUngVi7k9qIggi6EFg 8 | Ou/M0VMNpSo9uengCKoOsidtTkoek3MXGw+eS/JVB0qNCGHaNHqj3bTRD8yk7Klv 9 | qq5hC4F84jAPPO8QHago9n4UcoYSkQ== 10 | =M0Ty 11 | -----END PGP SIGNATURE----- 12 | -------------------------------------------------------------------------------- /cmake/CUDA-Version.cmake: -------------------------------------------------------------------------------- 1 | set(DEVICE_COMPILER "nvcc") 2 | set(CUDA_COMPILER "${DEVICE_COMPILER}" CACHE STRING "Select the device compiler") 3 | 4 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") 5 | list(APPEND DEVICE_COMPILER "clang") 6 | endif() 7 | 8 | set_property(CACHE CUDA_COMPILER PROPERTY STRINGS "${DEVICE_COMPILER}") 9 | 10 | list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}") 11 | list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") 12 | 13 | set(CUDA_STATIC ON) 14 | find_package(CUDA 8.0 REQUIRED) 15 | 16 | set(LIBS ${LIBS} ${CUDA_LIBRARIES}) 17 | -------------------------------------------------------------------------------- /src/cuda_fast_div_heavy.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | __device__ __forceinline__ uint64_t fast_div_heavy(int64_t _a, int32_t _b) 6 | { 7 | int64_t a = abs(_a); 8 | int32_t b = abs(_b); 9 | 10 | float rcp; 11 | asm("rcp.approx.f32 %0, %1;" : "=f"(rcp) : "f"(__int2float_rn(b))); 12 | float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23)); 13 | 14 | uint64_t q1 = __float2ull_rd(__int2float_rn(((int32_t*)&a)[1]) * rcp2); 15 | a -= q1 * (uint32_t)(b); 16 | 17 | 
rcp2 = __uint_as_float(__float_as_uint(rcp) + (12U << 23)); 18 | int64_t q2 = __float2ll_rn(__int2float_rn(a >> 12) * rcp2); 19 | int32_t a2 = ((int32_t*)&a)[0] - ((int32_t*)&q2)[0] * b; 20 | 21 | int32_t q3 = __float2int_rn(__int2float_rn(a2) * rcp); 22 | q3 += (a2 - q3 * b) >> 31; 23 | 24 | const int64_t q = q1 + q2 + q3; 25 | return ((((int32_t*)&_a)[1] ^ _b) < 0) ? -q : q; 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: push 2 | 3 | name: Test builds 4 | 5 | jobs: 6 | build_win_cuda11_4: 7 | name: Windows CUDA 11.4 8 | runs-on: windows-2019 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@master 12 | - name: Install CUDA 13 | run: | 14 | powershell -Command "Invoke-WebRequest https://developer.download.nvidia.com/compute/cuda/11.4.0/network_installers/cuda_11.4.0_win10_network.exe -OutFile .\cuda_setup.exe" 15 | start /wait .\cuda_setup.exe -s 16 | shell: cmd 17 | - name: Build project on Windows 18 | run: | 19 | cmake . 
-G "Visual Studio 16 2019" -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4" 20 | cd "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin" 21 | .\MSBuild.exe /p:Configuration=Release $Env:GITHUB_WORKSPACE\xmrig-cuda.sln || exit 1 22 | -------------------------------------------------------------------------------- /res/app.rc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../src/version.h" 3 | 4 | 101 ICON "app.ico" 5 | 6 | VS_VERSION_INFO VERSIONINFO 7 | FILEVERSION APP_VER_MAJOR,APP_VER_MINOR,APP_VER_PATCH,0 8 | PRODUCTVERSION APP_VER_MAJOR,APP_VER_MINOR,APP_VER_PATCH,0 9 | FILEFLAGSMASK 0x3fL 10 | #ifdef _DEBUG 11 | FILEFLAGS VS_FF_DEBUG 12 | #else 13 | FILEFLAGS 0x0L 14 | #endif 15 | FILEOS VOS__WINDOWS32 16 | FILETYPE VFT_APP 17 | FILESUBTYPE 0x0L 18 | BEGIN 19 | BLOCK "StringFileInfo" 20 | BEGIN 21 | BLOCK "000004b0" 22 | BEGIN 23 | VALUE "CompanyName", APP_SITE 24 | VALUE "FileDescription", APP_DESC 25 | VALUE "FileVersion", APP_VERSION 26 | VALUE "LegalCopyright", APP_COPYRIGHT 27 | VALUE "OriginalFilename", "xmrig-cuda.dll" 28 | VALUE "ProductName", APP_NAME 29 | VALUE "ProductVersion", APP_VERSION 30 | END 31 | END 32 | BLOCK "VarFileInfo" 33 | BEGIN 34 | VALUE "Translation", 0x0, 1200 35 | END 36 | END 37 | 38 | -------------------------------------------------------------------------------- /src/RandomX/arqma/randomx_arqma.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Arqma { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/defyx/randomx_defyx.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 
18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_DefyX { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/graft/randomx_graft.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Graft { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/keva/randomx_keva.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019-2020 SChernykh 3 | 4 | This file is part of RandomX CUDA. 
5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Keva { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/yada/randomx_yada.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019-2020 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 
18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Yada { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/monero/randomx_monero.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Monero { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/wownero/randomx_wownero.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 
5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Wownero { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v103 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /cmake/os.cmake: -------------------------------------------------------------------------------- 1 | if (WIN32) 2 | set(XMRIG_OS_WIN ON) 3 | elseif (APPLE) 4 | set(XMRIG_OS_APPLE ON) 5 | 6 | if (IOS OR CMAKE_SYSTEM_NAME STREQUAL iOS) 7 | set(XMRIG_OS_IOS ON) 8 | else() 9 | set(XMRIG_OS_MACOS ON) 10 | endif() 11 | else() 12 | set(XMRIG_OS_UNIX ON) 13 | 14 | if (ANDROID OR CMAKE_SYSTEM_NAME MATCHES "Android") 15 | set(XMRIG_OS_ANDROID ON) 16 | elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") 17 | set(XMRIG_OS_LINUX ON) 18 | elseif(CMAKE_SYSTEM_NAME STREQUAL FreeBSD) 19 | set(XMRIG_OS_FREEBSD ON) 20 | endif() 21 | endif() 22 | 23 | 24 | if (XMRIG_OS_WIN) 25 | add_definitions(/DWIN32) 26 | add_definitions(/DXMRIG_OS_WIN) 27 | elseif(XMRIG_OS_APPLE) 28 | add_definitions(/DXMRIG_OS_APPLE) 29 | 30 | if (XMRIG_OS_IOS) 31 | add_definitions(/DXMRIG_OS_IOS) 32 | else() 33 | add_definitions(/DXMRIG_OS_MACOS) 34 | endif() 35 | elseif(XMRIG_OS_UNIX) 36 | add_definitions(/DXMRIG_OS_UNIX) 37 | 38 
| if (XMRIG_OS_ANDROID) 39 | add_definitions(/DXMRIG_OS_ANDROID) 40 | elseif (XMRIG_OS_LINUX) 41 | add_definitions(/DXMRIG_OS_LINUX) 42 | elseif (XMRIG_OS_FREEBSD) 43 | add_definitions(/DXMRIG_OS_FREEBSD) 44 | endif() 45 | endif() 46 | -------------------------------------------------------------------------------- /cmake/cpu.cmake: -------------------------------------------------------------------------------- 1 | if (NOT CMAKE_SYSTEM_PROCESSOR) 2 | message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined") 3 | endif() 4 | 5 | 6 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") 7 | add_definitions(/DRAPIDJSON_SSE2) 8 | endif() 9 | 10 | if (NOT ARM_TARGET) 11 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv8-a)$") 12 | set(ARM_TARGET 8) 13 | elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l)$") 14 | set(ARM_TARGET 7) 15 | endif() 16 | endif() 17 | 18 | if (ARM_TARGET AND ARM_TARGET GREATER 6) 19 | set(XMRIG_ARM ON) 20 | set(WITH_LIBCPUID OFF) 21 | add_definitions(/DXMRIG_ARM) 22 | 23 | message(STATUS "Use ARM_TARGET=${ARM_TARGET} (${CMAKE_SYSTEM_PROCESSOR})") 24 | 25 | include(CheckCXXCompilerFlag) 26 | 27 | if (ARM_TARGET EQUAL 8) 28 | set(XMRIG_ARMv8 ON) 29 | add_definitions(/DXMRIG_ARMv8) 30 | 31 | CHECK_CXX_COMPILER_FLAG(-march=armv8-a+crypto XMRIG_ARM_CRYPTO) 32 | 33 | if (XMRIG_ARM_CRYPTO) 34 | add_definitions(/DXMRIG_ARM_CRYPTO) 35 | set(ARM8_CXX_FLAGS "-march=armv8-a+crypto") 36 | else() 37 | set(ARM8_CXX_FLAGS "-march=armv8-a") 38 | endif() 39 | elseif (ARM_TARGET EQUAL 7) 40 | set(XMRIG_ARMv7 ON) 41 | add_definitions(/DXMRIG_ARMv7) 42 | endif() 43 | endif() 44 | -------------------------------------------------------------------------------- /src/crypto/cn/c_blake256.h: -------------------------------------------------------------------------------- 1 | #ifndef _BLAKE256_H_ 2 | #define _BLAKE256_H_ 3 | 4 | #include 5 | 6 | typedef struct { 7 | uint32_t h[8], s[4], t[2]; 8 | int buflen, nullt; 9 | uint8_t 
buf[64]; 10 | } state; 11 | 12 | typedef struct { 13 | state inner; 14 | state outer; 15 | } hmac_state; 16 | 17 | void blake256_init(state *); 18 | void blake224_init(state *); 19 | 20 | void blake256_update(state *, const uint8_t *, uint64_t); 21 | void blake224_update(state *, const uint8_t *, uint64_t); 22 | 23 | void blake256_final(state *, uint8_t *); 24 | void blake224_final(state *, uint8_t *); 25 | 26 | void blake256_hash(uint8_t *, const uint8_t *, uint64_t); 27 | void blake224_hash(uint8_t *, const uint8_t *, uint64_t); 28 | 29 | /* HMAC functions: */ 30 | 31 | void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t); 32 | void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t); 33 | 34 | void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t); 35 | void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t); 36 | 37 | void hmac_blake256_final(hmac_state *, uint8_t *); 38 | void hmac_blake224_final(hmac_state *, uint8_t *); 39 | 40 | void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); 41 | void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); 42 | 43 | #endif /* _BLAKE256_H_ */ 44 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018-2025 SChernykh 3 | * Copyright (c) 2016-2025 XMRig , 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | 19 | #ifndef XMRIG_VERSION_H 20 | #define XMRIG_VERSION_H 21 | 22 | #define APP_ID "xmrig-cuda" 23 | #define APP_NAME "XMRig" 24 | #define APP_DESC "XMRig CUDA plugin" 25 | #define APP_VERSION "6.22.1-mo1" 26 | #define APP_DOMAIN "xmrig.com" 27 | #define APP_SITE "www.xmrig.com" 28 | #define APP_COPYRIGHT "Copyright (C) 2016-2025 xmrig.com" 29 | 30 | #define APP_VER_MAJOR 6 31 | #define APP_VER_MINOR 22 32 | #define APP_VER_PATCH 1 33 | 34 | #define API_VERSION 4 35 | 36 | #endif /* XMRIG_VERSION_H */ 37 | -------------------------------------------------------------------------------- /doc/gpg_keys/xmrig.asc: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mQENBF3VSRIBCADfFjDUbq0WLGulFeSou0A+jTvweNllPyLNOn3SNCC0XLEYyEcu 4 | JiEBK80DlvR06TVr8Aw1rT5S2iH0i5Tl8DqShH2mmcN1rBp1M0Y95D89KVj3BIhE 5 | nxmgmD4N3Wgm+5FmEH4W/RpG1xdYWJx3eJhtWPdFJqpg083E2D5P30wIQem+EnTR 6 | 5YrtTZPh5cPj2KRY+UmsDE3ahmxCgP7LYgnnpZQlWBBiMV932s7MvYBPJQc1wecS 7 | 0wi1zxyS81xHc3839EkA7wueCeNo+5jha+KH66tMKsfrI2WvfPHTCPjK9v7WJc/O 8 | /eRp9d+wacn09D1L6CoRO0ers5p10GO84VhTABEBAAG0GVhNUmlnIDxzdXBwb3J0 9 | QHhtcmlnLmNvbT6JAU4EEwEIADgWIQSaxM6o5m41pcfN3BtEalNji+lECQUCXdVJ 10 | EgIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRBEalNji+lECbkQB/9nRou0 11 | tOlBwYn8xVgBu7IiDWNVETRWfrjrtdTvSahgbbo6lWgjA/vBLkjN9fISdBQ/n/Mt 12 | hNDJbEtxHHt2baJhvT8du1eWcIHHXCV/rmv+iY/hTXa1gKqHiHDJrtYSVBG3BMme 13 | 1rdsUHTiKf3t5yRHOXAfY2C+XNblKAV7mhlxQBiKxdFDIkFEQKNrHNUvnzkOqoCT 14 | 2kTZZ2tPUMQdOn1eek6zG/+C7SwcBpJnakJ8jce4yA/xZbOVKetNWO3Ufu3TE34k 15 | OdA+H4PU9+fV77XfOY8DtXeS3boUI97ei+4s/mwX/NFC0i8CPXyefxl3WRUBGDOI 16 | w//kPNQVh4HobOCeuQENBF3VSRIBCADl29WorEi+vRA/3kg9VUXtxSU6caibFS3N 17 | VXANiFRjrOmICdfrIgOSGNrYCQFsXu0Xe0udDYVX8yX6WJk+CT02Pdg0gkXiKoze 18 | 
KrnK15mo3xXbb2tr1o9ROPgwY/o2AwQHj0o1JhdS2cybfuRiUQRoGgBX7a9X0cTY 19 | r4ZJvOjzgAajl3ciwB3yWUmDiRlzZpO7YWESXbOhGVzyCnP5MlMEJ/fPRw9h38vK 20 | HNKLhzcRfsLpXk34ghY3SxIv4NWUfuZXFWqpSdC9JgNc5zA72lJEQcF4DHJCKl7B 21 | ddmrfsr9mdiIpo+/ZZFPPngdeZ2kvkJ2YKaZNVu2XooJARPQ8B8tABEBAAGJATYE 22 | GAEIACAWIQSaxM6o5m41pcfN3BtEalNji+lECQUCXdVJEgIbDAAKCRBEalNji+lE 23 | CdPUB/4nH1IdhHGmfko2kxdaHqQgCGLqh3pcrQXD9mBv/LYVnoHZpVRHsIDgg2Z4 24 | lQYrIRRqe69FjVxo7sA2eMIlV0GRDlUrw+HeURFpEhKPEdwFy6i/cti2MY0YxOrB 25 | TvQoRutUoMnyjM4TBJWaaqccbTsavMdLmG3JHdAkiHtUis/fUwVctmEQwN+d/J2b 26 | wJAtliqw3nXchUfdIfwHF/7hg8seUuYUaifzkazBZhVWvRkTVLVanzZ51HRfuzwD 27 | ntaa7kfYGdE+4TKOylAPh+8E6WnR19RRTpsaW0dVBgOiBTE0uc7rUv2HWS/u6RUR 28 | t7ldSBzkuDTlM2V59Iq2hXoSC6dT 29 | =cIG9 30 | -----END PGP PUBLIC KEY BLOCK----- 31 | -------------------------------------------------------------------------------- /src/3rdparty/cub/config.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Static configuration header for the CUB project. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_arch.cuh" 36 | #include "util_compiler.cuh" 37 | #include "util_cpp_dialect.cuh" 38 | #include "util_deprecated.cuh" 39 | #include "util_macro.cuh" 40 | #include "util_namespace.cuh" 41 | -------------------------------------------------------------------------------- /src/common/utils/timestamp.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright 2010 Jeff Garzik 3 | * Copyright 2012-2014 pooler 4 | * Copyright 2014 Lucas Jones 5 | * Copyright 2014-2016 Wolf9466 6 | * Copyright 2016 Jay D Dee 7 | * Copyright 2017-2018 XMR-Stak , 8 | * Copyright 2018-2020 SChernykh 9 | * Copyright 2016-2020 XMRig , 10 | * 11 | * This program is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 
15 | * 16 | * This program is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program. If not, see . 23 | */ 24 | 25 | #ifndef XMRIG_TIMESTAMP_H 26 | #define XMRIG_TIMESTAMP_H 27 | 28 | 29 | #include 30 | 31 | 32 | namespace xmrig_cuda { 33 | 34 | 35 | static inline int64_t steadyTimestamp() 36 | { 37 | using namespace std::chrono; 38 | if (high_resolution_clock::is_steady) { 39 | return time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); 40 | } 41 | else { 42 | return time_point_cast(steady_clock::now()).time_since_epoch().count(); 43 | } 44 | } 45 | 46 | 47 | static inline int64_t currentMSecsSinceEpoch() 48 | { 49 | using namespace std::chrono; 50 | 51 | return time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); 52 | } 53 | 54 | 55 | } /* namespace xmrig_cuda */ 56 | 57 | #endif /* XMRIG_TIMESTAMP_H */ 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xmrig-cuda 2 | This repository contains the CUDA plugin for the XMRig miner, which provides support for NVIDIA GPUs. 3 | 4 | This plugin is a separate project because of the main reasons listed below: 5 | 1. Not all users need CUDA support, and it is an optional feature. 6 | 2. CUDA has strict compiler version requirements that may be difficult to meet, unlike CPU mining code, which is generally very flexible. 7 | 8 | 9 | ## Windows 10 | 11 | * To [download](https://github.com/MoneroOcean/xmrig-cuda/releases) the plugin, you must choose the appropriate CUDA version. 
Generally, the latest version (12.4) is all you need, unless you have very old GPUs. Windows builds are available for every major CUDA release. Alternatively, you can [build](https://xmrig.com/docs/miner/build/windows) the plugin from the source. 12 | * Place **`xmrig-cuda.dll`** and other dll files near to **`xmrig.exe`**. 13 | * Edit **`config.json`** to enable the plugin. 14 | ``` 15 | { 16 | ... 17 | "cuda": { 18 | "enabled": true, 19 | ... 20 | } 21 | ... 22 | } 23 | ``` 24 | ### Advanced 25 | You can specify the path to the plugin using the `loader` option. 26 | ``` 27 | { 28 | ... 29 | "cuda": { 30 | "enabled": true, 31 | "loader": "c:/some/path/xmrig-cuda.dll", 32 | ... 33 | } 34 | ... 35 | } 36 | ``` 37 | Due to JSON format restrictions, the directory separator must be written in Linux style `/` or escaped `\\`. 38 | 39 | ## Linux 40 | Linux usage is almost the same as Windows except we don't provide binaries and you must build the plugin from the source and the name of the plugin is different **`libxmrig-cuda.so`**. 41 | 42 | ## macOS 43 | CUDA no longer supports macOS, which means that the plugin also does not support it. 44 | -------------------------------------------------------------------------------- /src/RandomX/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Copyright (c) 2019 SChernykh 5 | 6 | This file is part of RandomX CUDA. 
7 | 8 | RandomX CUDA is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | RandomX CUDA is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with RandomX CUDA. If not, see. 20 | */ 21 | 22 | 23 | #include 24 | 25 | 26 | #define RANDOMX_DATASET_ITEM_SIZE 64 27 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 28 | #define RANDOMX_JUMP_BITS 8 29 | #define RANDOMX_JUMP_OFFSET 8 30 | 31 | 32 | namespace randomx { 33 | constexpr int mantissaSize = 52; 34 | constexpr int exponentSize = 11; 35 | constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; 36 | constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; 37 | constexpr int exponentBias = 1023; 38 | constexpr int dynamicExponentBits = 4; 39 | constexpr int staticExponentBits = 4; 40 | constexpr uint64_t constExponentBits = 0x300; 41 | constexpr uint64_t dynamicMantissaMask = (1ULL << (mantissaSize + dynamicExponentBits)) - 1; 42 | 43 | constexpr int RegistersCount = 8; 44 | constexpr int RegisterCountFlt = RegistersCount / 2; 45 | constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register 46 | 47 | constexpr int CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; 48 | constexpr uint32_t DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE; 49 | 50 | constexpr uint32_t ConditionMask = ((1 << RANDOMX_JUMP_BITS) - 1); 51 | constexpr int ConditionOffset = RANDOMX_JUMP_OFFSET; 52 | constexpr int StoreL3Condition = 14; 53 | } 54 | 55 | #include "blake2b_cuda.hpp" 56 | #include "aes_cuda.hpp" 57 | #include "randomx_cuda.hpp" 58 | #include 
"hash.hpp" 59 | -------------------------------------------------------------------------------- /src/cuda_fast_int_math_v2.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | // get_reciprocal: approximate fixed-point reciprocal of a 32-bit divisor, built from rcp.approx.f32 on the top 24 bits plus one FMA-based correction for the low 8 bits. Feeds fast_div_v2 below. 5 | __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a) 6 | { 7 | const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23)); // top 24 bits of 'a' packed into a float via a biased exponent 8 | const float a_lo = __uint2float_rn(a & 0xFF); // low 8 bits corrected separately 9 | 10 | float r; 11 | asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi)); 12 | const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23)); // r * 2^64 done by bumping the exponent field 13 | 14 | const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f)); // h = a*r - 1: residual error of the approximate reciprocal 15 | return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled); // coarse reciprocal minus the scaled residual 16 | } 17 | // fast_div_v2: 64-by-32-bit division; packs the quotient into the low 32 bits and the remainder into the high 32 bits of the return value (q[0] = quotient, q[1] = remainder). 18 | __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) 19 | { 20 | const uint32_t r = get_reciprocal(b); 21 | const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; // quotient estimate accumulates in the high word of k 22 | 23 | uint32_t q[2]; 24 | q[0] = ((uint32_t*)&k)[1]; // candidate quotient 25 | 26 | int64_t tmp = a - (uint64_t)(q[0]) * b; // candidate remainder 27 | ((int32_t*)(&tmp))[1] -= (k < a) ? b : 0; // (k < a) detects 64-bit wrap of k; compensate the remainder's high word. NOTE(review): relies on two's-complement word aliasing 28 | 29 | const bool overshoot = ((int32_t*)(&tmp))[1] < 0; // remainder went negative -> quotient one too high 30 | const bool undershoot = tmp >= b; // remainder still >= b -> quotient one too low 31 | 32 | q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); // +/-1 quotient correction 33 | q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U); // matching remainder correction 34 | 35 | return *((uint64_t*)(q)); 36 | } 37 | // fast_sqrt_v2: integer square root helper; seeds with sqrt.approx.f32/rsqrt.approx.f32 on the high word of n1, applies one linear refinement, then fixes the result by +/-1 using exact 64-bit integer checks. 38 | __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) 39 | { 40 | float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); // high word of n1 as a float with a rebased exponent 41 | float x1; 42 | asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); 43 | asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); 44 | 45 | // The following line does x1 *= 4294967296.0f; 46 | x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); 47 | 48 | const uint32_t x0 = __float_as_uint(x) - (158U << 23); // coarse sqrt estimate (exponent rebased to integer range) 49 | const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); // exact scaled residual n1 - x0^2 50 | const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; // linear refinement term: residual * 1/(2*sqrt) scale 51 | 52 | uint32_t result = (x0 << 10) + __float2int_rn(delta); 53 | const uint32_t s = result >> 1; 54 | const uint32_t b = result & 1; 55 | 56 | const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; // exact check of the candidate 57 | if ((int64_t)(x2 + b) > 0) --result; // candidate one too high 58 | if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result; // candidate one too low 59 | 60 | return result; 61 | } 62 | -------------------------------------------------------------------------------- /src/crypto/common/Algorithm.cpp: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018 Lee Clagett 3 | * Copyright (c) 2018-2021 SChernykh 4 | * Copyright (c) 2016-2021 XMRig , 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details.
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "crypto/common/Algorithm.h" 21 | 22 | 23 | #include 24 | 25 | 26 | xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id) 27 | { 28 | static const std::set ids = { 29 | CN_0, CN_1, CN_2, CN_FAST, CN_HALF, CN_XAO, CN_RTO, CN_RWZ, CN_ZLS, CN_DOUBLE, CN_CCX, 30 | # ifdef XMRIG_ALGO_CN_R 31 | CN_R, 32 | # endif 33 | # ifdef XMRIG_ALGO_CN_LITE 34 | CN_LITE_0, CN_LITE_1, 35 | # endif 36 | # ifdef XMRIG_ALGO_CN_HEAVY 37 | CN_HEAVY_0, CN_HEAVY_TUBE, CN_HEAVY_XHV, 38 | # endif 39 | # ifdef XMRIG_ALGO_CN_PICO 40 | CN_PICO_0, CN_PICO_TLO, 41 | # endif 42 | # ifdef XMRIG_ALGO_CN_FEMTO 43 | CN_UPX2, 44 | # endif 45 | # ifdef XMRIG_ALGO_CN_GPU 46 | CN_GPU, 47 | # endif 48 | # ifdef XMRIG_ALGO_RANDOMX 49 | RX_XLA, 50 | RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA, RX_YADA, 51 | # endif 52 | # ifdef XMRIG_ALGO_ARGON2 53 | AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ, 54 | # endif 55 | # ifdef XMRIG_ALGO_KAWPOW 56 | KAWPOW_RVN, 57 | # endif 58 | }; 59 | 60 | return ids.count(id) ? static_cast(id) : INVALID; 61 | } 62 | -------------------------------------------------------------------------------- /src/RandomX/randomx.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 
15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | 21 | #include "cryptonight.h" 22 | #include "cuda_device.hpp" 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | 30 | void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size) 31 | { 32 | ctx->rx_batch_size = batch_size; 33 | ctx->d_scratchpads_size = batch_size * (ctx->algorithm.l3() + 64); 34 | 35 | if (ctx->rx_dataset_host > 0) { 36 | CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->d_rx_dataset, const_cast(dataset), 0)); 37 | } 38 | else { 39 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_dataset, dataset_size)); 40 | CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, dataset, dataset_size, cudaMemcpyHostToDevice)); 41 | } 42 | 43 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, ctx->d_scratchpads_size)); 44 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_hashes, batch_size * 64)); 45 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_entropy, batch_size * (128 + 2560))); 46 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_vm_states, batch_size * 2560)); 47 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_rounding, batch_size * sizeof(uint32_t))); 48 | } 49 | 50 | 51 | void randomx_update_dataset(nvid_ctx* ctx, const void* dataset, size_t dataset_size) 52 | { 53 | if (ctx->rx_dataset_host > 0) { 54 | return; 55 | } 56 | 57 | CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, dataset, dataset_size, cudaMemcpyHostToDevice)); 58 | } 59 | -------------------------------------------------------------------------------- /src/cuda_device.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #define CUDA_THROW(error) throw std::runtime_error(std::string("<") + __FUNCTION__ + ">:" + std::to_string(__LINE__) + " \"" + (error) + 
"\"") 10 | 11 | 12 | /** execute and check a CUDA api command 13 | * 14 | * @param id gpu id (thread id) 15 | * @param ... CUDA api command 16 | */ 17 | #define CUDA_CHECK(id, ...) { \ 18 | cudaError_t error = __VA_ARGS__; \ 19 | if (error != cudaSuccess){ \ 20 | CUDA_THROW(cudaGetErrorString(error)); \ 21 | } \ 22 | } \ 23 | ( (void) 0 ) 24 | 25 | /** execute and check a CUDA kernel 26 | * 27 | * @param id gpu id (thread id) 28 | * @param ... CUDA kernel call 29 | */ 30 | #define CUDA_CHECK_KERNEL(id, ...) \ 31 | __VA_ARGS__; \ 32 | CUDA_CHECK(id, cudaGetLastError()) 33 | 34 | #if defined(XMRIG_ALGO_KAWPOW) || defined(XMRIG_ALGO_CN_R) 35 | #define CU_CHECK(id, ...) { \ 36 | CUresult result = __VA_ARGS__; \ 37 | if(result != CUDA_SUCCESS){ \ 38 | const char* s; \ 39 | cuGetErrorString(result, &s); \ 40 | CUDA_THROW(s ? s : "unknown error"); \ 41 | } \ 42 | } \ 43 | ( (void) 0 ) 44 | #endif 45 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_deprecated.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define CUB_DEPRECATED macro. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_compiler.cuh" 36 | 37 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 38 | # define CUB_DEPRECATED __declspec(deprecated) 39 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 40 | # define CUB_DEPRECATED __attribute__((deprecated)) 41 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 42 | # define CUB_DEPRECATED __attribute__((deprecated)) 43 | #else 44 | # define CUB_DEPRECATED 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "version.cuh" 37 | 38 | // For example: 39 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 40 | //#define CUB_NS_POSTFIX } } 41 | 42 | #ifndef CUB_NS_PREFIX 43 | #define CUB_NS_PREFIX 44 | #endif 45 | 46 | #ifndef CUB_NS_POSTFIX 47 | #define CUB_NS_POSTFIX 48 | #endif 49 | 50 | // Declare these namespaces here for the purpose of Doxygenating them 51 | 52 | /*! \namespace cub 53 | * \brief \p cub is the top-level namespace which contains all CUB 54 | * functions and types. 55 | */ 56 | namespace cub 57 | { 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/3rdparty/cub/version.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file version.h 29 | * \brief Compile-time macros encoding CUB release version 30 | * 31 | * is the only CUB header that is guaranteed to 32 | * change with every CUB release. 33 | * 34 | */ 35 | 36 | #pragma once 37 | 38 | /*! \def CUB_VERSION 39 | * \brief The preprocessor macro \p CUB_VERSION encodes the version 40 | * number of the CUB library. 41 | * 42 | * CUB_VERSION % 100 is the sub-minor version. 43 | * CUB_VERSION / 100 % 1000 is the minor version. 44 | * CUB_VERSION / 100000 is the major version. 45 | */ 46 | #define CUB_VERSION 101000 47 | 48 | /*! \def CUB_MAJOR_VERSION 49 | * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the 50 | * major version number of the CUB library. 51 | */ 52 | #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) 53 | 54 | /*! \def CUB_MINOR_VERSION 55 | * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the 56 | * minor version number of the CUB library. 57 | */ 58 | #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) 59 | 60 | /*! 
\def CUB_SUBMINOR_VERSION 61 | * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the 62 | * sub-minor version number of the CUB library. 63 | */ 64 | #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) 65 | 66 | /*! \def CUB_PATCH_NUMBER 67 | * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the 68 | * patch number of the CUB library. 69 | */ 70 | #define CUB_PATCH_NUMBER 0 71 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_compiler.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Detect compiler information. 31 | */ 32 | 33 | #pragma once 34 | 35 | // enumerate host compilers we know about 36 | #define CUB_HOST_COMPILER_UNKNOWN 0 37 | #define CUB_HOST_COMPILER_MSVC 1 38 | #define CUB_HOST_COMPILER_GCC 2 39 | #define CUB_HOST_COMPILER_CLANG 3 40 | 41 | // enumerate device compilers we know about 42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0 43 | #define CUB_DEVICE_COMPILER_MSVC 1 44 | #define CUB_DEVICE_COMPILER_GCC 2 45 | #define CUB_DEVICE_COMPILER_NVCC 3 46 | #define CUB_DEVICE_COMPILER_CLANG 4 47 | 48 | // figure out which host compiler we're using 49 | #if defined(_MSC_VER) 50 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC 51 | # define CUB_MSVC_VERSION _MSC_VER 52 | # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER 53 | #elif defined(__clang__) 54 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG 55 | # define CUB_CLANG_VERSION \ 56 | (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) 57 | #elif defined(__GNUC__) 58 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC 59 | # define CUB_GCC_VERSION \ 60 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 61 | #else 62 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN 63 | #endif // CUB_HOST_COMPILER 64 | 65 | // figure out which device compiler we're using 66 | #if defined(__CUDACC__) 67 | # 
define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 69 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC 70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 71 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC 72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 73 | // CUDA-capable clang should behave similar to NVCC. 74 | # if defined(__CUDA__) 75 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 76 | # else 77 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG 78 | # endif 79 | #else 80 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN 81 | #endif 82 | -------------------------------------------------------------------------------- /src/cuda_extra.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __INTELLISENSE__ 4 | #define __CUDA_ARCH__ 520 5 | /* avoid red underlining */ 6 | 7 | struct uint3 8 | { 9 | unsigned int x, y, z; 10 | }; 11 | 12 | struct uint3 threadIdx; 13 | struct uint3 blockIdx; 14 | struct uint3 blockDim; 15 | #define __funnelshift_r(a,b,c) 1 16 | #define __syncthreads() 17 | #define asm(x) 18 | #define __shfl(a,b,c) 1 19 | #endif 20 | 21 | #define AES_BLOCK_SIZE 16 22 | #define AES_KEY_SIZE 32 23 | #define INIT_SIZE_BLK 8 24 | #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B 25 | 26 | #define C32(x) ((uint32_t)(x ## U)) 27 | #define T32(x) ((x) & C32(0xFFFFFFFF)) 28 | 29 | #if __CUDA_ARCH__ >= 350 30 | __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) 31 | { 32 | uint2 result; 33 | if(offset >= 32) 34 | { 35 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 36 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), 
"r"(offset)); 37 | } 38 | else 39 | { 40 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 41 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 42 | } 43 | return __double_as_longlong(__hiloint2double(result.y, result.x)); 44 | } 45 | #define ROTL64(x, n) (cuda_ROTL64(x, n)) 46 | #else 47 | #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) 48 | #endif 49 | 50 | #if __CUDA_ARCH__ < 350 51 | #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) 52 | #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 53 | #else 54 | #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) 55 | #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) 56 | #endif 57 | 58 | #define MEMSET8(dst,what,cnt) { \ 59 | int i_memset8; \ 60 | uint64_t *out_memset8 = (uint64_t *)(dst); \ 61 | for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ 62 | out_memset8[i_memset8] = (what); } 63 | 64 | #define MEMSET4(dst,what,cnt) { \ 65 | int i_memset4; \ 66 | uint32_t *out_memset4 = (uint32_t *)(dst); \ 67 | for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ 68 | out_memset4[i_memset4] = (what); } 69 | 70 | #define MEMCPY8(dst,src,cnt) { \ 71 | int i_memcpy8; \ 72 | uint64_t *in_memcpy8 = (uint64_t *)(src); \ 73 | uint64_t *out_memcpy8 = (uint64_t *)(dst); \ 74 | for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ 75 | out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } 76 | 77 | #define MEMCPY4(dst,src,cnt) { \ 78 | int i_memcpy4; \ 79 | uint32_t *in_memcpy4 = (uint32_t *)(src); \ 80 | uint32_t *out_memcpy4 = (uint32_t *)(dst); \ 81 | for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ 82 | out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } 83 | 84 | #define XOR_BLOCKS(a,b) { \ 85 | ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \ 86 | ((uint64_t 
*)a)[1] ^= ((uint64_t *)b)[1]; } 87 | 88 | #define XOR_BLOCKS_DST(x,y,z) { \ 89 | ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ 90 | ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } 91 | 92 | #define MUL_SUM_XOR_DST(a,c,dst) { \ 93 | const uint64_t dst0 = ((uint64_t *)dst)[0]; \ 94 | uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \ 95 | hi += ((uint64_t *)c)[0]; \ 96 | ((uint64_t *)c)[0] = dst0 ^ hi; \ 97 | ((uint64_t *)dst)[0] = hi; \ 98 | ((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \ 99 | } 100 | 101 | #define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) 102 | 103 | -------------------------------------------------------------------------------- /cmake/flags.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 2 | set(CMAKE_CXX_EXTENSIONS OFF) 3 | set(CMAKE_CXX_STANDARD 11) 4 | 5 | set(CMAKE_C_STANDARD 99) 6 | set(CMAKE_C_STANDARD_REQUIRED ON) 7 | 8 | if ("${CMAKE_BUILD_TYPE}" STREQUAL "") 9 | set(CMAKE_BUILD_TYPE Release) 10 | endif() 11 | 12 | if (CMAKE_BUILD_TYPE STREQUAL "Release") 13 | add_definitions(/DNDEBUG) 14 | endif() 15 | 16 | include(CheckSymbolExists) 17 | 18 | if (CMAKE_CXX_COMPILER_ID MATCHES GNU) 19 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-strict-aliasing -fPIC") 20 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fexceptions -fno-rtti -Wno-strict-aliasing -Wno-class-memaccess -fPIC") 23 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -s") 24 | 25 | if (XMRIG_ARMv8) 26 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS} -flax-vector-conversions") 28 | elseif (XMRIG_ARMv7) 29 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
-mfpu=neon -flax-vector-conversions") 31 | else() 32 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") 33 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") 34 | 35 | add_definitions(/DHAVE_ROTR) 36 | endif() 37 | 38 | if (WIN32) 39 | if (CMAKE_SIZEOF_VOID_P EQUAL 8) 40 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static") 41 | else() 42 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static -Wl,--large-address-aware") 43 | endif() 44 | else() 45 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") 46 | endif() 47 | 48 | add_definitions(/D_GNU_SOURCE) 49 | 50 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 52 | 53 | add_definitions(/DHAVE_BUILTIN_CLEAR_CACHE) 54 | 55 | elseif (CMAKE_CXX_COMPILER_ID MATCHES MSVC) 56 | 57 | set(CMAKE_C_FLAGS_RELEASE "/MP /MT /O2 /Ob2 /DNDEBUG") 58 | set(CMAKE_CXX_FLAGS_RELEASE "/MP /MT /O2 /Ob2 /DNDEBUG") 59 | add_definitions(/D_CRT_SECURE_NO_WARNINGS) 60 | add_definitions(/D_CRT_NONSTDC_NO_WARNINGS) 61 | add_definitions(/DNOMINMAX) 62 | add_definitions(/DHAVE_ROTR) 63 | 64 | elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) 65 | 66 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") 67 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2 -funroll-loops -fmerge-all-constants") 68 | 69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fexceptions -fno-rtti -Wno-missing-braces") 70 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -funroll-loops -fmerge-all-constants") 71 | 72 | if (XMRIG_ARMv8) 73 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") 74 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS}") 75 | elseif (XMRIG_ARMv7) 76 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") 77 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") 78 | else() 79 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") 80 | set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -maes") 81 | 82 | check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR) 83 | if (HAVE_ROTR) 84 | add_definitions(/DHAVE_ROTR) 85 | endif() 86 | endif() 87 | 88 | if (XMRIG_OS_APPLE) 89 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 90 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 91 | endif() 92 | 93 | endif() 94 | 95 | if (NOT WIN32) 96 | check_symbol_exists("__builtin___clear_cache" "stdlib.h" HAVE_BUILTIN_CLEAR_CACHE) 97 | if (HAVE_BUILTIN_CLEAR_CACHE) 98 | add_definitions(/DHAVE_BUILTIN_CLEAR_CACHE) 99 | endif() 100 | endif() 101 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(xmrig-cuda) 3 | include(cmake/CUDA-Version.cmake) 4 | 5 | 6 | option(WITH_DRIVER_API "Enable CUDA Driver API and NVRTC, required for cn/r and kawpow algorithms" ON) 7 | 8 | # Algorithm selection 9 | option(WITH_CN_R "Enable CryptoNight-R algorithm" ON) 10 | option(WITH_CN_LITE "Enable CryptoNight-Lite algorithms family" ON) 11 | option(WITH_CN_HEAVY "Enable CryptoNight-Heavy algorithms family" ON) 12 | option(WITH_CN_PICO "Enable CryptoNight-Pico algorithm" ON) 13 | option(WITH_CN_FEMTO "Enable CryptoNight-UPX2 algorithm" ON) 14 | option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON) 15 | option(WITH_ARGON2 "Enable Argon2 algorithms family" OFF) #unsupported 16 | 17 | if (CUDA_VERSION VERSION_LESS 9.0) 18 | message(STATUS "CUDA ${CUDA_VERSION}: RandomX and KawPow disabled, they do not work with old CUDA") 19 | option(WITH_RANDOMX "Enable RandomX algorithms family" OFF) 20 | option(WITH_KAWPOW "Enable KawPow algorithms family" OFF) 21 | else() 22 | option(WITH_RANDOMX "Enable RandomX algorithms family" ON) 23 | option(WITH_KAWPOW "Enable KawPow algorithms family" ON) 24 | endif() 25 | 26 | if (WITH_CN_LITE) 27 | add_definitions(/DXMRIG_ALGO_CN_LITE) 28 | endif() 29 | 30 | if 
(WITH_CN_HEAVY) 31 | add_definitions(/DXMRIG_ALGO_CN_HEAVY) 32 | endif() 33 | 34 | if (WITH_CN_PICO) 35 | add_definitions(/DXMRIG_ALGO_CN_PICO) 36 | endif() 37 | 38 | if (WITH_CN_FEMTO) 39 | add_definitions(/DXMRIG_ALGO_CN_FEMTO) 40 | endif() 41 | 42 | if (WITH_CN_GPU) 43 | add_definitions(/DXMRIG_ALGO_CN_GPU) 44 | endif() 45 | 46 | if (WITH_RANDOMX) 47 | add_definitions(/DXMRIG_ALGO_RANDOMX) 48 | endif() 49 | 50 | if (WITH_ARGON2) 51 | add_definitions(/DXMRIG_ALGO_ARGON2) 52 | endif() 53 | 54 | if (WITH_KAWPOW) 55 | if (WITH_DRIVER_API) 56 | add_definitions(/DXMRIG_ALGO_KAWPOW) 57 | else() 58 | set(WITH_KAWPOW OFF) 59 | message(STATUS "CUDA ${CUDA_VERSION}: KawPow disabled, requires WITH_DRIVER_API=ON for CUDA Driver API and NVRTC") 60 | endif() 61 | endif() 62 | 63 | if (WITH_CN_R) 64 | if (WITH_DRIVER_API) 65 | add_definitions(/DXMRIG_ALGO_CN_R) 66 | else() 67 | set(WITH_CN_R OFF) 68 | message(STATUS "CUDA ${CUDA_VERSION}: CryptoNight-R disabled, requires WITH_DRIVER_API=ON for CUDA Driver API and NVRTC") 69 | endif() 70 | endif() 71 | 72 | 73 | include_directories(src) 74 | add_definitions(/DCUB_IGNORE_DEPRECATED_CPP_DIALECT) 75 | 76 | 77 | include(cmake/cpu.cmake) 78 | include(cmake/os.cmake) 79 | include(cmake/flags.cmake) 80 | include(cmake/CUDA.cmake) 81 | 82 | 83 | set(SOURCES 84 | src/crypto/cn/c_blake256.c 85 | src/crypto/common/Algorithm.cpp 86 | src/crypto/common/Algorithm.h 87 | src/version.h 88 | src/xmrig-cuda.cpp 89 | src/xmrig-cuda.h 90 | ) 91 | 92 | 93 | if (WITH_DRIVER_API AND WITH_CN_R) 94 | list(APPEND SOURCES src/CudaCryptonightR_gen.cpp) 95 | endif() 96 | 97 | if (XMRIG_OS_WIN) 98 | list(APPEND SOURCES res/app.rc) 99 | endif() 100 | 101 | if (XMRIG_OS_APPLE) 102 | cmake_policy(SET CMP0042 NEW) 103 | endif() 104 | 105 | add_library(${CMAKE_PROJECT_NAME} SHARED ${SOURCES}) 106 | target_link_libraries(${CMAKE_PROJECT_NAME} xmrig-cu ${LIBS}) 107 | 108 | if (WITH_DRIVER_API AND WIN32) 109 | if (CUDA_VERSION VERSION_LESS 10.0) 110 | file(GLOB 
NVRTCDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc64*.dll") 111 | else() 112 | file(GLOB NVRTCDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc64*_0.dll") 113 | endif() 114 | 115 | add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD 116 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${NVRTCDLL}" $) 117 | 118 | file(GLOB NVRTCBUILTINDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc-builtins64*.dll") 119 | add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD 120 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${NVRTCBUILTINDLL}" $) 121 | endif() 122 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /src/xmrig-cuda.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright 2010 Jeff Garzik 3 | * Copyright 2012-2014 pooler 4 | * Copyright 2014 Lucas Jones 5 | * Copyright 2014-2016 Wolf9466 6 | * Copyright 2016 Jay D Dee 7 | * Copyright 2017-2018 XMR-Stak , 8 | * Copyright 2018-2020 SChernykh 9 | * Copyright 2016-2020 XMRig , 10 | * 11 | * This program is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 
15 | * 16 | * This program is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 | */ 24 | 25 | #ifndef XMRIG_CUDA_H 26 | #define XMRIG_CUDA_H 27 | 28 | 29 | #include 30 | #include 31 | 32 | 33 | #if defined _WIN32 || defined __CYGWIN__ 34 | # define XMRIG_EXPORT __declspec(dllexport) 35 | # define XMRIG_HIDDEN 36 | #else 37 | # define XMRIG_EXPORT __attribute__ ((visibility ("default"))) 38 | # define XMRIG_HIDDEN __attribute__ ((visibility ("hidden"))) 39 | #endif 40 | 41 | 42 | using nvid_ctx = struct nvid_ctx; 43 | 44 | 45 | enum Version : uint32_t 46 | { 47 | ApiVersion, 48 | DriverVersion, 49 | RuntimeVersion 50 | }; 51 | 52 | 53 | enum DeviceProperty : uint32_t 54 | { 55 | DeviceId, 56 | DeviceAlgorithm, 57 | DeviceArchMajor, 58 | DeviceArchMinor, 59 | DeviceSmx, 60 | DeviceBlocks, 61 | DeviceThreads, 62 | DeviceBFactor, 63 | DeviceBSleep, 64 | DeviceClockRate, 65 | DeviceMemoryClockRate, 66 | DeviceMemoryTotal, 67 | DeviceMemoryFree, 68 | DevicePciBusID, 69 | DevicePciDeviceID, 70 | DevicePciDomainID, 71 | DeviceDatasetHost, 72 | }; 73 | 74 | 75 | #if defined(__cplusplus) 76 | extern "C" { 77 | #endif 78 | 79 | 80 | XMRIG_EXPORT bool cnHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t height, uint64_t target, uint32_t *rescount, uint32_t *resnonce); 81 | XMRIG_EXPORT bool deviceInfo(nvid_ctx *ctx, int32_t blocks, int32_t threads, uint32_t algo, int32_t dataset_host); 82 | XMRIG_EXPORT bool deviceInit(nvid_ctx *ctx); 83 | XMRIG_EXPORT bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce); 84 | XMRIG_EXPORT bool rxPrepare(nvid_ctx *ctx, const void *dataset, size_t datasetSize, bool dataset_host, 
uint32_t batchSize); 85 | XMRIG_EXPORT bool rxUpdateDataset(nvid_ctx* ctx, const void* dataset, size_t datasetSize); 86 | XMRIG_EXPORT bool kawPowHash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes); 87 | XMRIG_EXPORT bool kawPowPrepare_v2(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes); 88 | XMRIG_EXPORT bool kawPowStopHash(nvid_ctx *ctx); 89 | XMRIG_EXPORT bool setJob(nvid_ctx *ctx, const void *data, size_t size, uint32_t algo); 90 | XMRIG_EXPORT const char *deviceName(nvid_ctx *ctx); 91 | XMRIG_EXPORT const char *lastError(nvid_ctx *ctx); 92 | XMRIG_EXPORT const char *pluginVersion(); 93 | XMRIG_EXPORT int32_t deviceInt(nvid_ctx *ctx, DeviceProperty property); 94 | XMRIG_EXPORT nvid_ctx *alloc(uint32_t id, int32_t bfactor, int32_t bsleep); 95 | XMRIG_EXPORT uint32_t deviceCount(); 96 | XMRIG_EXPORT uint32_t deviceUint(nvid_ctx *ctx, DeviceProperty property); 97 | XMRIG_EXPORT uint32_t version(Version version); 98 | XMRIG_EXPORT uint64_t deviceUlong(nvid_ctx *ctx, DeviceProperty property); 99 | XMRIG_EXPORT void init(); 100 | XMRIG_EXPORT void release(nvid_ctx *ctx); 101 | 102 | 103 | #if defined(__cplusplus) 104 | } 105 | #endif 106 | 107 | 108 | #endif /* XMRIG_CUDA_H */ 109 | -------------------------------------------------------------------------------- /src/RandomX/hash.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Copyright (c) 2019 SChernykh 5 | 6 | This file is part of RandomX CUDA. 7 | 8 | RandomX CUDA is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 
12 | 13 | RandomX CUDA is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with RandomX CUDA. If not, see <http://www.gnu.org/licenses/>. 20 | */ 21 | 22 | __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* shares) 23 | { 24 | const uint32_t global_index = blockIdx.x * blockDim.x + threadIdx.x; 25 | const uint64_t* p = (const uint64_t*)hashes; 26 | 27 | if (p[global_index * 4 + 3] < target) { 28 | const uint32_t idx = atomicInc(shares, 0xFFFFFFFF) + 1; 29 | if (idx < 10) { 30 | shares[idx] = global_index; 31 | } 32 | } 33 | } 34 | 35 | void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size) 36 | { 37 | if (ctx->algorithm.id() == xmrig_cuda::Algorithm::RX_XLA) { 38 | // sipesh(tempHash, sizeof(tempHash), input, inputSize, input, inputSize, 0, 0); 39 | // CUDA_CHECK_KERNEL(ctx->device_id, sipesh<<>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 40 | // k12(input, inputSize, tempHash); 41 | // CUDA_CHECK_KERNEL(ctx->device_id, k12<<>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 42 | } else if (ctx->inputlen <= 128) { 43 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 44 | } 45 | else if (ctx->inputlen <= 256) { 46 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 47 | } 48 | else { 49 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset)); 50 | } 51 | 52 | CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<<>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size)); 53 | 
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_rx_rounding, 0, batch_size * sizeof(uint32_t))); 54 | 55 | for (size_t i = 0; i < RANDOMX_PROGRAM_COUNT; ++i) { 56 | CUDA_CHECK_KERNEL(ctx->device_id, fillAes4Rx4<<>>(ctx->d_rx_hashes, ctx->d_rx_entropy, batch_size)); 57 | 58 | CUDA_CHECK_KERNEL(ctx->device_id, init_vm<8><<>>(ctx->d_rx_entropy, ctx->d_rx_vm_states)); 59 | for (int j = 0, n = 1 << ctx->device_bfactor; j < n; ++j) { 60 | CUDA_CHECK_KERNEL(ctx->device_id, execute_vm<8, false><<>>(ctx->d_rx_vm_states, ctx->d_rx_rounding, ctx->d_long_state, ctx->d_rx_dataset, batch_size, RANDOMX_PROGRAM_ITERATIONS >> ctx->device_bfactor, j == 0, j == n - 1)); 61 | } 62 | 63 | if (i == RANDOMX_PROGRAM_COUNT - 1) { 64 | CUDA_CHECK_KERNEL(ctx->device_id, hashAes1Rx4<<>>(ctx->d_long_state, ctx->d_rx_vm_states, batch_size)); 65 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_hash_registers<<>>(ctx->d_rx_hashes, ctx->d_rx_vm_states)); 66 | } else { 67 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_hash_registers<<>>(ctx->d_rx_hashes, ctx->d_rx_vm_states)); 68 | } 69 | } 70 | 71 | CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, 10 * sizeof(uint32_t))); 72 | CUDA_CHECK_KERNEL(ctx->device_id, find_shares<<>>(ctx->d_rx_hashes, target, ctx->d_result_nonce)); 73 | CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize()); 74 | 75 | CUDA_CHECK(ctx->device_id, cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 76 | 77 | *rescount = resnonce[0]; 78 | if (*rescount > 9) { 79 | *rescount = 9; 80 | } 81 | 82 | for (uint32_t i = 0; i < *rescount; i++) { 83 | resnonce[i] = resnonce[i + 1] + nonce; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v6.22.1 2 | - [#205](https://github.com/xmrig/xmrig-cuda/pull/205) Fixed RandomX dataset update. Fix works together with the updated XMRig. 
3 | 4 | # v6.22.0 5 | - [#201](https://github.com/xmrig/xmrig-cuda/pull/201) Added support for [Yada](https://yadacoin.io/) (`rx/yada` algorithm). 6 | 7 | # v6.21.1 8 | - The binary downloads now only support the latest version of each major CUDA release. 9 | - Improved build speed with CUDA 11.3 or higher. 10 | 11 | # v6.21.0 12 | - [#167](https://github.com/xmrig/xmrig-cuda/pull/167) Removed deprecated AstroBWTv1 and v2. 13 | - [#176](https://github.com/xmrig/xmrig-cuda/pull/176) Added CUDA 12 support. 14 | - [#191](https://github.com/xmrig/xmrig-cuda/pull/191) Fixed Zephyr mining. 15 | 16 | # v6.17.0 17 | - [#157](https://github.com/xmrig/xmrig-cuda/pull/157) Added Dero HE (`astrobwt/v2`) support. 18 | 19 | # v6.15.1 20 | - [#119](https://github.com/xmrig/xmrig-cuda/issues/119) Fixed compile error on Linux. 21 | - [#124](https://github.com/xmrig/xmrig-cuda/pull/124) Fixed `"out of memory"` error on non-CryptoNight algorithms. 22 | - [#125](https://github.com/xmrig/xmrig-cuda/pull/125) Fixed `"invalid argument"` error. 23 | 24 | # v6.15.0 25 | - **ABI changed, minimum supported XMRig version now is 6.15.0.** 26 | - [#2563](https://github.com/xmrig/xmrig/pull/2563) Added new algorithm RandomX Graft (`rx/graft`). 27 | - [#104](https://github.com/xmrig/xmrig-cuda/pull/104) Fixed build on macOS 10.13 (last supported for CUDA). 28 | 29 | # v6.12.0 30 | - [#95](https://github.com/xmrig/xmrig-cuda/pull/95) Added support for Uplexa (`cn/upx2` algorithm). 31 | 32 | # v6.5.0 33 | - [#74](https://github.com/xmrig/xmrig-cuda/pull/74) Fixed CUDA 8.0 support, RandomX, AstroBWT, and KawPow disabled for this CUDA version. 34 | - [#76](https://github.com/xmrig/xmrig-cuda/pull/76) Fixed high CPU usage on Cryptonight and AstroBWT. 35 | - Removed legacy API and added version information on Windows. 36 | 37 | # v6.4.1 38 | - [#72](https://github.com/xmrig/xmrig-cuda/issues/72) Fixed broken KawPow on Linux. 
39 | 40 | # v6.4.0 41 | - [#70](https://github.com/xmrig/xmrig-cuda/pull/70) RandomX: removed `rx/loki` algorithm. 42 | - Added CMake option `-DWITH_DRIVER_API=OFF` to disable CUDA Driver API and NVRTC, required for `cn/r` and `kawpow` algorithms. 43 | 44 | # v6.3.2 45 | - [#65](https://github.com/xmrig/xmrig-cuda/pull/65) Fixed broken AstroBWT. 46 | 47 | # v6.3.1 48 | - [#62](https://github.com/xmrig/xmrig-cuda/pull/62) Fixed broken RandomX (regression since v6.2.1). 49 | 50 | # v6.3.0 51 | - [#59](https://github.com/xmrig/xmrig-cuda/pull/59) Added support for upcoming Haven offshore fork. 52 | - Fixed build with recent CUDA 11. 53 | 54 | # v6.2.1 55 | - [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization. 56 | - [#55](https://github.com/xmrig/xmrig-cuda/pull/55) Added fast job switching for KawPow, almost zero stale shares. 57 | 58 | # v6.2.0 59 | - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal. 60 | - [#53](https://github.com/xmrig/xmrig-cuda/pull/53) Fixed build with CUDA 11. 61 | 62 | # v6.1.0 63 | - [#48](https://github.com/xmrig/xmrig-cuda/pull/48) Optimized AstroBWT, approximately 3 times faster. 64 | - [#51](https://github.com/xmrig/xmrig-cuda/pull/51) Reduced memory usage for KawPow. 65 | 66 | # v6.0.0 67 | - [#1694](https://github.com/xmrig/xmrig/pull/1694) Added support for KawPow algorithm (Ravencoin) on AMD/NVIDIA. 68 | 69 | # v3.0.0 70 | - **ABI changed, minimum supported XMRig version now is 5.11.0.** 71 | - [#41](https://github.com/xmrig/xmrig-cuda/pull/41) Added AstroBWT algorithm support. 72 | 73 | # v2.2.0 74 | - [#1578](https://github.com/xmrig/xmrig/pull/1578) Added new `rx/keva` algorithm for upcoming Kevacoin fork. 75 | 76 | # v2.1.0 77 | - [#1466](https://github.com/xmrig/xmrig/pull/1466) Added `cn-pico/tlo` algorithm. 78 | - Added alternative relaxed API (algorithm passed as string). 
79 | 80 | # v2.0.2 81 | - [#27](https://github.com/xmrig/xmrig-cuda/pull/27) Added RandomSFX (`rx/sfx`) algorithm for Safex Cash. 82 | - [#28](https://github.com/xmrig/xmrig-cuda/pull/28) Added RandomV (`rx/v`) algorithm for *new* MoneroV. 83 | 84 | # v2.0.1-beta 85 | - [#10](https://github.com/xmrig/xmrig-cuda/pull/10) Fixed compatibility with CUDA 8, RandomX support not tested and potentially broken with this CUDA version. 86 | - [#1276](https://github.com/xmrig/xmrig/issues/1276) Fixed maximum threads count. 87 | 88 | # v2.0.0-beta 89 | - **ABI changed, minimum supported XMRig version now is 4.6.0.** 90 | - [#5](https://github.com/xmrig/xmrig-cuda/pull/5) Optimized RandomX. 91 | - [#6](https://github.com/xmrig/xmrig-cuda/issues/6) Fixed compatibility with some old systems. 92 | - [#7](https://github.com/xmrig/xmrig-cuda/pull/7) Added support for option `dataset_host` for 2 GB GPUs. 93 | - [#8](https://github.com/xmrig/xmrig-cuda/pull/8) RandomX: fixed random kernel launch errors with some configurations. 94 | 95 | # v1.0.0-beta 96 | - Initial version. 97 | -------------------------------------------------------------------------------- /src/RandomX/arqma/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 
13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 1 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomARQ\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 
56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 1024 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 4 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 262144 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define 
RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/defyx/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 131072 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 2 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "DefyXScala\x13" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 2 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 33554432 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 64 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 1024 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 4 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 262144 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 
68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 65536 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 25 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 16 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/keva/configuration.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 
38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomKV\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 1048576 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/monero/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomX\x03" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/graft/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 2 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomX-Graft\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 280 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 7 100 | #define RANDOMX_FREQ_IROL_R 3 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/yada/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 4 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomXYadaCoin\x03" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 150 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/wownero/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | Copyright (c) 2019, Wownero Inc., a Monero Enterprise Alliance partner company 4 | 5 | All rights reserved. 
6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | * Neither the name of the copyright holder nor the 15 | names of its contributors may be used to endorse or promote products 16 | derived from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #pragma once 31 | 32 | //Cache size in KiB. Must be a power of 2. 33 | #define RANDOMX_ARGON_MEMORY 262144 34 | 35 | //Number of Argon2d iterations for Cache initialization. 36 | #define RANDOMX_ARGON_ITERATIONS 3 37 | 38 | //Number of parallel lanes for Cache initialization. 39 | #define RANDOMX_ARGON_LANES 1 40 | 41 | //Argon2d salt 42 | #define RANDOMX_ARGON_SALT "RandomWOW\x01" 43 | 44 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 45 | #define RANDOMX_CACHE_ACCESSES 8 46 | 47 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 48 | #define RANDOMX_SUPERSCALAR_LATENCY 170 49 | 50 | //Dataset base size in bytes. Must be a power of 2. 51 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 52 | 53 | //Dataset extra size. Must be divisible by 64. 54 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 55 | 56 | //Number of instructions in a RandomX program. Must be divisible by 8. 57 | #define RANDOMX_PROGRAM_SIZE 256 58 | 59 | //Number of iterations during VM execution. 60 | #define RANDOMX_PROGRAM_ITERATIONS 1024 61 | 62 | //Number of chained VM executions per hash. 63 | #define RANDOMX_PROGRAM_COUNT 16 64 | 65 | //Scratchpad L3 size in bytes. Must be a power of 2. 66 | #define RANDOMX_SCRATCHPAD_L3 1048576 67 | 68 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 69 | #define RANDOMX_SCRATCHPAD_L2 131072 70 | 71 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 72 | #define RANDOMX_SCRATCHPAD_L1 16384 73 | 74 | //Jump condition mask size in bits. 75 | #define RANDOMX_JUMP_BITS 8 76 | 77 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
78 | #define RANDOMX_JUMP_OFFSET 8 79 | 80 | /* 81 | Instruction frequencies (per 256 opcodes) 82 | Total sum of frequencies must be 256 83 | */ 84 | 85 | //Integer instructions 86 | #define RANDOMX_FREQ_IADD_RS 25 87 | #define RANDOMX_FREQ_IADD_M 7 88 | #define RANDOMX_FREQ_ISUB_R 16 89 | #define RANDOMX_FREQ_ISUB_M 7 90 | #define RANDOMX_FREQ_IMUL_R 16 91 | #define RANDOMX_FREQ_IMUL_M 4 92 | #define RANDOMX_FREQ_IMULH_R 4 93 | #define RANDOMX_FREQ_IMULH_M 1 94 | #define RANDOMX_FREQ_ISMULH_R 4 95 | #define RANDOMX_FREQ_ISMULH_M 1 96 | #define RANDOMX_FREQ_IMUL_RCP 8 97 | #define RANDOMX_FREQ_INEG_R 2 98 | #define RANDOMX_FREQ_IXOR_R 15 99 | #define RANDOMX_FREQ_IXOR_M 5 100 | #define RANDOMX_FREQ_IROR_R 10 101 | #define RANDOMX_FREQ_IROL_R 0 102 | #define RANDOMX_FREQ_ISWAP_R 4 103 | 104 | //Floating point instructions 105 | #define RANDOMX_FREQ_FSWAP_R 8 106 | #define RANDOMX_FREQ_FADD_R 20 107 | #define RANDOMX_FREQ_FADD_M 5 108 | #define RANDOMX_FREQ_FSUB_R 20 109 | #define RANDOMX_FREQ_FSUB_M 5 110 | #define RANDOMX_FREQ_FSCAL_R 6 111 | #define RANDOMX_FREQ_FMUL_R 20 112 | #define RANDOMX_FREQ_FDIV_M 4 113 | #define RANDOMX_FREQ_FSQRT_R 6 114 | 115 | //Control instructions 116 | #define RANDOMX_FREQ_CBRANCH 16 117 | #define RANDOMX_FREQ_CFROUND 1 118 | 119 | //Store instruction 120 | #define RANDOMX_FREQ_ISTORE 16 121 | 122 | //No-op instruction 123 | #define RANDOMX_FREQ_NOP 0 124 | /* ------ 125 | 256 126 | */ 127 | -------------------------------------------------------------------------------- /src/3rdparty/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../config.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief A "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief A "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. 
Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /src/KawPow/raven/KawPow_dag.h: -------------------------------------------------------------------------------- 1 | #define ETHASH_HASH_BYTES 64 2 | #define ETHASH_DATASET_PARENTS 512 3 | 4 | #if (__CUDACC_VER_MAJOR__ > 8) 5 | #define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z)) 6 | #else 7 | #define SHFL(x, y, z) __shfl((x), (y), (z)) 8 | #endif 9 | 10 | typedef union { 11 | uint32_t words[64 / sizeof(uint32_t)]; 12 | uint2 uint2s[64 / sizeof(uint2)]; 13 | uint4 uint4s[64 / sizeof(uint4)]; 14 | } hash64_t; 15 | 16 | typedef union { 17 | uint32_t words[200 / sizeof(uint32_t)]; 18 | uint64_t uint64s[200 / sizeof(uint64_t)]; 19 | uint2 uint2s[200 / sizeof(uint2)]; 20 | uint4 uint4s[200 / sizeof(uint4)]; 21 | } hash200_t; 22 | 23 | // Implementation 
based on: 24 | // https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c 25 | // converted from 64->32 bit words 26 | 27 | __device__ __constant__ const uint64_t keccakf_rndc[24] = { 28 | 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 29 | 0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, 30 | 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, 31 | 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, 32 | 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 33 | 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, 34 | 0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, 35 | 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL 36 | }; 37 | 38 | __device__ __forceinline__ uint64_t ROTL64(const uint64_t x, const int offset) 39 | { 40 | uint64_t result; 41 | asm("{\n\t" 42 | ".reg .b64 lhs;\n\t" 43 | ".reg .u32 roff;\n\t" 44 | "shl.b64 lhs, %1, %2;\n\t" 45 | "sub.u32 roff, 64, %2;\n\t" 46 | "shr.b64 %0, %1, roff;\n\t" 47 | "add.u64 %0, lhs, %0;\n\t" 48 | "}\n" 49 | : "=l"(result) 50 | : "l"(x), "r"(offset)); 51 | return result; 52 | } 53 | 54 | __device__ __forceinline__ void keccak_f1600_round(uint64_t st[25], const int r) 55 | { 56 | const uint32_t keccakf_rotc[24] = { 57 | 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 58 | 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 59 | }; 60 | const uint32_t keccakf_piln[24] = { 61 | 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 62 | 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 63 | }; 64 | 65 | uint64_t t, bc[5]; 66 | // Theta 67 | for (int i = 0; i < 5; i++) 68 | bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; 69 | 70 | for (int i = 0; i < 5; i++) { 71 | t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); 72 | for (uint32_t j = 0; j < 25; j += 5) 73 | st[j + i] ^= t; 74 | } 75 | 76 | // Rho Pi 77 | t = st[1]; 78 | for (int i = 0; i < 24; i++) { 79 | uint32_t j = 
keccakf_piln[i]; 80 | bc[0] = st[j]; 81 | st[j] = ROTL64(t, keccakf_rotc[i]); 82 | t = bc[0]; 83 | } 84 | 85 | // Chi 86 | for (uint32_t j = 0; j < 25; j += 5) { 87 | for (int i = 0; i < 5; i++) 88 | bc[i] = st[j + i]; 89 | for (int i = 0; i < 5; i++) 90 | st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; 91 | } 92 | 93 | // Iota 94 | st[0] ^= keccakf_rndc[r]; 95 | } 96 | 97 | __device__ __forceinline__ void keccak_f1600(uint64_t st[25]) 98 | { 99 | for (int i = 8; i < 25; i++) { 100 | st[i] = 0; 101 | } 102 | st[8] = 0x8000000000000001; 103 | 104 | for (int r = 0; r < 24; r++) { 105 | keccak_f1600_round(st, r); 106 | } 107 | } 108 | 109 | #define FNV_PRIME 0x01000193U 110 | #define fnv(x,y) ((uint32_t(x) * (FNV_PRIME)) ^uint32_t(y)) 111 | __device__ uint4 fnv4(uint4 a, uint4 b) 112 | { 113 | uint4 c; 114 | c.x = a.x * FNV_PRIME ^ b.x; 115 | c.y = a.y * FNV_PRIME ^ b.y; 116 | c.z = a.z * FNV_PRIME ^ b.z; 117 | c.w = a.w * FNV_PRIME ^ b.w; 118 | return c; 119 | } 120 | 121 | #define NODE_WORDS (ETHASH_HASH_BYTES/sizeof(uint32_t)) 122 | 123 | __device__ __forceinline__ uint32_t fast_mod(uint32_t a, uint4 d) 124 | { 125 | const uint64_t t = a; 126 | const uint32_t q = ((t + d.y) * d.x) >> d.z; 127 | return a - q * d.w; 128 | } 129 | 130 | __global__ void ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint64_t dag_bytes, hash64_t* g_light, uint4 light_words) 131 | { 132 | uint64_t const node_index = start + uint64_t(blockIdx.x) * blockDim.x + threadIdx.x; 133 | 134 | uint64_t num_nodes = dag_bytes / sizeof(hash64_t); 135 | uint64_t num_nodes_rounded = ((num_nodes + 3) / 4) * 4; 136 | 137 | if (node_index >= num_nodes_rounded) return; // None of the threads from this quad have valid node_index 138 | 139 | hash200_t dag_node; 140 | for(int i = 0; i < 4; ++i) { 141 | dag_node.uint4s[i] = g_light[fast_mod(node_index, light_words)].uint4s[i]; 142 | } 143 | 144 | dag_node.words[0] ^= node_index; 145 | keccak_f1600(dag_node.uint64s); 146 | 147 | const int 
thread_id = threadIdx.x & 3; 148 | 149 | #pragma unroll(4) 150 | for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) { 151 | uint32_t parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words); 152 | 153 | #pragma unroll 154 | for (uint32_t t = 0; t < 4; ++t) { 155 | 156 | const uint32_t shuffle_index = SHFL(parent_index, t, 4); 157 | 158 | const uint4 p4 = g_light[shuffle_index].uint4s[thread_id]; 159 | 160 | #pragma unroll 161 | for (int w = 0; w < 4; ++w) { 162 | 163 | const uint4 s4 = make_uint4(SHFL(p4.x, w, 4), SHFL(p4.y, w, 4), SHFL(p4.z, w, 4), SHFL(p4.w, w, 4)); 164 | if (t == thread_id) { 165 | dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4); 166 | } 167 | } 168 | } 169 | } 170 | 171 | keccak_f1600(dag_node.uint64s); 172 | 173 | for (uint32_t t = 0; t < 4; ++t) { 174 | uint32_t shuffle_index = SHFL(node_index, t, 4); 175 | uint4 s[4]; 176 | for (uint32_t w = 0; w < 4; w++) { 177 | s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4), 178 | SHFL(dag_node.uint4s[w].y, t, 4), 179 | SHFL(dag_node.uint4s[w].z, t, 4), 180 | SHFL(dag_node.uint4s[w].w, t, 4)); 181 | } 182 | if (shuffle_index * sizeof(hash64_t) < dag_bytes) { 183 | g_dag[shuffle_index].uint4s[thread_id] = s[thread_id]; 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/cuda_blake.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | uint32_t h[8], s[4], t[2]; 5 | uint32_t buflen; 6 | int nullt; 7 | uint8_t buf[64]; 8 | } blake_state; 9 | 10 | #define U8TO32(p) \ 11 | (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ 12 | ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) 13 | 14 | #define U32TO8(p, v) \ 15 | (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ 16 | (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); 17 | 18 | #define BLAKE_ROT(x,n) ROTR32(x, n) 19 | #define BLAKE_G(a,b,c,d,e) \ 20 | 
v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ 21 | v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ 22 | v[c] += v[d]; \ 23 | v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ 24 | v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ 25 | v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ 26 | v[c] += v[d]; \ 27 | v[b] = BLAKE_ROT(v[b] ^ v[c], 7); 28 | 29 | __constant__ uint8_t d_blake_sigma[14][16] = 30 | { 31 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 32 | {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, 33 | {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, 34 | {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, 35 | {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, 36 | {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, 37 | {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, 38 | {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, 39 | {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, 40 | {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, 41 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 42 | {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, 43 | {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, 44 | {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} 45 | }; 46 | __constant__ uint32_t d_blake_cst[16] 47 | = { 48 | 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 49 | 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 50 | 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, 51 | 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 52 | }; 53 | 54 | __device__ void cn_blake_compress(blake_state *S, const uint8_t *block) 55 | { 56 | uint32_t v[16], m[16], i; 57 | 58 | for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); 59 | for (i = 0; i < 8; ++i) v[i] = S->h[i]; 60 | v[ 8] = S->s[0] ^ 0x243F6A88; 61 | v[ 9] = S->s[1] ^ 0x85A308D3; 62 | v[10] = S->s[2] ^ 0x13198A2E; 63 | v[11] = S->s[3] ^ 0x03707344; 64 | v[12] = 0xA4093822; 65 | v[13] = 0x299F31D0; 66 | v[14] = 
0x082EFA98; 67 | v[15] = 0xEC4E6C89; 68 | 69 | if (S->nullt == 0) 70 | { 71 | v[12] ^= S->t[0]; 72 | v[13] ^= S->t[0]; 73 | v[14] ^= S->t[1]; 74 | v[15] ^= S->t[1]; 75 | } 76 | 77 | for (i = 0; i < 14; ++i) 78 | { 79 | BLAKE_G(0, 4, 8, 12, 0); 80 | BLAKE_G(1, 5, 9, 13, 2); 81 | BLAKE_G(2, 6, 10, 14, 4); 82 | BLAKE_G(3, 7, 11, 15, 6); 83 | BLAKE_G(3, 4, 9, 14, 14); 84 | BLAKE_G(2, 7, 8, 13, 12); 85 | BLAKE_G(0, 5, 10, 15, 8); 86 | BLAKE_G(1, 6, 11, 12, 10); 87 | } 88 | 89 | for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; 90 | for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; 91 | } 92 | 93 | __device__ void cn_blake_update(blake_state *S, const uint8_t *data, uint64_t datalen) 94 | { 95 | uint32_t left = S->buflen >> 3; 96 | uint32_t fill = 64 - left; 97 | 98 | if (left && (((datalen >> 3) & 0x3F) >= fill)) 99 | { 100 | memcpy((void *) (S->buf + left), (void *) data, fill); 101 | S->t[0] += 512; 102 | if (S->t[0] == 0) S->t[1]++; 103 | cn_blake_compress(S, S->buf); 104 | data += fill; 105 | datalen -= (fill << 3); 106 | left = 0; 107 | } 108 | 109 | while (datalen >= 512) 110 | { 111 | S->t[0] += 512; 112 | if (S->t[0] == 0) S->t[1]++; 113 | cn_blake_compress(S, data); 114 | data += 64; 115 | datalen -= 512; 116 | } 117 | 118 | if (datalen > 0) 119 | { 120 | memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); 121 | S->buflen = (left << 3) + datalen; 122 | } 123 | else 124 | { 125 | S->buflen = 0; 126 | } 127 | } 128 | 129 | __device__ void cn_blake_final(blake_state *S, uint8_t *digest) 130 | { 131 | const uint8_t padding[] = 132 | { 133 | 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 134 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | }; 136 | 137 | uint8_t pa = 0x81, pb = 0x01; 138 | uint8_t msglen[8]; 139 | uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; 140 | if (lo < (unsigned) S->buflen) hi++; 141 | U32TO8(msglen + 0, hi); 142 | U32TO8(msglen + 4, lo); 143 | 144 | if (S->buflen == 440) 145 | { 146 | S->t[0] 
-= 8; 147 | cn_blake_update(S, &pa, 8); 148 | } 149 | else 150 | { 151 | if (S->buflen < 440) 152 | { 153 | if (S->buflen == 0) S->nullt = 1; 154 | S->t[0] -= 440 - S->buflen; 155 | cn_blake_update(S, padding, 440 - S->buflen); 156 | } 157 | else 158 | { 159 | S->t[0] -= 512 - S->buflen; 160 | cn_blake_update(S, padding, 512 - S->buflen); 161 | S->t[0] -= 440; 162 | cn_blake_update(S, padding + 1, 440); 163 | S->nullt = 1; 164 | } 165 | cn_blake_update(S, &pb, 8); 166 | S->t[0] -= 8; 167 | } 168 | S->t[0] -= 64; 169 | cn_blake_update(S, msglen, 64); 170 | 171 | U32TO8(digest + 0, S->h[0]); 172 | U32TO8(digest + 4, S->h[1]); 173 | U32TO8(digest + 8, S->h[2]); 174 | U32TO8(digest + 12, S->h[3]); 175 | U32TO8(digest + 16, S->h[4]); 176 | U32TO8(digest + 20, S->h[5]); 177 | U32TO8(digest + 24, S->h[6]); 178 | U32TO8(digest + 28, S->h[7]); 179 | } 180 | 181 | __device__ void cn_blake(const uint8_t *in, uint64_t inlen, uint8_t *out) 182 | { 183 | blake_state bs; 184 | blake_state *S = (blake_state *)&bs; 185 | 186 | S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; 187 | S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; 188 | S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; 189 | S->t[0] = S->t[1] = S->buflen = S->nullt = 0; 190 | S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; 191 | 192 | cn_blake_update(S, in, inlen * 8); 193 | cn_blake_final(S, out); 194 | } 195 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_cpp_dialect.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file 29 | * \brief Detect the version of the C++ standard used by the compiler. 30 | */ 31 | 32 | #pragma once 33 | 34 | #include "util_compiler.cuh" 35 | 36 | // Deprecation warnings may be silenced by defining the following macros. These 37 | // may be combined. 
38 | // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: 39 | // Ignore all deprecated C++ dialects and outdated compilers. 40 | // - CUB_IGNORE_DEPRECATED_CPP_11: 41 | // Ignore deprecation warnings when compiling with C++11. C++03 and outdated 42 | // compilers will still issue warnings. 43 | // - CUB_IGNORE_DEPRECATED_COMPILER 44 | // Ignore deprecation warnings when using deprecated compilers. Compiling 45 | // with C++03 and C++11 will still issue warnings. 46 | 47 | // Check for the thrust opt-outs as well: 48 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && \ 49 | defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) 50 | # define CUB_IGNORE_DEPRECATED_CPP_DIALECT 51 | #endif 52 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && \ 53 | defined(THRUST_IGNORE_DEPRECATED_CPP_11) 54 | # define CUB_IGNORE_DEPRECATED_CPP_11 55 | #endif 56 | #if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && \ 57 | defined(THRUST_IGNORE_DEPRECATED_COMPILER) 58 | # define CUB_IGNORE_DEPRECATED_COMPILER 59 | #endif 60 | 61 | #ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT 62 | # define CUB_IGNORE_DEPRECATED_CPP_11 63 | # define CUB_IGNORE_DEPRECATED_COMPILER 64 | #endif 65 | 66 | // Define this to override the built-in detection. 67 | #ifndef CUB_CPP_DIALECT 68 | 69 | // MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead. 70 | // This macro is only defined in MSVC 2015U3+. 71 | # ifdef _MSVC_LANG // Do not replace with CUB_HOST_COMPILER test (see above) 72 | // MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11. 73 | # if CUB_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */ 74 | # define CUB_CPLUSPLUS 201103L /* Fix to 2011 */ 75 | # else 76 | # define CUB_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. 
*/ 77 | # endif // MSVC 2015 C++14 fix 78 | # else 79 | # define CUB_CPLUSPLUS __cplusplus 80 | # endif 81 | 82 | // Detect current dialect: 83 | # if CUB_CPLUSPLUS < 201103L 84 | # define CUB_CPP_DIALECT 2003 85 | # elif CUB_CPLUSPLUS < 201402L 86 | # define CUB_CPP_DIALECT 2011 87 | # elif CUB_CPLUSPLUS < 201703L 88 | # define CUB_CPP_DIALECT 2014 89 | # elif CUB_CPLUSPLUS == 201703L 90 | # define CUB_CPP_DIALECT 2017 91 | # elif CUB_CPLUSPLUS > 201703L // unknown, but is higher than 2017. 92 | # define CUB_CPP_DIALECT 2020 93 | # endif 94 | 95 | # undef CUB_CPLUSPLUS // cleanup 96 | 97 | #endif // !CUB_CPP_DIALECT 98 | 99 | // Define CUB_COMPILER_DEPRECATION macro: 100 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 101 | # define CUB_COMP_DEPR_IMPL(msg) \ 102 | __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg)) 103 | # define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x) 104 | # define CUB_COMP_DEPR_IMPL1(x) #x 105 | #else // clang / gcc: 106 | # define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg) 107 | # define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr) 108 | # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ 109 | #endif 110 | 111 | #define CUB_COMPILER_DEPRECATION(REQ, FIX) \ 112 | CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
113 | 114 | // Minimum required compiler checks: 115 | #ifndef CUB_IGNORE_DEPRECATED_COMPILER 116 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000 117 | CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler); 118 | # endif 119 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000 120 | CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler); 121 | # endif 122 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 123 | CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler); 124 | # endif 125 | #endif 126 | 127 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && CUB_CPP_DIALECT < 2014 && \ 128 | (CUB_CPP_DIALECT != 2011 || !defined(CUB_IGNORE_DEPRECATED_CPP_11)) 129 | CUB_COMPILER_DEPRECATION(C++14, pass -std=c++14 to your compiler); 130 | #endif 131 | 132 | #undef CUB_COMPILER_DEPRECATION 133 | #undef CUB_COMP_DEPR_IMPL 134 | #undef CUB_COMP_DEPR_IMPL0 135 | #undef CUB_COMP_DEPR_IMPL1 136 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | 76 | #ifdef CUB_RUNTIME_ENABLED 77 | // Clear the global CUDA error state which may have been set by the last 78 | // call. Otherwise, errors may "leak" to unrelated kernel launches. 
79 | cudaGetLastError(); 80 | #endif 81 | 82 | #ifdef CUB_STDERR 83 | if (error) 84 | { 85 | if (CUB_IS_HOST_CODE) { 86 | #if CUB_INCLUDE_HOST_CODE 87 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 88 | fflush(stderr); 89 | #endif 90 | } else { 91 | #if CUB_INCLUDE_DEVICE_CODE 92 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 93 | #endif 94 | } 95 | } 96 | #endif 97 | return error; 98 | } 99 | 100 | 101 | /** 102 | * \brief Debug macro 103 | */ 104 | #ifndef CubDebug 105 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 106 | #endif 107 | 108 | 109 | /** 110 | * \brief Debug macro with exit 111 | */ 112 | #ifndef CubDebugExit 113 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 114 | #endif 115 | 116 | 117 | /** 118 | * \brief Log macro for printf statements. 119 | */ 120 | #if !defined(_CubLog) 121 | #if defined(__NVCOMPILER_CUDA__) 122 | #define _CubLog(format, ...) (__builtin_is_device_code() \ 123 | ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ 124 | blockIdx.z, blockIdx.y, blockIdx.x, \ 125 | threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ 126 | : printf(format, __VA_ARGS__)); 127 | #elif !(defined(__clang__) && defined(__CUDA__)) 128 | #if (CUB_PTX_ARCH == 0) 129 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 130 | #elif (CUB_PTX_ARCH >= 200) 131 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 132 | #endif 133 | #else 134 | // XXX shameless hack for clang around variadic printf... 
135 | // Compiles w/o supplying -std=c++11 but shows warning, 136 | // so we silence them :) 137 | #pragma clang diagnostic ignored "-Wc++11-extensions" 138 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 139 | template 140 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 141 | { 142 | #ifdef __CUDA_ARCH__ 143 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 144 | #else 145 | printf(format, args...); 146 | #endif 147 | } 148 | #ifndef __CUDA_ARCH__ 149 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 150 | #else 151 | #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 152 | #endif 153 | #endif 154 | #endif 155 | 156 | 157 | 158 | 159 | /** @} */ // end group UtilMgmt 160 | 161 | } // CUB namespace 162 | CUB_NS_POSTFIX // Optional outer namespace(s) 163 | -------------------------------------------------------------------------------- /src/crypto/cn/CnAlgo.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018-2021 SChernykh 3 | * Copyright (c) 2016-2021 XMRig , 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see .
17 | */ 18 | 19 | #ifndef XMRIG_CN_ALGO_H 20 | #define XMRIG_CN_ALGO_H 21 | 22 | 23 | #include 24 | #include 25 | 26 | 27 | #include "crypto/common/Algorithm.h" 28 | 29 | 30 | namespace xmrig_cuda 31 | { 32 | 33 | 34 | template 35 | class CnAlgo 36 | { 37 | public: 38 | constexpr CnAlgo() {}; 39 | 40 | constexpr inline Algorithm::Id base() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return Algorithm::base(ALGO); } 41 | constexpr inline bool isHeavy() const { return Algorithm::family(ALGO) == Algorithm::CN_HEAVY; } 42 | constexpr inline bool isR() const { return ALGO == Algorithm::CN_R; } 43 | constexpr inline size_t memory() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return Algorithm::l3(ALGO); } 44 | constexpr inline uint32_t iterations() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return CN_ITER; } 45 | constexpr inline uint32_t mask() const { return static_cast(((memory() - 1) / 16) * 16); } 46 | 47 | inline static uint32_t iterations(Algorithm::Id algo) 48 | { 49 | switch (algo) { 50 | case Algorithm::CN_0: 51 | case Algorithm::CN_1: 52 | case Algorithm::CN_2: 53 | case Algorithm::CN_R: 54 | case Algorithm::CN_RTO: 55 | return CN_ITER; 56 | 57 | case Algorithm::CN_FAST: 58 | case Algorithm::CN_HALF: 59 | # ifdef XMRIG_ALGO_CN_LITE 60 | case Algorithm::CN_LITE_0: 61 | case Algorithm::CN_LITE_1: 62 | # endif 63 | # ifdef XMRIG_ALGO_CN_HEAVY 64 | case Algorithm::CN_HEAVY_0: 65 | case Algorithm::CN_HEAVY_TUBE: 66 | case Algorithm::CN_HEAVY_XHV: 67 | # endif 68 | case Algorithm::CN_CCX: 69 | return CN_ITER / 2; 70 | 71 | case Algorithm::CN_RWZ: 72 | case Algorithm::CN_ZLS: 73 | return 0x60000; 74 | 75 | case Algorithm::CN_XAO: 76 | case Algorithm::CN_DOUBLE: 77 | return CN_ITER * 2; 78 | 79 | # ifdef XMRIG_ALGO_CN_PICO 80 | case Algorithm::CN_PICO_0: 81 | case Algorithm::CN_PICO_TLO: 82 | return CN_ITER / 8; 83 | # endif 84 | 85 | # ifdef XMRIG_ALGO_CN_FEMTO 86 
| case Algorithm::CN_UPX2: 87 | return CN_ITER / 32; 88 | # endif 89 | 90 | # ifdef XMRIG_ALGO_CN_GPU 91 | case Algorithm::CN_GPU: 92 | return 0xC000; 93 | # endif 94 | 95 | default: 96 | break; 97 | } 98 | 99 | return 0; 100 | } 101 | 102 | inline static uint32_t mask(Algorithm::Id algo) 103 | { 104 | # ifdef XMRIG_ALGO_CN_PICO 105 | if (algo == Algorithm::CN_PICO_0) { 106 | return 0x1FFF0; 107 | } 108 | # endif 109 | 110 | # ifdef XMRIG_ALGO_CN_FEMTO 111 | if (algo == Algorithm::CN_UPX2) { 112 | return 0x1FFF0; 113 | } 114 | # endif 115 | 116 | # ifdef XMRIG_ALGO_CN_GPU 117 | if (algo == Algorithm::CN_GPU) { 118 | return 0x1FFFC0; 119 | } 120 | # endif 121 | 122 | return ((Algorithm::l3(algo) - 1) / 16) * 16; 123 | } 124 | 125 | private: 126 | constexpr const static uint32_t CN_ITER = 0x80000; 127 | }; 128 | 129 | 130 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 131 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 132 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 133 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 134 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 135 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 136 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 137 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER * 2; } 138 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER * 2; } 139 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; } 140 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; } 141 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; } 142 | template<> constexpr inline uint32_t 
CnAlgo::iterations() const { return CN_ITER / 8; } 143 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 144 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 32; } 145 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; } 146 | 147 | 148 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; } 149 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; } 150 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; } 151 | 152 | 153 | } /* namespace xmrig_cuda */ 154 | 155 | 156 | #endif /* XMRIG_CN_ALGO_H */ 157 | -------------------------------------------------------------------------------- /src/3rdparty/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../config.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a 
sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 
116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /src/3rdparty/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
*/


#pragma once

#include "../config.cuh"
#include "../util_type.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png)
 * \ingroup BlockModule
 *
 * \par Overview
 * This type facilitates a shared memory usage pattern where a block of CUDA
 * threads places elements into shared memory and then reduces the active
 * parallelism to one "raking" warp of threads for serially aggregating consecutive
 * sequences of shared items. Padding is inserted to eliminate bank conflicts
 * (for most data types).
 *
 * All members are compile-time constants; the only device code is the two
 * pointer-arithmetic helpers at the bottom.
 *
 * \tparam T The data type to be exchanged.
 * \tparam BLOCK_THREADS The thread block size in threads.
 * \tparam PTX_ARCH [optional] \ptxversion
 */
template <
    typename T,
    int BLOCK_THREADS,
    int PTX_ARCH = CUB_PTX_ARCH>
struct BlockRakingLayout
{
    //---------------------------------------------------------------------
    // Constants and type definitions
    //---------------------------------------------------------------------

    enum
    {
        /// The total number of elements that need to be cooperatively reduced
        SHARED_ELEMENTS = BLOCK_THREADS,

        /// Maximum number of warp-synchronous raking threads
        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),

        /// Number of raking elements per warp-synchronous raking thread (rounded up)
        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,

        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,

        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),

        /// Degree of bank conflicts (e.g., 4-way)
        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
            1,

        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),

        /// Total number of elements in the raking grid
        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),

        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
        /// (note: UNGUARDED is true when raking may proceed WITHOUT bounds checks, i.e., when the division is exact)
        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
    };


    /**
     * \brief Shared memory storage type
     */
    struct __align__(16) _TempStorage
    {
        T buff[BlockRakingLayout::GRID_ELEMENTS];
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    /**
     * \brief Returns the location for the calling thread to place data into the grid
     */
    static __device__ __forceinline__ T* PlacementPtr(
        TempStorage &temp_storage,
        unsigned int linear_tid)
    {
        // Offset for partial
        unsigned int offset = linear_tid;

        // Add in one padding element for every segment
        if (USE_SEGMENT_PADDING > 0)
        {
            offset += offset / SEGMENT_LENGTH;
        }

        // Incorporating a block of padding partials every shared memory segment
        return temp_storage.Alias().buff + offset;
    }


    /**
     * \brief Returns the location for the calling thread to begin sequential raking
     */
    static __device__ __forceinline__ T* RakingPtr(
        TempStorage &temp_storage,
        unsigned int linear_tid)
    {
        // Each raking thread owns one (optionally padded) contiguous segment.
        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
    }
};

} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

--------------------------------------------------------------------------------
/src/KawPow/raven/KawPow.cu:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright 2010      Jeff Garzik
 * Copyright 2012-2014 pooler
 * Copyright 2014      Lucas Jones
 * Copyright 2014-2016 Wolf9466
 * Copyright 2016      Jay D Dee
 * Copyright 2017-2018 XMR-Stak
 * Copyright 2018-2020 SChernykh
 * Copyright 2016-2020 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/


// NOTE(review): extraction dropped the <...> targets of the three system
// includes below (likely <cstdint>, <cstring>, <string> or similar) — restore
// from the upstream file before compiling.
#include 
#include 
#include 

#include "cryptonight.h"
#include "cuda_device.hpp"
#include "KawPow_dag.h"
#include "CudaKawPow_gen.h"


// Prepares a device context for KawPow hashing:
//  1. uploads the light cache (or reuses a host-precalculated partial DAG),
//  2. (re)builds the full DAG on-device via ethash_calculate_dag_item,
//  3. JIT-compiles the per-period kernel via NVRTC-generated PTX and
//     pre-warms the next period's program in the background,
//  4. lazily allocates the pinned host/device "stop" flag pair.
//
// cache / cache_size   : ethash light cache bytes (host memory).
// dag_precalc          : optional host-precalculated first DAG items; when set,
//                        the light cache is staged at the start of DAG memory,
//                        items past cache_items are computed on-device reading
//                        the DAG itself, and the precalc bytes then overwrite
//                        the staged region.
// dag_size / height    : full DAG size in bytes and block height (period = height/3).
// dag_sizes            : per-epoch DAG size table handed to the program generator.
void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes)
{
    // Allocations are rounded up to 1 MB so small epoch growth reuses buffers.
    constexpr size_t MEM_ALIGN = 1024 * 1024;

    if (cache_size != ctx->kawpow_cache_size) {
        ctx->kawpow_cache_size = cache_size;

        if (!dag_precalc) {
            if (cache_size > ctx->kawpow_cache_capacity) {
                CUDA_CHECK(ctx->device_id, cudaFree(ctx->kawpow_cache));

                ctx->kawpow_cache_capacity = ((cache_size + MEM_ALIGN - 1) / MEM_ALIGN) * MEM_ALIGN;
                CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_cache, ctx->kawpow_cache_capacity));
            }

            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_cache), cache, cache_size, cudaMemcpyHostToDevice));
        }
    }

    if (dag_size != ctx->kawpow_dag_size) {
        ctx->kawpow_dag_size = dag_size;

        if (dag_size > ctx->kawpow_dag_capacity) {
            CUDA_CHECK(ctx->device_id, cudaFree(ctx->kawpow_dag));

            ctx->kawpow_dag_capacity = ((dag_size + MEM_ALIGN - 1) / MEM_ALIGN) * MEM_ALIGN;
            CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_dag, ctx->kawpow_dag_capacity));
        }

        if (dag_precalc) {
            // Intentional: stage the light cache at the START of DAG memory so
            // the build loop below can read it in place; it is overwritten by
            // the host-precalculated items after the loop.
            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_dag), cache, cache_size, cudaMemcpyHostToDevice));
        }

        constexpr int blocks = 8192;
        constexpr int threads = 32;

        // cache_items: light-cache size rounded up to 256 bytes, in 64-byte hash units.
        const size_t cache_items = ((cache_size + 255) / 256) * 256 / sizeof(hash64_t);
        const size_t dag_items = dag_size / sizeof(hash64_t);

        // light_words packs the fast-modulus constants: .w = divisor (number of
        // light-cache items), .x/.y/.z = reciprocal/increment/shift.
        uint4 light_words;
        light_words.w = ctx->kawpow_cache_size / sizeof(hash64_t);
        calculate_fast_mod_data(light_words.w, light_words.x, light_words.y, light_words.z);

        // Build the DAG in slices of blocks*threads items, synchronizing after
        // each slice so the driver stays responsive.
        // When dag_precalc is set, skip the first cache_items (supplied by host).
        for (size_t i = dag_precalc ? cache_items : 0; i < dag_items; i += blocks * threads) {
            // NOTE(review): extraction dropped the <<<...>>> launch configuration
            // here (presumably <<<blocks, threads>>>) — restore from upstream.
            CUDA_CHECK_KERNEL(ctx->device_id, ethash_calculate_dag_item<<>>(
                i,
                (hash64_t*) ctx->kawpow_dag,
                ctx->kawpow_dag_size,
                (hash64_t*)(dag_precalc ? ctx->kawpow_dag : ctx->kawpow_cache),
                light_words
            ));
            CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());
        }

        if (dag_precalc) {
            // Replace the staged light cache with the real precalculated DAG head.
            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_dag), dag_precalc, cache_items * sizeof(hash64_t), cudaMemcpyHostToDevice));
        }
    }

    // KawPow changes its program every PERIOD_LENGTH blocks.
    constexpr uint32_t PERIOD_LENGTH = 3;
    const uint32_t period = height / PERIOD_LENGTH;

    if (ctx->kawpow_period != period) {
        if (ctx->kawpow_module) {
            cuModuleUnload(ctx->kawpow_module);
        }

        // NOTE(review): extraction dropped the template argument of std::vector
        // here (presumably std::vector<char>) — restore from upstream.
        std::vector ptx;
        std::string lowered_name;
        KawPow_get_program(ptx, lowered_name, period, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);

        CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->kawpow_module, ptx.data(), 0, 0, 0));
        CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kawpow_kernel, ctx->kawpow_module, lowered_name.c_str()));

        ctx->kawpow_period = period;

        // Kick off background compilation of the NEXT period's program so the
        // period rollover does not stall hashing (last arg = background).
        KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
    }

    if (!ctx->kawpow_stop_host) {
        // Pinned, device-mapped pair of words: [0] = stop flag, [1] = skipped-hash counter.
        CUDA_CHECK(ctx->device_id, cudaMallocHost(&ctx->kawpow_stop_host, sizeof(uint32_t) * 2));
        CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->kawpow_stop_device, ctx->kawpow_stop_host, 0));
    }
}


// Raises the host-side stop flag; the running kernel observes it through the
// mapped device pointer and aborts early.
void kawpow_stop_hash(nvid_ctx *ctx)
{
    if (ctx->kawpow_stop_host) {
        *ctx->kawpow_stop_host = 1;
    }
}


namespace KawPow_Raven {

// Runs one batch of the JIT-compiled KawPow search kernel.
// job_blob: first 40 bytes are copied to d_input (header hash + start nonce).
// On return: *rescount (clamped to 15), resnonce[0..rescount), and
// *skipped_hashes = hashes skipped after a stop request.
void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes)
{
    dim3 grid(ctx->device_blocks);
    dim3 block(ctx->device_threads);

    uint32_t hack_false = 0;
    void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop_device };

    CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, job_blob, 40, cudaMemcpyHostToDevice));
    CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, sizeof(uint32_t)));
    // kawpow_stop_host is pinned host memory, so plain memset is correct here.
    memset(ctx->kawpow_stop_host, 0, sizeof(uint32_t) * 2);

    CU_CHECK(ctx->device_id, cuLaunchKernel(
        ctx->kawpow_kernel,
        grid.x, grid.y, grid.z,
        block.x, block.y, block.z,
        0, nullptr, args, 0
    ));
    CU_CHECK(ctx->device_id, cuCtxSynchronize());

    *skipped_hashes = ctx->kawpow_stop_host[1];

    // results[0] = found-nonce count, results[1..15] = nonces.
    uint32_t results[16];
    CUDA_CHECK(ctx->device_id, cudaMemcpy(results, ctx->d_result_nonce, sizeof(results), cudaMemcpyDeviceToHost));

    if (results[0] > 15) {
        results[0] = 15;
    }

    *rescount = results[0];
    memcpy(resnonce, results + 1, results[0] * sizeof(uint32_t));
}

}
--------------------------------------------------------------------------------
/src/cryptonight.h:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright 2010      Jeff Garzik
 * Copyright 2012-2014 pooler
 * Copyright 2014      Lucas Jones
 * Copyright 2014-2016 Wolf9466
 * Copyright 2016      Jay D Dee
 * Copyright 2017-2018 XMR-Stak
 * Copyright 2018      Lee Clagett
 * Copyright 2019      Spudz76
 * Copyright 2018-2020 SChernykh
 * Copyright 2016-2020 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */


#pragma once

#include "crypto/common/Algorithm.h"


// NOTE(review): extraction dropped the <...> targets of the system includes
// below (likely <cstddef>/<cstdint> and <cuda.h>) — restore from upstream.
#include 


#if defined(XMRIG_ALGO_KAWPOW) || defined(XMRIG_ALGO_CN_R)
#   include 
#endif


// Per-GPU mining context: device properties, launch configuration and every
// device-side buffer shared by the CryptoNight / RandomX / KawPow backends.
struct nvid_ctx {
#   ifdef XMRIG_ALGO_CN_R
    // Driver-API handles for the JIT-compiled CryptonightR program.
    CUdevice cuDevice       = -1;
    CUmodule module         = nullptr;
    CUfunction kernel       = nullptr;
#   endif

    xmrig_cuda::Algorithm algorithm = xmrig_cuda::Algorithm::INVALID;
    uint64_t kernel_height  = 0;     // height the current JIT kernel was built for

    // Static device properties and user-tunable launch parameters.
    int device_id           = 0;
    const char *device_name = nullptr;
    int device_arch[2]      { 0,};   // [0]=major, [1]=minor compute capability
    int device_mpcount      = 0;
    int device_blocks       = 0;
    int device_threads      = 0;
    int device_bfactor      = 0;
    int device_bsleep       = 0;
    int device_clockRate    = 0;
    int device_memoryClockRate = 0;
    size_t device_memoryTotal  = 0;
    size_t device_memoryFree   = 0;
    int device_pciBusID     = 0;
    int device_pciDeviceID  = 0;
    int device_pciDomainID  = 0;
    uint32_t syncMode       = 3;
    bool ready              = false;

    // CryptoNight device buffers (d_ prefix = device memory).
    uint32_t *d_input        = nullptr;
    int inputlen             = 0;
    uint32_t *d_result_count = nullptr;
    uint32_t *d_result_nonce = nullptr;
    uint32_t *d_long_state   = nullptr;
    uint64_t d_scratchpads_size = 0;
    uint32_t *d_ctx_state    = nullptr;
    uint32_t *d_ctx_state2   = nullptr;
    uint32_t *d_ctx_a        = nullptr;
    uint32_t *d_ctx_b        = nullptr;
    uint32_t *d_ctx_key1     = nullptr;
    uint32_t *d_ctx_key2     = nullptr;
    uint32_t *d_ctx_text     = nullptr;

    // RandomX device buffers.
    uint32_t rx_batch_size   = 0;
    int32_t rx_dataset_host  = -1;
    uint32_t *d_rx_dataset   = nullptr;
    uint32_t *d_rx_hashes    = nullptr;
    uint32_t *d_rx_entropy   = nullptr;
    uint32_t *d_rx_vm_states = nullptr;
    uint32_t *d_rx_rounding  = nullptr;

#   ifdef XMRIG_ALGO_KAWPOW
    // KawPow light cache / DAG buffers (capacity is the rounded-up allocation,
    // size is the currently valid payload — see kawpow_prepare()).
    void* kawpow_cache           = nullptr;
    size_t kawpow_cache_size     = 0;
    size_t kawpow_cache_capacity = 0;

    void* kawpow_dag             = nullptr;
    size_t kawpow_dag_size       = 0;
    size_t kawpow_dag_capacity   = 0;

    // Pinned host word pair + its device mapping: [0]=stop flag, [1]=skipped hashes.
    uint32_t* kawpow_stop_host   = nullptr;
    uint32_t* kawpow_stop_device = nullptr;

    uint32_t kawpow_period       = 0;

    CUmodule kawpow_module       = nullptr;
    CUfunction kawpow_kernel     = nullptr;
#   endif
};


int cuda_get_devicecount();
int cuda_get_runtime_version();
int cuda_get_driver_version();
int cuda_get_deviceinfo(nvid_ctx *ctx);
int cryptonight_gpu_init(nvid_ctx *ctx);
void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
void cryptonight_extra_cpu_prepare(nvid_ctx *ctx, uint32_t startNonce, const xmrig_cuda::Algorithm &algorithm);
void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm, uint64_t height, uint32_t startNonce);
void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, const xmrig_cuda::Algorithm &algorithm);

void cuda_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size);
void randomx_update_dataset(nvid_ctx* ctx, const void* dataset, size_t dataset_size);

// One hash() entry point per RandomX coin variant (separate translation units).
namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Keva { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Yada { void hash(nvid_ctx* ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t* rescount, uint32_t* resnonce, uint32_t batch_size); }

#ifdef XMRIG_ALGO_KAWPOW
void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes);
void kawpow_stop_hash(nvid_ctx *ctx);

namespace KawPow_Raven { void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes); }
#endif
--------------------------------------------------------------------------------
/src/crypto/common/Algorithm.h:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright (c) 2018      Lee Clagett
 * Copyright (c) 2018-2021 SChernykh
 * Copyright (c) 2016-2021 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef XMRIG_ALGORITHM_H
#define XMRIG_ALGORITHM_H


// NOTE(review): extraction dropped the <...> targets of the three system
// includes below — restore from upstream.
#include 
#include 
#include 


namespace xmrig_cuda {


// Mining algorithm identifier. Each Id packs its family, memory requirements
// and variant into the 32-bit value (see the constexpr helpers below, which
// decode family and L2/L3 sizes straight from the bit pattern).
class Algorithm
{
public:
    enum Id : uint32_t {
        INVALID       = 0,
        CN_0          = 0x63150000, // "cn/0"             CryptoNight (original).
        CN_1          = 0x63150100, // "cn/1"             CryptoNight variant 1 also known as Monero7 and CryptoNightV7.
        CN_2          = 0x63150200, // "cn/2"             CryptoNight variant 2.
        CN_R          = 0x63150272, // "cn/r"             CryptoNightR (Monero's variant 4).
        CN_FAST       = 0x63150166, // "cn/fast"          CryptoNight variant 1 with half iterations.
        CN_HALF       = 0x63150268, // "cn/half"          CryptoNight variant 2 with half iterations (Masari/Torque).
        CN_XAO        = 0x63150078, // "cn/xao"           CryptoNight variant 0 (modified, Alloy only).
        CN_RTO        = 0x63150172, // "cn/rto"           CryptoNight variant 1 (modified, Arto only).
        CN_RWZ        = 0x63150277, // "cn/rwz"           CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
        CN_ZLS        = 0x6315027a, // "cn/zls"           CryptoNight variant 2 with 3/4 iterations (Zelerius).
        CN_DOUBLE     = 0x63150264, // "cn/double"        CryptoNight variant 2 with double iterations (X-CASH).
        CN_CCX        = 0x63150063, // "cn/ccx"           Conceal (CCX)
        CN_LITE_0     = 0x63140000, // "cn-lite/0"        CryptoNight-Lite variant 0.
        CN_LITE_1     = 0x63140100, // "cn-lite/1"        CryptoNight-Lite variant 1.
        CN_HEAVY_0    = 0x63160000, // "cn-heavy/0"       CryptoNight-Heavy (4 MB).
        CN_HEAVY_TUBE = 0x63160172, // "cn-heavy/tube"    CryptoNight-Heavy (modified, TUBE only).
        CN_HEAVY_XHV  = 0x63160068, // "cn-heavy/xhv"     CryptoNight-Heavy (modified, Haven Protocol only).
        CN_PICO_0     = 0x63120200, // "cn-pico"          CryptoNight-Pico
        CN_PICO_TLO   = 0x63120274, // "cn-pico/tlo"      CryptoNight-Pico (TLO)
        CN_UPX2       = 0x63110200, // "cn/upx2"          Uplexa (UPX2)
        CN_GPU        = 0x63150300, // "cn/gpu"           CryptoNight-GPU (Ryo).
        RX_0          = 0x72151200, // "rx/0"             RandomX (reference configuration).
        RX_WOW        = 0x72141177, // "rx/wow"           RandomWOW (Wownero).
        RX_ARQ        = 0x72121061, // "rx/arq"           RandomARQ (Arqma).
        RX_GRAFT      = 0x72151267, // "rx/graft"         RandomGRAFT (Graft).
        RX_SFX        = 0x72151273, // "rx/sfx"           RandomSFX (Safex Cash).
        RX_KEVA       = 0x7214116b, // "rx/keva"          RandomKEVA (Keva).
        RX_YADA       = 0x72151279, // "rx/yada"          RandomYada (YadaCoin).
        AR2_CHUKWA    = 0x61130000, // "argon2/chukwa"    Argon2id (Chukwa).
        AR2_CHUKWA_V2 = 0x61140000, // "argon2/chukwav2"  Argon2id (Chukwa v2).
        AR2_WRKZ      = 0x61120000, // "argon2/wrkz"      Argon2id (WRKZ)
        KAWPOW_RVN    = 0x6b0f0000, // "kawpow/rvn"       KawPow (RVN)

        RX_XLA        = 0x721211ff, // "panthera"         Panthera (Scala2).
    };

    // Family = top byte (+ variant bits for CryptoNight); values match the
    // masks applied by family(Id) below.
    enum Family : uint32_t {
        UNKNOWN  = 0,
        CN_ANY   = 0x63000000,
        CN       = 0x63150000,
        CN_LITE  = 0x63140000,
        CN_HEAVY = 0x63160000,
        CN_PICO  = 0x63120000,
        CN_FEMTO = 0x63110000,
        RANDOM_X = 0x72000000,
        ARGON2   = 0x61000000,
        KAWPOW   = 0x6b000000
    };

    inline Algorithm() = default;
    inline Algorithm(Id id) : m_id(id) {}
    Algorithm(uint32_t id) : m_id(parse(id)) {}

    // Bit-pattern decoders: l2/l3 extract log2-encoded cache sizes from the id.
    static inline constexpr bool isCN(Id id)        { return (id & 0xff000000) == CN_ANY; }
    static inline constexpr Id base(Id id)          { return isCN(id) ? static_cast<Id>(CN_0 | (id & 0xff00)) : INVALID; }
    static inline constexpr size_t l2(Id id)        { return family(id) == RANDOM_X ? (1U << ((id >> 8) & 0xff)) : 0U; }
    static inline constexpr size_t l3(Id id)        { return 1ULL << ((id >> 16) & 0xff); }
    static inline constexpr uint32_t family(Id id)  { return id & (isCN(id) ? 0xffff0000 : 0xff000000); }

    inline bool isCN() const                        { return isCN(m_id); }
    inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
    inline bool isValid() const                     { return m_id != INVALID && family() > UNKNOWN; }
    inline Id base() const                          { return base(m_id); }
    inline Id id() const                            { return m_id; }
    inline size_t l2() const                        { return l2(m_id); }
    inline size_t l3() const                        { return l3(m_id); }
    inline uint32_t family() const                  { return family(m_id); }
    inline uint32_t maxIntensity() const            { return isCN() ? 5 : 1; };

    inline bool operator!=(Algorithm::Id id) const        { return m_id != id; }
    inline bool operator!=(const Algorithm &other) const  { return !isEqual(other); }
    inline bool operator==(Algorithm::Id id) const        { return m_id == id; }
    inline bool operator==(const Algorithm &other) const  { return isEqual(other); }
    inline operator Algorithm::Id() const                 { return m_id; }

    // Validates/normalizes a raw 32-bit id (defined in Algorithm.cpp).
    static Id parse(uint32_t id);

private:
    Id m_id = INVALID;
};


} // namespace xmrig_cuda


#endif /* XMRIG_ALGORITHM_H */
--------------------------------------------------------------------------------
/src/3rdparty/cub/util_arch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Static architectural properties by SM version.
 */

#pragma once

#include "util_cpp_dialect.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
    !defined(CUB_USE_COOPERATIVE_GROUPS)
    #define CUB_USE_COOPERATIVE_GROUPS
#endif

/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
#ifndef CUB_PTX_ARCH
    #if defined(__NVCOMPILER_CUDA__)
        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
        // when compiling both host code and device code. Currently, only one
        // PTX version can be targeted.
        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
    #elif !defined(__CUDA_ARCH__)
        #define CUB_PTX_ARCH 0
    #else
        #define CUB_PTX_ARCH __CUDA_ARCH__
    #endif
#endif

#ifndef CUB_IS_DEVICE_CODE
    #if defined(__NVCOMPILER_CUDA__)
        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
        #define CUB_INCLUDE_DEVICE_CODE 1
        #define CUB_INCLUDE_HOST_CODE 1
    #elif CUB_PTX_ARCH > 0
        #define CUB_IS_DEVICE_CODE 1
        #define CUB_IS_HOST_CODE 0
        #define CUB_INCLUDE_DEVICE_CODE 1
        #define CUB_INCLUDE_HOST_CODE 0
    #else
        #define CUB_IS_DEVICE_CODE 0
        #define CUB_IS_HOST_CODE 1
        #define CUB_INCLUDE_DEVICE_CODE 0
        #define CUB_INCLUDE_HOST_CODE 1
    #endif
#endif

/// Maximum number of devices supported.
#ifndef CUB_MAX_DEVICES
    #define CUB_MAX_DEVICES 128
#endif

#if CUB_CPP_DIALECT >= 2011
    static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
#endif

/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
#ifndef CUB_RUNTIME_FUNCTION
    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
        #define CUB_RUNTIME_ENABLED
        #define CUB_RUNTIME_FUNCTION __host__ __device__
    #else
        #define CUB_RUNTIME_FUNCTION __host__
    #endif
#endif


/// Number of threads per warp
#ifndef CUB_LOG_WARP_THREADS
    #define CUB_LOG_WARP_THREADS(arch)                      \
        (5)
    #define CUB_WARP_THREADS(arch)                          \
        (1 << CUB_LOG_WARP_THREADS(arch))

    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
#endif


/// Number of smem banks
#ifndef CUB_LOG_SMEM_BANKS
    #define CUB_LOG_SMEM_BANKS(arch)                        \
        ((arch >= 200) ?                                    \
            (5) :                                           \
            (4))
    #define CUB_SMEM_BANKS(arch)                            \
        (1 << CUB_LOG_SMEM_BANKS(arch))

    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
#endif


/// Oversubscription factor
#ifndef CUB_SUBSCRIPTION_FACTOR
    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
        ((arch >= 300) ?                                    \
            (5) :                                           \
            ((arch >= 200) ?                                \
                (3) :                                       \
                (10)))
    #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
#endif


/// Prefer padding overhead vs X-way conflicts greater than this threshold
#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
        ((arch >= 300) ?                                    \
            (1) :                                           \
            (4))
    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
#endif


// Scales per-thread work so register usage stays bounded for large T:
// items-per-thread shrinks with sizeof(T); block size is capped so the tile
// fits in 48 KB, rounded up to a warp multiple.
template <
    int NOMINAL_4B_BLOCK_THREADS,
    int NOMINAL_4B_ITEMS_PER_THREAD,
    typename T>
struct RegBoundScaling
{
    enum {
        ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
    };
};


// Like RegBoundScaling, but for memory-bound kernels: items-per-thread is
// additionally capped at twice the nominal value.
template <
    int NOMINAL_4B_BLOCK_THREADS,
    int NOMINAL_4B_ITEMS_PER_THREAD,
    typename T>
struct MemBoundScaling
{
    enum {
        ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
    };
};




#endif  // Do not document

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
--------------------------------------------------------------------------------
/src/cuda_keccak.hpp:
--------------------------------------------------------------------------------
// Keccak-f[1600] round constants (iota step); device __constant__ memory when
// compiled by nvcc, plain const for host compilers.
#ifdef __CUDACC__
__constant__
#else
const
#endif
uint64_t keccakf_rndc[24] ={
    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
    0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

#if __CUDA_ARCH__ >= 350
__forceinline__
__device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) 19 | { 20 | uint2 result; 21 | if(offset >= 32) 22 | { 23 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 24 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 25 | } 26 | else 27 | { 28 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 29 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 30 | } 31 | return __double_as_longlong(__hiloint2double(result.y, result.x)); 32 | } 33 | #define rotl64_1(x, y) (cuda_rotl64((x), (y))) 34 | #else 35 | #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) 36 | #endif 37 | 38 | #define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) 39 | #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) 40 | 41 | __device__ __forceinline__ void cn_keccakf2(uint64_t *s) 42 | { 43 | uint8_t i; 44 | 45 | for(i = 0; i < 24; ++i) 46 | { 47 | uint64_t bc[5], tmpxor[5], tmp1, tmp2; 48 | 49 | tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; 50 | tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; 51 | tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; 52 | tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; 53 | tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; 54 | 55 | bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); 56 | bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); 57 | bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); 58 | bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); 59 | bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); 60 | 61 | tmp1 = s[1] ^ bc[0]; 62 | 63 | s[0] ^= bc[4]; 64 | s[1] = rotl64_2(s[6] ^ bc[0], 
12); 65 | s[6] = rotl64_1(s[9] ^ bc[3], 20); 66 | s[9] = rotl64_2(s[22] ^ bc[1], 29); 67 | s[22] = rotl64_2(s[14] ^ bc[3], 7); 68 | s[14] = rotl64_1(s[20] ^ bc[4], 18); 69 | s[20] = rotl64_2(s[2] ^ bc[1], 30); 70 | s[2] = rotl64_2(s[12] ^ bc[1], 11); 71 | s[12] = rotl64_1(s[13] ^ bc[2], 25); 72 | s[13] = rotl64_1(s[19] ^ bc[3], 8); 73 | s[19] = rotl64_2(s[23] ^ bc[2], 24); 74 | s[23] = rotl64_2(s[15] ^ bc[4], 9); 75 | s[15] = rotl64_1(s[4] ^ bc[3], 27); 76 | s[4] = rotl64_1(s[24] ^ bc[3], 14); 77 | s[24] = rotl64_1(s[21] ^ bc[0], 2); 78 | s[21] = rotl64_2(s[8] ^ bc[2], 23); 79 | s[8] = rotl64_2(s[16] ^ bc[0], 13); 80 | s[16] = rotl64_2(s[5] ^ bc[4], 4); 81 | s[5] = rotl64_1(s[3] ^ bc[2], 28); 82 | s[3] = rotl64_1(s[18] ^ bc[2], 21); 83 | s[18] = rotl64_1(s[17] ^ bc[1], 15); 84 | s[17] = rotl64_1(s[11] ^ bc[0], 10); 85 | s[11] = rotl64_1(s[7] ^ bc[1], 6); 86 | s[7] = rotl64_1(s[10] ^ bc[4], 3); 87 | s[10] = rotl64_1(tmp1, 1); 88 | 89 | tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); 90 | tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); 91 | tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); 92 | tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); 93 | tmp1 = s[20]; tmp2 
= s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); 94 | s[0] ^= keccakf_rndc[i]; 95 | } 96 | } 97 |
// Full Keccak-f[1600] permutation over the 25-word state `s`: 24 rounds (loop
// below), each applying the column-parity step (tmpxor/bc), a fused
// rotate-and-permute chain (the rotl64_1/rotl64_2 assignments), the bitselect
// rows, and the round-constant XOR with keccakf_rndc[i].
// NOTE(review): the rotate chain is strictly order-sensitive -- every
// assignment consumes the pre-rotation value of another lane, with tmp1
// carrying s[1] around the cycle; do not reorder these statements.
98 | __device__ __forceinline__ void cn_keccakf(uint64_t *s) 99 | { 100 | uint64_t bc[5], tmpxor[5], tmp1, tmp2; 101 | 102 | for(int i = 0; i < 24; ++i) 103 | { 104 | tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; 105 | tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; 106 | tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; 107 | tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; 108 | tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; 109 | 110 | bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); 111 | bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); 112 | bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); 113 | bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); 114 | bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); 115 | 116 | tmp1 = s[1] ^ bc[0]; 117 | 118 | s[0] ^= bc[4]; 119 | s[1] = rotl64_2(s[6] ^ bc[0], 12); 120 | s[6] = rotl64_1(s[9] ^ bc[3], 20); 121 | s[9] = rotl64_2(s[22] ^ bc[1], 29); 122 | s[22] = rotl64_2(s[14] ^ bc[3], 7); 123 | s[14] = rotl64_1(s[20] ^ bc[4], 18); 124 | s[20] = rotl64_2(s[2] ^ bc[1], 30); 125 | s[2] = rotl64_2(s[12] ^ bc[1], 11); 126 | s[12] = rotl64_1(s[13] ^ bc[2], 25); 127 | s[13] = rotl64_1(s[19] ^ bc[3], 8); 128 | s[19] = rotl64_2(s[23] ^ bc[2], 24); 129 | s[23] = rotl64_2(s[15] ^ bc[4], 9); 130 | s[15] = rotl64_1(s[4] ^ bc[3], 27); 131 | s[4] = rotl64_1(s[24] ^ bc[3], 14); 132 | s[24] = rotl64_1(s[21] ^ bc[0], 2); 133 | s[21] = rotl64_2(s[8] ^ bc[2], 23); 134 | s[8] = rotl64_2(s[16] ^ bc[0], 13); 135 | s[16] = rotl64_2(s[5] ^ bc[4], 4); 136 | s[5] = rotl64_1(s[3] ^ bc[2], 28); 137 | s[3] = rotl64_1(s[18] ^ bc[2], 21); 138 | s[18] = rotl64_1(s[17] ^ bc[1], 15); 139 | s[17] = rotl64_1(s[11] ^ bc[0], 10); 140 | s[11] = rotl64_1(s[7] ^ bc[1], 6); 141 | s[7] = rotl64_1(s[10] ^ bc[4], 3); 142 | s[10] = rotl64_1(tmp1, 1); 143 | 144 | tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); 145 | tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); 146 | tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); 147 | tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); 148 | tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); 149 | s[0] ^= keccakf_rndc[i]; 150 | } 151 | } 152 |
// Absorb-only Keccak hash used by CryptoNight: zero-initialises the 25-word
// state, XORs the input in 17-word (136-byte) rate blocks, permuting after
// each block, then writes the whole state out to `md`.
// NOTE(review): MEMCPY8(md, st, 25) presumably copies 25 eight-byte words
// (200 bytes) -- confirm MEMCPY8's semantics where it is defined; `md` must be
// at least that large. As the in-code comment below states, no padding is
// applied here: the caller must supply input already padded to a multiple of
// 136 bytes, otherwise st[j] reads past the final block.
153 | __device__ __forceinline__ void cn_keccak(const uint64_t * __restrict__ input, int inlen, uint8_t * __restrict__ md) 154 | { 155 | uint64_t st[25]; 156 | 157 | #pragma unroll 158 | for (int i = 0; i < 25; ++i) { 159 | st[i] = 0; 160 | } 161 | 162 | // Input length must be a multiple of 136 and padded on the host side 163 | for (int i = 0; inlen > 0; i += 17, inlen -= 136) { 164 | #pragma unroll 165 | for (int j = 0; j < 17; ++j) { 166 | st[j] ^= input[i + j]; 167 | } 168 | cn_keccakf(st); 169 | } 170 | 171 | MEMCPY8(md, st, 25); 172 | return; 173 | } 174 | 
-------------------------------------------------------------------------------- /src/CudaCryptonightR_gen.cpp: --------------------------------------------------------------------------------
// NOTE(review): this dump stripped all text inside angle brackets -- the bare
// `#include` lines below and every `std::vector`, `std::lock_guard`,
// `template` and `BackgroundTask` usage in this file lost their `<...>`
// arguments. Restore them from the upstream file before attempting to compile.
1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include "crypto/cn/CryptoNight_monero.h" 10 | #include "CudaCryptonightR_gen.h" 11 | #include "cuda_device.hpp" 12 | 13 |
// Renders one line of CUDA source text per CryptonightR random-math
// instruction (MUL/ADD/SUB/ROR/ROL/XOR) operating on registers r<dst>/r<src>;
// ADD additionally folds in the instruction's C constant as an unsigned literal.
14 | static std::string get_code(const V4_Instruction* code, int code_size) 15 | { 16 | std::stringstream s; 17 | 18 | for (int i = 0; i < code_size; ++i) 19 | { 20 | const V4_Instruction inst = code[i]; 21 | 22 | const uint32_t a = inst.dst_index; 23 | const uint32_t b = inst.src_index; 24 | 25 | switch (inst.opcode) 26 | { 27 | case MUL: 28 | s << 'r' << a << "*=r" << b << ';'; 29 | break; 30 | 31 | case ADD: 32 | s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; 33 | break; 34 | 35 | case SUB: 36 | s << 'r' << a << "-=r" << b << ';'; 37 | break; 38 | 39 | case ROR: 40 | s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; 41 | break; 42 | 43 | case ROL: 44 | s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; 45 | break; 46 | 47 | case XOR: 48 | s << 'r' << a << "^=r" << b << ';'; 49 | break; 50 | } 51 | 52 | s << '\n'; 53 | } 54 | 55 | return s.str(); 56 | } 57 |
// One compiled NVRTC program cached per (height, arch_major, arch_minor):
// stores the generated PTX plus the lowered (mangled) kernel name.
58 | struct CacheEntry 59 | { 60 | CacheEntry(uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : 61 | height(height), 62 | arch_major(arch_major), 63 | arch_minor(arch_minor), 64 | ptx(ptx), 65 | lowered_name(lowered_name) 66 | {} 67 | 68 | uint64_t height; 69 | int arch_major; 70 | int arch_minor; 71 | std::vector ptx; 72 | std::string lowered_name; 73 | }; 74 |
// Minimal type-erased task queue: BackgroundTask wraps an arbitrary callable
// behind the virtual exec() of BackgroundTaskBase so the worker thread can run
// heterogeneous jobs (used for background pre-compilation of future heights).
75 | struct BackgroundTaskBase 76 | { 77 | virtual ~BackgroundTaskBase() = default; 78 | virtual void exec() = 0; 79 | }; 80 | 81 | template 82 | struct BackgroundTask : public BackgroundTaskBase 83 | { 84 | BackgroundTask(T&& func) : m_func(std::move(func)) {} 85 | void exec() override { m_func(); } 86 | 87 | T m_func; 88 | }; 89 | 90 | static std::mutex CryptonightR_cache_mutex; 91 | static std::mutex CryptonightR_build_mutex; 92 | static std::vector CryptonightR_cache; 93 | 94 | static std::mutex background_tasks_mutex; 95 | static std::vector background_tasks; 96 | static std::thread* background_thread = nullptr; 97 |
// Worker loop: every 500 ms, swaps the pending queue out under the lock,
// then executes and deletes each task outside the lock.
// NOTE(review): this loops forever and `background_thread` is never joined or
// deleted -- presumably an intentional process-lifetime worker; confirm that
// teardown order at exit is acceptable.
98 | static void background_thread_proc() 99 | { 100 | std::vector tasks; 101 | for (;;) { 102 | tasks.clear(); 103 | { 104 | std::lock_guard g(background_tasks_mutex); 105 | background_tasks.swap(tasks); 106 | } 107 | 108 | for (BackgroundTaskBase* task : tasks) { 109 | task->exec(); 110 | delete task; 111 | } 112 | 113 | std::this_thread::sleep_for(std::chrono::milliseconds(500)); 114 | } 115 | } 116 |
// Enqueues `func` for the worker thread, lazily starting the thread on first
// use; both the queue push and the thread creation are serialized by
// background_tasks_mutex.
117 | template 118 | static void background_exec(T&& func) 119 | { 120 | BackgroundTaskBase* task = new BackgroundTask(std::move(func)); 121 | 122 | std::lock_guard g(background_tasks_mutex); 123 | background_tasks.push_back(task); 124 | if (!background_thread) { 125 | background_thread = new std::thread(background_thread_proc); 126 | } 127 | } 128 | 129 |
// Compiles `source` with NVRTC for compute_<arch_major><arch_minor>, returning
// the PTX and the lowered name of kernel "CryptonightR_phase2" via the output
// parameters and appending the result to the cache. Stale entries (more than 2
// blocks older than `height`) are evicted first via swap-with-back removal.
// Concurrency: CryptonightR_build_mutex serializes the (slow) NVRTC build while
// CryptonightR_cache_mutex guards the cache; after taking the build mutex the
// cache is re-checked in case another thread finished the same build first.
// On any NVRTC failure the program handle is destroyed and CUDA_THROW is
// invoked with the NVRTC error string (compile failures also print the log).
130 | static void CryptonightR_build_program( 131 | std::vector& ptx, 132 | std::string& lowered_name, 133 | uint64_t height, 134 | int arch_major, 135 | int arch_minor, 136 | std::string source) 137 | { 138 | { 139 | std::lock_guard g(CryptonightR_cache_mutex); 140 | 141 | // Remove old programs from cache 142 | for (size_t i = 0; i < CryptonightR_cache.size();) { 143 | const CacheEntry& entry = CryptonightR_cache[i]; 144 | if (entry.height + 2 < height) { 145 | CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); 146 | CryptonightR_cache.pop_back(); 147 | } 148 | else { 149 | ++i; 150 | } 151 | } 152 | } 153 | 154 | ptx.clear(); 155 | ptx.reserve(65536); 156 | 157 | std::lock_guard g1(CryptonightR_build_mutex); 158 | { 159 | std::lock_guard g(CryptonightR_cache_mutex); 160 | 161 | // Check if the cache already has this program (some other thread might have added it first) 162 | for (const CacheEntry& entry : CryptonightR_cache) 163 | { 164 | if ((entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) 165 | { 166 | ptx = entry.ptx; 167 | lowered_name = entry.lowered_name; 168 | return; 169 | } 170 | } 171 | } 172 | 173 | nvrtcProgram prog; 174 | nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.cu", 0, nullptr, nullptr); 175 | if (result != NVRTC_SUCCESS) { 176 | CUDA_THROW(nvrtcGetErrorString(result)); 177 | } 178 | 179 | result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); 180 | if (result != NVRTC_SUCCESS) { 181 | nvrtcDestroyProgram(&prog); 182 | 183 | CUDA_THROW(nvrtcGetErrorString(result)); 184 | } 185 | 186 | char opt0[64]; 187 | sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); 188 | 189 | const char* opts[2] = { opt0, "-DVARIANT=13" }; 190 | result = nvrtcCompileProgram(prog, 2, opts); 191 | if (result != NVRTC_SUCCESS) { 192 | size_t logSize; 193 | if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { 194 | char *log = new char[logSize](); 195 | if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { 196 | printf("Program compile log: %s\n", log); 197 | } 198 | 199 | delete[] log; 200 | } 201 | 202 | nvrtcDestroyProgram(&prog); 203 | 204 | CUDA_THROW(nvrtcGetErrorString(result)); 205 | } 206 | 207 | 208 | const char* name; 209 | result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); 210 | if (result != NVRTC_SUCCESS) { 211 | nvrtcDestroyProgram(&prog); 212 | 213 | CUDA_THROW(nvrtcGetErrorString(result)); 214 | } 215 | 216 | size_t ptxSize; 217 | result = nvrtcGetPTXSize(prog, &ptxSize); 218 | if (result != NVRTC_SUCCESS) { 219 | nvrtcDestroyProgram(&prog); 220 | 221 | CUDA_THROW(nvrtcGetErrorString(result)); 222 | } 223 | 224 | ptx.resize(ptxSize); 225 | result = nvrtcGetPTX(prog, ptx.data()); 226 | if (result != NVRTC_SUCCESS) { 227 | nvrtcDestroyProgram(&prog); 228 | 229 | CUDA_THROW(nvrtcGetErrorString(result)); 230 | } 231 | 232 | lowered_name = name; 233 | 234 | nvrtcDestroyProgram(&prog); 235 | 236 | { 237 | std::lock_guard g(CryptonightR_cache_mutex); 238 | CryptonightR_cache.emplace_back(height, arch_major, arch_minor, ptx, lowered_name); 239 | } 240 | }
241 |
// Public entry point (declared in CudaCryptonightR_gen.h). In background mode
// it just schedules a warm-up compile on the worker thread (results discarded;
// they land in the cache) and returns immediately. Otherwise it splices the
// generated random-math source into the CryptonightR.cu template at the
// XMRIG_INCLUDE_RANDOM_MATH marker, serves from cache when possible, and falls
// back to a fresh NVRTC build. If the marker is missing, it returns silently
// with `ptx` left empty -- NOTE(review): callers must handle that empty result.
242 | void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, uint64_t height, int arch_major, int arch_minor, bool background) 243 | { 244 | if (background) { 245 | background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, height, arch_major, arch_minor, false); }); 246 | return; 247 | } 248 | 249 | ptx.clear(); 250 | 251 | const char* source_code_template = 252 | #include "CryptonightR.cu" 253 | ; 254 | 255 | const char include_name[] = "XMRIG_INCLUDE_RANDOM_MATH"; 256 | const char *offset = strstr(source_code_template, include_name); 257 | if (!offset){ 258 | return; 259 | } 260 | 261 | V4_Instruction code[256]; 262 | const int code_size = v4_random_math_init(code, height); 263 | 264 | std::string source_code(source_code_template, offset); 265 | source_code.append(get_code(code, code_size)); 266 | source_code.append(offset + sizeof(include_name) - 1); 267 | 268 | { 269 | std::lock_guard g(CryptonightR_cache_mutex); 270 | 271 | // Check if the cache has this program 272 | for (const CacheEntry& entry : CryptonightR_cache) { 273 | if ((entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) { 274 | ptx = entry.ptx; 275 | lowered_name = entry.lowered_name; 276 | 277 | return; 278 | } 279 | } 280 | } 281 | 282 | CryptonightR_build_program(ptx, lowered_name, height, arch_major, arch_minor, source_code); 283 | } 284 | -------------------------------------------------------------------------------- /src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../config.cuh" 40 | #include "../thread/thread_load.cuh" 41 | #include "../thread/thread_store.cuh" 42 | #include "../util_device.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 102 | */
// NOTE(review): vendored NVIDIA CUB header -- keep byte-identical to upstream
// rather than editing locally. This dump has stripped `<...>` template
// arguments throughout the class (e.g. the constructor's `template` parameter
// list, the `ThreadLoad(...)` calls that upstream qualifies with <MODIFIER>,
// and the `typedef CacheModifiedInputIterator self_type` argument list);
// consult the upstream cub source when reconciling.
103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename OffsetT = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | public: 132 | 133 | /// Wrapped native pointer 134 | ValueType* ptr; 135 | 136 | /// Constructor 137 | template 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | QualifiedValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(const_cast::Type *>(ptr)) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection (device-only: the modified load has no host equivalent) 160 | __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 199 | { 200 | return ptr - other.ptr; 201 | } 202 | 203 | /// Array subscript 204 | template 205 | __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference
// NOTE(review): takes the address of ThreadLoad's return value -- this matches
// the upstream cub source, but verify the temporary's lifetime before relying
// on the returned pointer beyond the full expression.
211 | __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | --------------------------------------------------------------------------------