├── .gitignore ├── res ├── app.ico └── app.rc ├── doc ├── releases │ └── 2_0_1 │ │ ├── SHA256SUMS │ │ └── SHA256SUMS.sig └── gpg_keys │ └── xmrig.asc ├── src ├── CudaCryptonightR_gen.h ├── KawPow │ └── raven │ │ ├── CudaKawPow_gen.h │ │ ├── KawPow_dag.h │ │ └── KawPow.cu ├── cuda_fast_div_heavy.hpp ├── RandomX │ ├── arqma │ │ ├── randomx_arqma.cu │ │ └── configuration.h │ ├── defyx │ │ ├── randomx_defyx.cu │ │ └── configuration.h │ ├── graft │ │ ├── randomx_graft.cu │ │ └── configuration.h │ ├── keva │ │ ├── randomx_keva.cu │ │ └── configuration.h │ ├── yada │ │ ├── randomx_yada.cu │ │ └── configuration.h │ ├── monero │ │ ├── randomx_monero.cu │ │ └── configuration.h │ ├── wownero │ │ ├── randomx_wownero.cu │ │ └── configuration.h │ ├── common.hpp │ ├── randomx.cu │ └── hash.hpp ├── crypto │ ├── cn │ │ ├── c_blake256.h │ │ └── CnAlgo.h │ └── common │ │ ├── Algorithm.cpp │ │ └── Algorithm.h ├── version.h ├── 3rdparty │ └── cub │ │ ├── config.cuh │ │ ├── util_deprecated.cuh │ │ ├── util_namespace.cuh │ │ ├── version.cuh │ │ ├── util_compiler.cuh │ │ ├── util_macro.cuh │ │ ├── grid │ │ └── grid_mapping.cuh │ │ ├── util_cpp_dialect.cuh │ │ ├── util_debug.cuh │ │ ├── thread │ │ └── thread_reduce.cuh │ │ ├── block │ │ └── block_raking_layout.cuh │ │ ├── util_arch.cuh │ │ └── iterator │ │ └── cache_modified_input_iterator.cuh ├── common │ └── utils │ │ └── timestamp.h ├── cuda_fast_int_math_v2.hpp ├── cuda_device.hpp ├── cuda_extra.h ├── xmrig-cuda.h ├── cuda_blake.hpp ├── cryptonight.h ├── cuda_keccak.hpp └── CudaCryptonightR_gen.cpp ├── cmake ├── CUDA-Version.cmake ├── os.cmake ├── cpu.cmake └── flags.cmake ├── .github └── workflows │ └── test.yml ├── README.md ├── CMakeLists.txt └── CHANGELOG.md /.gitignore: -------------------------------------------------------------------------------- 1 | /CMakeLists.txt.user 2 | /build 3 | -------------------------------------------------------------------------------- /res/app.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoneroOcean/xmrig-cuda/HEAD/res/app.ico -------------------------------------------------------------------------------- /doc/releases/2_0_1/SHA256SUMS: -------------------------------------------------------------------------------- 1 | e12d814f584cecbb6cec9c394e49989f53ae61d4079c51c7682d938a03963b96 xmrig-proxy-5.0.0-gcc-win32.zip 2 | 0a5e143c979ef163247439da3049492ecfad49355e34ae371b14a8e08529230e xmrig-proxy-5.0.0-msvc-win64.zip 3 | af7564afbb7e69aea52e6a2a945cb04caa09a993a468d5c1fdd73c3d337da05e xmrig-proxy-5.0.0-xenial-x64.tar.gz 4 | -------------------------------------------------------------------------------- /src/CudaCryptonightR_gen.h: -------------------------------------------------------------------------------- 1 | #ifndef XMRIG_CUDACRYPTONIGHTR_GEN_H 2 | #define XMRIG_CUDACRYPTONIGHTR_GEN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, uint64_t height, int arch_major, int arch_minor, bool background = false); 9 | 10 | #endif // XMRIG_CUDACRYPTONIGHTR_GEN_H 11 | -------------------------------------------------------------------------------- /src/KawPow/raven/CudaKawPow_gen.h: -------------------------------------------------------------------------------- 1 | #ifndef XMRIG_CUDAKAWPOW_GEN_H 2 | #define XMRIG_CUDAKAWPOW_GEN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false); 9 | void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift); 10 | 11 | #endif // XMRIG_CUDAKAWPOW_GEN_H 12 | -------------------------------------------------------------------------------- /doc/releases/2_0_1/SHA256SUMS.sig: 
-------------------------------------------------------------------------------- 1 | -----BEGIN PGP SIGNATURE----- 2 | 3 | iQEzBAABCgAdFiEEmsTOqOZuNaXHzdwbRGpTY4vpRAkFAl3Vl4YACgkQRGpTY4vp 4 | RAlj+Qf/TeSwcQ7HoDeCk7kAVTu25gZDf/gqTyYVNPt8x4pSjc0ofxXNo/q0Yrla 5 | Dy5Ovjy0ZHJVYAC3vYdaDEaTWkZ0DVCytYDHEtsOgaA4jQm5baHGyIjREq1II8sl 6 | QU27VhiOsX39jxrV4bGJvSkgLRpljFSIlbwn8+yP+sCwPMJ4MMEoJCC60agIsZBu 7 | PsJQGVxAJ/n3nk2zvUuz/5DGqFyeOJ2MjqnLcaP6IoJ/PHxUngVi7k9qIggi6EFg 8 | Ou/M0VMNpSo9uengCKoOsidtTkoek3MXGw+eS/JVB0qNCGHaNHqj3bTRD8yk7Klv 9 | qq5hC4F84jAPPO8QHago9n4UcoYSkQ== 10 | =M0Ty 11 | -----END PGP SIGNATURE----- 12 | -------------------------------------------------------------------------------- /cmake/CUDA-Version.cmake: -------------------------------------------------------------------------------- 1 | set(DEVICE_COMPILER "nvcc") 2 | set(CUDA_COMPILER "${DEVICE_COMPILER}" CACHE STRING "Select the device compiler") 3 | 4 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") 5 | list(APPEND DEVICE_COMPILER "clang") 6 | endif() 7 | 8 | set_property(CACHE CUDA_COMPILER PROPERTY STRINGS "${DEVICE_COMPILER}") 9 | 10 | list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}") 11 | list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") 12 | 13 | set(CUDA_STATIC ON) 14 | find_package(CUDA 8.0 REQUIRED) 15 | 16 | set(LIBS ${LIBS} ${CUDA_LIBRARIES}) 17 | -------------------------------------------------------------------------------- /src/cuda_fast_div_heavy.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | __device__ __forceinline__ uint64_t fast_div_heavy(int64_t _a, int32_t _b) 6 | { 7 | int64_t a = abs(_a); 8 | int32_t b = abs(_b); 9 | 10 | float rcp; 11 | asm("rcp.approx.f32 %0, %1;" : "=f"(rcp) : "f"(__int2float_rn(b))); 12 | float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23)); 13 | 14 | uint64_t q1 = __float2ull_rd(__int2float_rn(((int32_t*)&a)[1]) * rcp2); 15 | a -= q1 * (uint32_t)(b); 16 | 17 | 
rcp2 = __uint_as_float(__float_as_uint(rcp) + (12U << 23)); 18 | int64_t q2 = __float2ll_rn(__int2float_rn(a >> 12) * rcp2); 19 | int32_t a2 = ((int32_t*)&a)[0] - ((int32_t*)&q2)[0] * b; 20 | 21 | int32_t q3 = __float2int_rn(__int2float_rn(a2) * rcp); 22 | q3 += (a2 - q3 * b) >> 31; 23 | 24 | const int64_t q = q1 + q2 + q3; 25 | return ((((int32_t*)&_a)[1] ^ _b) < 0) ? -q : q; 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: push 2 | 3 | name: Test builds 4 | 5 | jobs: 6 | build_win_cuda11_4: 7 | name: Windows CUDA 11.4 8 | runs-on: windows-2019 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@master 12 | - name: Install CUDA 13 | run: | 14 | powershell -Command "Invoke-WebRequest https://developer.download.nvidia.com/compute/cuda/11.4.0/network_installers/cuda_11.4.0_win10_network.exe -OutFile .\cuda_setup.exe" 15 | start /wait .\cuda_setup.exe -s 16 | shell: cmd 17 | - name: Build project on Windows 18 | run: | 19 | cmake . 
-G "Visual Studio 16 2019" -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4" 20 | cd "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin" 21 | .\MSBuild.exe /p:Configuration=Release $Env:GITHUB_WORKSPACE\xmrig-cuda.sln || exit 1 22 | -------------------------------------------------------------------------------- /res/app.rc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../src/version.h" 3 | 4 | 101 ICON "app.ico" 5 | 6 | VS_VERSION_INFO VERSIONINFO 7 | FILEVERSION APP_VER_MAJOR,APP_VER_MINOR,APP_VER_PATCH,0 8 | PRODUCTVERSION APP_VER_MAJOR,APP_VER_MINOR,APP_VER_PATCH,0 9 | FILEFLAGSMASK 0x3fL 10 | #ifdef _DEBUG 11 | FILEFLAGS VS_FF_DEBUG 12 | #else 13 | FILEFLAGS 0x0L 14 | #endif 15 | FILEOS VOS__WINDOWS32 16 | FILETYPE VFT_APP 17 | FILESUBTYPE 0x0L 18 | BEGIN 19 | BLOCK "StringFileInfo" 20 | BEGIN 21 | BLOCK "000004b0" 22 | BEGIN 23 | VALUE "CompanyName", APP_SITE 24 | VALUE "FileDescription", APP_DESC 25 | VALUE "FileVersion", APP_VERSION 26 | VALUE "LegalCopyright", APP_COPYRIGHT 27 | VALUE "OriginalFilename", "xmrig-cuda.dll" 28 | VALUE "ProductName", APP_NAME 29 | VALUE "ProductVersion", APP_VERSION 30 | END 31 | END 32 | BLOCK "VarFileInfo" 33 | BEGIN 34 | VALUE "Translation", 0x0, 1200 35 | END 36 | END 37 | 38 | -------------------------------------------------------------------------------- /src/RandomX/arqma/randomx_arqma.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Arqma { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/defyx/randomx_defyx.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 
18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_DefyX { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/graft/randomx_graft.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Graft { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/keva/randomx_keva.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019-2020 SChernykh 3 | 4 | This file is part of RandomX CUDA. 
5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Keva { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/yada/randomx_yada.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019-2020 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 
18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Yada { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/monero/randomx_monero.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Monero { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v104 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /src/RandomX/wownero/randomx_wownero.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 
5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | #include "cryptonight.h" 21 | #include "cuda_device.hpp" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | namespace RandomX_Wownero { 30 | #include "configuration.h" 31 | #define fillAes4Rx4 fillAes4Rx4_v103 32 | #include "RandomX/common.hpp" 33 | } 34 | -------------------------------------------------------------------------------- /cmake/os.cmake: -------------------------------------------------------------------------------- 1 | if (WIN32) 2 | set(XMRIG_OS_WIN ON) 3 | elseif (APPLE) 4 | set(XMRIG_OS_APPLE ON) 5 | 6 | if (IOS OR CMAKE_SYSTEM_NAME STREQUAL iOS) 7 | set(XMRIG_OS_IOS ON) 8 | else() 9 | set(XMRIG_OS_MACOS ON) 10 | endif() 11 | else() 12 | set(XMRIG_OS_UNIX ON) 13 | 14 | if (ANDROID OR CMAKE_SYSTEM_NAME MATCHES "Android") 15 | set(XMRIG_OS_ANDROID ON) 16 | elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") 17 | set(XMRIG_OS_LINUX ON) 18 | elseif(CMAKE_SYSTEM_NAME STREQUAL FreeBSD) 19 | set(XMRIG_OS_FREEBSD ON) 20 | endif() 21 | endif() 22 | 23 | 24 | if (XMRIG_OS_WIN) 25 | add_definitions(/DWIN32) 26 | add_definitions(/DXMRIG_OS_WIN) 27 | elseif(XMRIG_OS_APPLE) 28 | add_definitions(/DXMRIG_OS_APPLE) 29 | 30 | if (XMRIG_OS_IOS) 31 | add_definitions(/DXMRIG_OS_IOS) 32 | else() 33 | add_definitions(/DXMRIG_OS_MACOS) 34 | endif() 35 | elseif(XMRIG_OS_UNIX) 36 | add_definitions(/DXMRIG_OS_UNIX) 37 | 38 
| if (XMRIG_OS_ANDROID) 39 | add_definitions(/DXMRIG_OS_ANDROID) 40 | elseif (XMRIG_OS_LINUX) 41 | add_definitions(/DXMRIG_OS_LINUX) 42 | elseif (XMRIG_OS_FREEBSD) 43 | add_definitions(/DXMRIG_OS_FREEBSD) 44 | endif() 45 | endif() 46 | -------------------------------------------------------------------------------- /cmake/cpu.cmake: -------------------------------------------------------------------------------- 1 | if (NOT CMAKE_SYSTEM_PROCESSOR) 2 | message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined") 3 | endif() 4 | 5 | 6 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") 7 | add_definitions(/DRAPIDJSON_SSE2) 8 | endif() 9 | 10 | if (NOT ARM_TARGET) 11 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv8-a)$") 12 | set(ARM_TARGET 8) 13 | elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l)$") 14 | set(ARM_TARGET 7) 15 | endif() 16 | endif() 17 | 18 | if (ARM_TARGET AND ARM_TARGET GREATER 6) 19 | set(XMRIG_ARM ON) 20 | set(WITH_LIBCPUID OFF) 21 | add_definitions(/DXMRIG_ARM) 22 | 23 | message(STATUS "Use ARM_TARGET=${ARM_TARGET} (${CMAKE_SYSTEM_PROCESSOR})") 24 | 25 | include(CheckCXXCompilerFlag) 26 | 27 | if (ARM_TARGET EQUAL 8) 28 | set(XMRIG_ARMv8 ON) 29 | add_definitions(/DXMRIG_ARMv8) 30 | 31 | CHECK_CXX_COMPILER_FLAG(-march=armv8-a+crypto XMRIG_ARM_CRYPTO) 32 | 33 | if (XMRIG_ARM_CRYPTO) 34 | add_definitions(/DXMRIG_ARM_CRYPTO) 35 | set(ARM8_CXX_FLAGS "-march=armv8-a+crypto") 36 | else() 37 | set(ARM8_CXX_FLAGS "-march=armv8-a") 38 | endif() 39 | elseif (ARM_TARGET EQUAL 7) 40 | set(XMRIG_ARMv7 ON) 41 | add_definitions(/DXMRIG_ARMv7) 42 | endif() 43 | endif() 44 | -------------------------------------------------------------------------------- /src/crypto/cn/c_blake256.h: -------------------------------------------------------------------------------- 1 | #ifndef _BLAKE256_H_ 2 | #define _BLAKE256_H_ 3 | 4 | #include 5 | 6 | typedef struct { 7 | uint32_t h[8], s[4], t[2]; 8 | int buflen, nullt; 9 | uint8_t 
buf[64]; 10 | } state; 11 | 12 | typedef struct { 13 | state inner; 14 | state outer; 15 | } hmac_state; 16 | 17 | void blake256_init(state *); 18 | void blake224_init(state *); 19 | 20 | void blake256_update(state *, const uint8_t *, uint64_t); 21 | void blake224_update(state *, const uint8_t *, uint64_t); 22 | 23 | void blake256_final(state *, uint8_t *); 24 | void blake224_final(state *, uint8_t *); 25 | 26 | void blake256_hash(uint8_t *, const uint8_t *, uint64_t); 27 | void blake224_hash(uint8_t *, const uint8_t *, uint64_t); 28 | 29 | /* HMAC functions: */ 30 | 31 | void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t); 32 | void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t); 33 | 34 | void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t); 35 | void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t); 36 | 37 | void hmac_blake256_final(hmac_state *, uint8_t *); 38 | void hmac_blake224_final(hmac_state *, uint8_t *); 39 | 40 | void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); 41 | void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); 42 | 43 | #endif /* _BLAKE256_H_ */ 44 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018-2025 SChernykh 3 | * Copyright (c) 2016-2025 XMRig , 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | 19 | #ifndef XMRIG_VERSION_H 20 | #define XMRIG_VERSION_H 21 | 22 | #define APP_ID "xmrig-cuda" 23 | #define APP_NAME "XMRig" 24 | #define APP_DESC "XMRig CUDA plugin" 25 | #define APP_VERSION "6.22.1-mo1" 26 | #define APP_DOMAIN "xmrig.com" 27 | #define APP_SITE "www.xmrig.com" 28 | #define APP_COPYRIGHT "Copyright (C) 2016-2025 xmrig.com" 29 | 30 | #define APP_VER_MAJOR 6 31 | #define APP_VER_MINOR 22 32 | #define APP_VER_PATCH 1 33 | 34 | #define API_VERSION 4 35 | 36 | #endif /* XMRIG_VERSION_H */ 37 | -------------------------------------------------------------------------------- /doc/gpg_keys/xmrig.asc: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mQENBF3VSRIBCADfFjDUbq0WLGulFeSou0A+jTvweNllPyLNOn3SNCC0XLEYyEcu 4 | JiEBK80DlvR06TVr8Aw1rT5S2iH0i5Tl8DqShH2mmcN1rBp1M0Y95D89KVj3BIhE 5 | nxmgmD4N3Wgm+5FmEH4W/RpG1xdYWJx3eJhtWPdFJqpg083E2D5P30wIQem+EnTR 6 | 5YrtTZPh5cPj2KRY+UmsDE3ahmxCgP7LYgnnpZQlWBBiMV932s7MvYBPJQc1wecS 7 | 0wi1zxyS81xHc3839EkA7wueCeNo+5jha+KH66tMKsfrI2WvfPHTCPjK9v7WJc/O 8 | /eRp9d+wacn09D1L6CoRO0ers5p10GO84VhTABEBAAG0GVhNUmlnIDxzdXBwb3J0 9 | QHhtcmlnLmNvbT6JAU4EEwEIADgWIQSaxM6o5m41pcfN3BtEalNji+lECQUCXdVJ 10 | EgIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRBEalNji+lECbkQB/9nRou0 11 | tOlBwYn8xVgBu7IiDWNVETRWfrjrtdTvSahgbbo6lWgjA/vBLkjN9fISdBQ/n/Mt 12 | hNDJbEtxHHt2baJhvT8du1eWcIHHXCV/rmv+iY/hTXa1gKqHiHDJrtYSVBG3BMme 13 | 1rdsUHTiKf3t5yRHOXAfY2C+XNblKAV7mhlxQBiKxdFDIkFEQKNrHNUvnzkOqoCT 14 | 2kTZZ2tPUMQdOn1eek6zG/+C7SwcBpJnakJ8jce4yA/xZbOVKetNWO3Ufu3TE34k 15 | OdA+H4PU9+fV77XfOY8DtXeS3boUI97ei+4s/mwX/NFC0i8CPXyefxl3WRUBGDOI 16 | w//kPNQVh4HobOCeuQENBF3VSRIBCADl29WorEi+vRA/3kg9VUXtxSU6caibFS3N 17 | VXANiFRjrOmICdfrIgOSGNrYCQFsXu0Xe0udDYVX8yX6WJk+CT02Pdg0gkXiKoze 18 | 
KrnK15mo3xXbb2tr1o9ROPgwY/o2AwQHj0o1JhdS2cybfuRiUQRoGgBX7a9X0cTY 19 | r4ZJvOjzgAajl3ciwB3yWUmDiRlzZpO7YWESXbOhGVzyCnP5MlMEJ/fPRw9h38vK 20 | HNKLhzcRfsLpXk34ghY3SxIv4NWUfuZXFWqpSdC9JgNc5zA72lJEQcF4DHJCKl7B 21 | ddmrfsr9mdiIpo+/ZZFPPngdeZ2kvkJ2YKaZNVu2XooJARPQ8B8tABEBAAGJATYE 22 | GAEIACAWIQSaxM6o5m41pcfN3BtEalNji+lECQUCXdVJEgIbDAAKCRBEalNji+lE 23 | CdPUB/4nH1IdhHGmfko2kxdaHqQgCGLqh3pcrQXD9mBv/LYVnoHZpVRHsIDgg2Z4 24 | lQYrIRRqe69FjVxo7sA2eMIlV0GRDlUrw+HeURFpEhKPEdwFy6i/cti2MY0YxOrB 25 | TvQoRutUoMnyjM4TBJWaaqccbTsavMdLmG3JHdAkiHtUis/fUwVctmEQwN+d/J2b 26 | wJAtliqw3nXchUfdIfwHF/7hg8seUuYUaifzkazBZhVWvRkTVLVanzZ51HRfuzwD 27 | ntaa7kfYGdE+4TKOylAPh+8E6WnR19RRTpsaW0dVBgOiBTE0uc7rUv2HWS/u6RUR 28 | t7ldSBzkuDTlM2V59Iq2hXoSC6dT 29 | =cIG9 30 | -----END PGP PUBLIC KEY BLOCK----- 31 | -------------------------------------------------------------------------------- /src/3rdparty/cub/config.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Static configuration header for the CUB project. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_arch.cuh" 36 | #include "util_compiler.cuh" 37 | #include "util_cpp_dialect.cuh" 38 | #include "util_deprecated.cuh" 39 | #include "util_macro.cuh" 40 | #include "util_namespace.cuh" 41 | -------------------------------------------------------------------------------- /src/common/utils/timestamp.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright 2010 Jeff Garzik 3 | * Copyright 2012-2014 pooler 4 | * Copyright 2014 Lucas Jones 5 | * Copyright 2014-2016 Wolf9466 6 | * Copyright 2016 Jay D Dee 7 | * Copyright 2017-2018 XMR-Stak , 8 | * Copyright 2018-2020 SChernykh 9 | * Copyright 2016-2020 XMRig , 10 | * 11 | * This program is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 
15 | * 16 | * This program is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program. If not, see . 23 | */ 24 | 25 | #ifndef XMRIG_TIMESTAMP_H 26 | #define XMRIG_TIMESTAMP_H 27 | 28 | 29 | #include 30 | 31 | 32 | namespace xmrig_cuda { 33 | 34 | 35 | static inline int64_t steadyTimestamp() 36 | { 37 | using namespace std::chrono; 38 | if (high_resolution_clock::is_steady) { 39 | return time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); 40 | } 41 | else { 42 | return time_point_cast(steady_clock::now()).time_since_epoch().count(); 43 | } 44 | } 45 | 46 | 47 | static inline int64_t currentMSecsSinceEpoch() 48 | { 49 | using namespace std::chrono; 50 | 51 | return time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); 52 | } 53 | 54 | 55 | } /* namespace xmrig_cuda */ 56 | 57 | #endif /* XMRIG_TIMESTAMP_H */ 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xmrig-cuda 2 | This repository contains the CUDA plugin for the XMRig miner, which provides support for NVIDIA GPUs. 3 | 4 | This plugin is a separate project because of the main reasons listed below: 5 | 1. Not all users need CUDA support, and it is an optional feature. 6 | 2. CUDA has strict compiler version requirements that may be difficult to meet, unlike CPU mining code, which is generally very flexible. 7 | 8 | 9 | ## Windows 10 | 11 | * To [download](https://github.com/MoneroOcean/xmrig-cuda/releases) the plugin, you must choose the appropriate CUDA version. 
Generally, the latest version (12.4) is all you need, unless you have very old GPUs. Windows builds are available for every major CUDA release. Alternatively, you can [build](https://xmrig.com/docs/miner/build/windows) the plugin from the source. 12 | * Place **`xmrig-cuda.dll`** and other dll files near to **`xmrig.exe`**. 13 | * Edit **`config.json`** to enable the plugin. 14 | ``` 15 | { 16 | ... 17 | "cuda": { 18 | "enabled": true, 19 | ... 20 | } 21 | ... 22 | } 23 | ``` 24 | ### Advanced 25 | You can specify the path to the plugin using the `loader` option. 26 | ``` 27 | { 28 | ... 29 | "cuda": { 30 | "enabled": true, 31 | "loader": "c:/some/path/xmrig-cuda.dll", 32 | ... 33 | } 34 | ... 35 | } 36 | ``` 37 | Due to JSON format restrictions, the directory separator must be written in Linux style `/` or escaped `\\`. 38 | 39 | ## Linux 40 | Linux usage is almost the same as Windows except we don't provide binaries and you must build the plugin from the source and the name of the plugin is different **`libxmrig-cuda.so`**. 41 | 42 | ## macOS 43 | CUDA no longer supports macOS, which means that the plugin also does not support it. 44 | -------------------------------------------------------------------------------- /src/RandomX/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Copyright (c) 2019 SChernykh 5 | 6 | This file is part of RandomX CUDA. 
7 | 8 | RandomX CUDA is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | RandomX CUDA is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with RandomX CUDA. If not, see. 20 | */ 21 | 22 | 23 | #include 24 | 25 | 26 | #define RANDOMX_DATASET_ITEM_SIZE 64 27 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 28 | #define RANDOMX_JUMP_BITS 8 29 | #define RANDOMX_JUMP_OFFSET 8 30 | 31 | 32 | namespace randomx { 33 | constexpr int mantissaSize = 52; 34 | constexpr int exponentSize = 11; 35 | constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; 36 | constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; 37 | constexpr int exponentBias = 1023; 38 | constexpr int dynamicExponentBits = 4; 39 | constexpr int staticExponentBits = 4; 40 | constexpr uint64_t constExponentBits = 0x300; 41 | constexpr uint64_t dynamicMantissaMask = (1ULL << (mantissaSize + dynamicExponentBits)) - 1; 42 | 43 | constexpr int RegistersCount = 8; 44 | constexpr int RegisterCountFlt = RegistersCount / 2; 45 | constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register 46 | 47 | constexpr int CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; 48 | constexpr uint32_t DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE; 49 | 50 | constexpr uint32_t ConditionMask = ((1 << RANDOMX_JUMP_BITS) - 1); 51 | constexpr int ConditionOffset = RANDOMX_JUMP_OFFSET; 52 | constexpr int StoreL3Condition = 14; 53 | } 54 | 55 | #include "blake2b_cuda.hpp" 56 | #include "aes_cuda.hpp" 57 | #include "randomx_cuda.hpp" 58 | #include 
"hash.hpp" 59 | -------------------------------------------------------------------------------- /src/cuda_fast_int_math_v2.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | // get_reciprocal: approximate fixed-point reciprocal of a 32-bit divisor, built from rcp.approx.f32 on the top 24 bits plus one FMA-based correction for the low 8 bits. Feeds fast_div_v2 below. 5 | __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a) 6 | { 7 | const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23)); // top 24 bits of 'a' packed into a float via a biased exponent 8 | const float a_lo = __uint2float_rn(a & 0xFF); // low 8 bits corrected separately 9 | 10 | float r; 11 | asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi)); 12 | const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23)); // r * 2^64 done by bumping the exponent field 13 | 14 | const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f)); // h = a*r - 1: residual error of the approximate reciprocal 15 | return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled); // coarse reciprocal minus the scaled residual 16 | } 17 | // fast_div_v2: 64-by-32-bit division; packs the quotient into the low 32 bits and the remainder into the high 32 bits of the return value (q[0] = quotient, q[1] = remainder). 18 | __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) 19 | { 20 | const uint32_t r = get_reciprocal(b); 21 | const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; // quotient estimate accumulates in the high word of k 22 | 23 | uint32_t q[2]; 24 | q[0] = ((uint32_t*)&k)[1]; // candidate quotient 25 | 26 | int64_t tmp = a - (uint64_t)(q[0]) * b; // candidate remainder 27 | ((int32_t*)(&tmp))[1] -= (k < a) ? b : 0; // (k < a) detects 64-bit wrap of k; compensate the remainder's high word. NOTE(review): relies on two's-complement word aliasing 28 | 29 | const bool overshoot = ((int32_t*)(&tmp))[1] < 0; // remainder went negative -> quotient one too high 30 | const bool undershoot = tmp >= b; // remainder still >= b -> quotient one too low 31 | 32 | q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); // +/-1 quotient correction 33 | q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U); // matching remainder correction 34 | 35 | return *((uint64_t*)(q)); 36 | } 37 | // fast_sqrt_v2: integer square root helper; seeds with sqrt.approx.f32/rsqrt.approx.f32 on the high word of n1, applies one linear refinement, then fixes the result by +/-1 using exact 64-bit integer checks. 38 | __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) 39 | { 40 | float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); // high word of n1 as a float with a rebased exponent 41 | float x1; 42 | asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); 43 | asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); 44 | 45 | // The following line does x1 *= 4294967296.0f; 46 | x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); 47 | 48 | const uint32_t x0 = __float_as_uint(x) - (158U << 23); // coarse sqrt estimate (exponent rebased to integer range) 49 | const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); // exact scaled residual n1 - x0^2 50 | const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; // linear refinement term: residual * 1/(2*sqrt) scale 51 | 52 | uint32_t result = (x0 << 10) + __float2int_rn(delta); 53 | const uint32_t s = result >> 1; 54 | const uint32_t b = result & 1; 55 | 56 | const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; // exact check of the candidate 57 | if ((int64_t)(x2 + b) > 0) --result; // candidate one too high 58 | if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result; // candidate one too low 59 | 60 | return result; 61 | } 62 | -------------------------------------------------------------------------------- /src/crypto/common/Algorithm.cpp: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018 Lee Clagett 3 | * Copyright (c) 2018-2021 SChernykh 4 | * Copyright (c) 2016-2021 XMRig , 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details.
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "crypto/common/Algorithm.h" 21 | 22 | 23 | #include 24 | 25 | 26 | xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id) 27 | { 28 | static const std::set ids = { 29 | CN_0, CN_1, CN_2, CN_FAST, CN_HALF, CN_XAO, CN_RTO, CN_RWZ, CN_ZLS, CN_DOUBLE, CN_CCX, 30 | # ifdef XMRIG_ALGO_CN_R 31 | CN_R, 32 | # endif 33 | # ifdef XMRIG_ALGO_CN_LITE 34 | CN_LITE_0, CN_LITE_1, 35 | # endif 36 | # ifdef XMRIG_ALGO_CN_HEAVY 37 | CN_HEAVY_0, CN_HEAVY_TUBE, CN_HEAVY_XHV, 38 | # endif 39 | # ifdef XMRIG_ALGO_CN_PICO 40 | CN_PICO_0, CN_PICO_TLO, 41 | # endif 42 | # ifdef XMRIG_ALGO_CN_FEMTO 43 | CN_UPX2, 44 | # endif 45 | # ifdef XMRIG_ALGO_CN_GPU 46 | CN_GPU, 47 | # endif 48 | # ifdef XMRIG_ALGO_RANDOMX 49 | RX_XLA, 50 | RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA, RX_YADA, 51 | # endif 52 | # ifdef XMRIG_ALGO_ARGON2 53 | AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ, 54 | # endif 55 | # ifdef XMRIG_ALGO_KAWPOW 56 | KAWPOW_RVN, 57 | # endif 58 | }; 59 | 60 | return ids.count(id) ? static_cast(id) : INVALID; 61 | } 62 | -------------------------------------------------------------------------------- /src/RandomX/randomx.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX CUDA. 5 | 6 | RandomX CUDA is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX CUDA is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 
15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX CUDA. If not, see. 18 | */ 19 | 20 | 21 | #include "cryptonight.h" 22 | #include "cuda_device.hpp" 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | 30 | void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size) 31 | { 32 | ctx->rx_batch_size = batch_size; 33 | ctx->d_scratchpads_size = batch_size * (ctx->algorithm.l3() + 64); 34 | 35 | if (ctx->rx_dataset_host > 0) { 36 | CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->d_rx_dataset, const_cast(dataset), 0)); 37 | } 38 | else { 39 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_dataset, dataset_size)); 40 | CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, dataset, dataset_size, cudaMemcpyHostToDevice)); 41 | } 42 | 43 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, ctx->d_scratchpads_size)); 44 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_hashes, batch_size * 64)); 45 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_entropy, batch_size * (128 + 2560))); 46 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_vm_states, batch_size * 2560)); 47 | CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_rounding, batch_size * sizeof(uint32_t))); 48 | } 49 | 50 | 51 | void randomx_update_dataset(nvid_ctx* ctx, const void* dataset, size_t dataset_size) 52 | { 53 | if (ctx->rx_dataset_host > 0) { 54 | return; 55 | } 56 | 57 | CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, dataset, dataset_size, cudaMemcpyHostToDevice)); 58 | } 59 | -------------------------------------------------------------------------------- /src/cuda_device.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #define CUDA_THROW(error) throw std::runtime_error(std::string("<") + __FUNCTION__ + ">:" + std::to_string(__LINE__) + " \"" + (error) + 
"\"") 10 | 11 | 12 | /** execute and check a CUDA api command 13 | * 14 | * @param id gpu id (thread id) 15 | * @param ... CUDA api command 16 | */ 17 | #define CUDA_CHECK(id, ...) { \ 18 | cudaError_t error = __VA_ARGS__; \ 19 | if (error != cudaSuccess){ \ 20 | CUDA_THROW(cudaGetErrorString(error)); \ 21 | } \ 22 | } \ 23 | ( (void) 0 ) 24 | 25 | /** execute and check a CUDA kernel 26 | * 27 | * @param id gpu id (thread id) 28 | * @param ... CUDA kernel call 29 | */ 30 | #define CUDA_CHECK_KERNEL(id, ...) \ 31 | __VA_ARGS__; \ 32 | CUDA_CHECK(id, cudaGetLastError()) 33 | 34 | #if defined(XMRIG_ALGO_KAWPOW) || defined(XMRIG_ALGO_CN_R) 35 | #define CU_CHECK(id, ...) { \ 36 | CUresult result = __VA_ARGS__; \ 37 | if(result != CUDA_SUCCESS){ \ 38 | const char* s; \ 39 | cuGetErrorString(result, &s); \ 40 | CUDA_THROW(s ? s : "unknown error"); \ 41 | } \ 42 | } \ 43 | ( (void) 0 ) 44 | #endif 45 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_deprecated.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define CUB_DEPRECATED macro. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_compiler.cuh" 36 | 37 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 38 | # define CUB_DEPRECATED __declspec(deprecated) 39 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 40 | # define CUB_DEPRECATED __attribute__((deprecated)) 41 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 42 | # define CUB_DEPRECATED __attribute__((deprecated)) 43 | #else 44 | # define CUB_DEPRECATED 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "version.cuh" 37 | 38 | // For example: 39 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 40 | //#define CUB_NS_POSTFIX } } 41 | 42 | #ifndef CUB_NS_PREFIX 43 | #define CUB_NS_PREFIX 44 | #endif 45 | 46 | #ifndef CUB_NS_POSTFIX 47 | #define CUB_NS_POSTFIX 48 | #endif 49 | 50 | // Declare these namespaces here for the purpose of Doxygenating them 51 | 52 | /*! \namespace cub 53 | * \brief \p cub is the top-level namespace which contains all CUB 54 | * functions and types. 55 | */ 56 | namespace cub 57 | { 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/3rdparty/cub/version.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file version.h 29 | * \brief Compile-time macros encoding CUB release version 30 | * 31 | * is the only CUB header that is guaranteed to 32 | * change with every CUB release. 33 | * 34 | */ 35 | 36 | #pragma once 37 | 38 | /*! \def CUB_VERSION 39 | * \brief The preprocessor macro \p CUB_VERSION encodes the version 40 | * number of the CUB library. 41 | * 42 | * CUB_VERSION % 100 is the sub-minor version. 43 | * CUB_VERSION / 100 % 1000 is the minor version. 44 | * CUB_VERSION / 100000 is the major version. 45 | */ 46 | #define CUB_VERSION 101000 47 | 48 | /*! \def CUB_MAJOR_VERSION 49 | * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the 50 | * major version number of the CUB library. 51 | */ 52 | #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) 53 | 54 | /*! \def CUB_MINOR_VERSION 55 | * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the 56 | * minor version number of the CUB library. 57 | */ 58 | #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) 59 | 60 | /*! 
\def CUB_SUBMINOR_VERSION 61 | * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the 62 | * sub-minor version number of the CUB library. 63 | */ 64 | #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) 65 | 66 | /*! \def CUB_PATCH_NUMBER 67 | * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the 68 | * patch number of the CUB library. 69 | */ 70 | #define CUB_PATCH_NUMBER 0 71 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_compiler.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Detect compiler information. 31 | */ 32 | 33 | #pragma once 34 | 35 | // enumerate host compilers we know about 36 | #define CUB_HOST_COMPILER_UNKNOWN 0 37 | #define CUB_HOST_COMPILER_MSVC 1 38 | #define CUB_HOST_COMPILER_GCC 2 39 | #define CUB_HOST_COMPILER_CLANG 3 40 | 41 | // enumerate device compilers we know about 42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0 43 | #define CUB_DEVICE_COMPILER_MSVC 1 44 | #define CUB_DEVICE_COMPILER_GCC 2 45 | #define CUB_DEVICE_COMPILER_NVCC 3 46 | #define CUB_DEVICE_COMPILER_CLANG 4 47 | 48 | // figure out which host compiler we're using 49 | #if defined(_MSC_VER) 50 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC 51 | # define CUB_MSVC_VERSION _MSC_VER 52 | # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER 53 | #elif defined(__clang__) 54 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG 55 | # define CUB_CLANG_VERSION \ 56 | (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) 57 | #elif defined(__GNUC__) 58 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC 59 | # define CUB_GCC_VERSION \ 60 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 61 | #else 62 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN 63 | #endif // CUB_HOST_COMPILER 64 | 65 | // figure out which device compiler we're using 66 | #if defined(__CUDACC__) 67 | # 
define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 69 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC 70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 71 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC 72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 73 | // CUDA-capable clang should behave similar to NVCC. 74 | # if defined(__CUDA__) 75 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 76 | # else 77 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG 78 | # endif 79 | #else 80 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN 81 | #endif 82 | -------------------------------------------------------------------------------- /src/cuda_extra.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __INTELLISENSE__ 4 | #define __CUDA_ARCH__ 520 5 | /* avoid red underlining */ 6 | 7 | struct uint3 8 | { 9 | unsigned int x, y, z; 10 | }; 11 | 12 | struct uint3 threadIdx; 13 | struct uint3 blockIdx; 14 | struct uint3 blockDim; 15 | #define __funnelshift_r(a,b,c) 1 16 | #define __syncthreads() 17 | #define asm(x) 18 | #define __shfl(a,b,c) 1 19 | #endif 20 | 21 | #define AES_BLOCK_SIZE 16 22 | #define AES_KEY_SIZE 32 23 | #define INIT_SIZE_BLK 8 24 | #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B 25 | 26 | #define C32(x) ((uint32_t)(x ## U)) 27 | #define T32(x) ((x) & C32(0xFFFFFFFF)) 28 | 29 | #if __CUDA_ARCH__ >= 350 30 | __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) 31 | { 32 | uint2 result; 33 | if(offset >= 32) 34 | { 35 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 36 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), 
"r"(offset)); 37 | } 38 | else 39 | { 40 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 41 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 42 | } 43 | return __double_as_longlong(__hiloint2double(result.y, result.x)); 44 | } 45 | #define ROTL64(x, n) (cuda_ROTL64(x, n)) 46 | #else 47 | #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) 48 | #endif 49 | 50 | #if __CUDA_ARCH__ < 350 51 | #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) 52 | #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 53 | #else 54 | #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) 55 | #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) 56 | #endif 57 | 58 | #define MEMSET8(dst,what,cnt) { \ 59 | int i_memset8; \ 60 | uint64_t *out_memset8 = (uint64_t *)(dst); \ 61 | for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ 62 | out_memset8[i_memset8] = (what); } 63 | 64 | #define MEMSET4(dst,what,cnt) { \ 65 | int i_memset4; \ 66 | uint32_t *out_memset4 = (uint32_t *)(dst); \ 67 | for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ 68 | out_memset4[i_memset4] = (what); } 69 | 70 | #define MEMCPY8(dst,src,cnt) { \ 71 | int i_memcpy8; \ 72 | uint64_t *in_memcpy8 = (uint64_t *)(src); \ 73 | uint64_t *out_memcpy8 = (uint64_t *)(dst); \ 74 | for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ 75 | out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } 76 | 77 | #define MEMCPY4(dst,src,cnt) { \ 78 | int i_memcpy4; \ 79 | uint32_t *in_memcpy4 = (uint32_t *)(src); \ 80 | uint32_t *out_memcpy4 = (uint32_t *)(dst); \ 81 | for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ 82 | out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } 83 | 84 | #define XOR_BLOCKS(a,b) { \ 85 | ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \ 86 | ((uint64_t 
*)a)[1] ^= ((uint64_t *)b)[1]; } 87 | 88 | #define XOR_BLOCKS_DST(x,y,z) { \ 89 | ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ 90 | ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } 91 | 92 | #define MUL_SUM_XOR_DST(a,c,dst) { \ 93 | const uint64_t dst0 = ((uint64_t *)dst)[0]; \ 94 | uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \ 95 | hi += ((uint64_t *)c)[0]; \ 96 | ((uint64_t *)c)[0] = dst0 ^ hi; \ 97 | ((uint64_t *)dst)[0] = hi; \ 98 | ((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \ 99 | } 100 | 101 | #define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) 102 | 103 | -------------------------------------------------------------------------------- /cmake/flags.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 2 | set(CMAKE_CXX_EXTENSIONS OFF) 3 | set(CMAKE_CXX_STANDARD 11) 4 | 5 | set(CMAKE_C_STANDARD 99) 6 | set(CMAKE_C_STANDARD_REQUIRED ON) 7 | 8 | if ("${CMAKE_BUILD_TYPE}" STREQUAL "") 9 | set(CMAKE_BUILD_TYPE Release) 10 | endif() 11 | 12 | if (CMAKE_BUILD_TYPE STREQUAL "Release") 13 | add_definitions(/DNDEBUG) 14 | endif() 15 | 16 | include(CheckSymbolExists) 17 | 18 | if (CMAKE_CXX_COMPILER_ID MATCHES GNU) 19 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-strict-aliasing -fPIC") 20 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fexceptions -fno-rtti -Wno-strict-aliasing -Wno-class-memaccess -fPIC") 23 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -s") 24 | 25 | if (XMRIG_ARMv8) 26 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS} -flax-vector-conversions") 28 | elseif (XMRIG_ARMv7) 29 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
-mfpu=neon -flax-vector-conversions") 31 | else() 32 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") 33 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") 34 | 35 | add_definitions(/DHAVE_ROTR) 36 | endif() 37 | 38 | if (WIN32) 39 | if (CMAKE_SIZEOF_VOID_P EQUAL 8) 40 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static") 41 | else() 42 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static -Wl,--large-address-aware") 43 | endif() 44 | else() 45 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") 46 | endif() 47 | 48 | add_definitions(/D_GNU_SOURCE) 49 | 50 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 52 | 53 | add_definitions(/DHAVE_BUILTIN_CLEAR_CACHE) 54 | 55 | elseif (CMAKE_CXX_COMPILER_ID MATCHES MSVC) 56 | 57 | set(CMAKE_C_FLAGS_RELEASE "/MP /MT /O2 /Ob2 /DNDEBUG") 58 | set(CMAKE_CXX_FLAGS_RELEASE "/MP /MT /O2 /Ob2 /DNDEBUG") 59 | add_definitions(/D_CRT_SECURE_NO_WARNINGS) 60 | add_definitions(/D_CRT_NONSTDC_NO_WARNINGS) 61 | add_definitions(/DNOMINMAX) 62 | add_definitions(/DHAVE_ROTR) 63 | 64 | elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) 65 | 66 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") 67 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2 -funroll-loops -fmerge-all-constants") 68 | 69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fexceptions -fno-rtti -Wno-missing-braces") 70 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -funroll-loops -fmerge-all-constants") 71 | 72 | if (XMRIG_ARMv8) 73 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") 74 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS}") 75 | elseif (XMRIG_ARMv7) 76 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") 77 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") 78 | else() 79 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") 80 | set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -maes") 81 | 82 | check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR) 83 | if (HAVE_ROTR) 84 | add_definitions(/DHAVE_ROTR) 85 | endif() 86 | endif() 87 | 88 | if (XMRIG_OS_APPLE) 89 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 90 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 91 | endif() 92 | 93 | endif() 94 | 95 | if (NOT WIN32) 96 | check_symbol_exists("__builtin___clear_cache" "stdlib.h" HAVE_BUILTIN_CLEAR_CACHE) 97 | if (HAVE_BUILTIN_CLEAR_CACHE) 98 | add_definitions(/DHAVE_BUILTIN_CLEAR_CACHE) 99 | endif() 100 | endif() 101 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(xmrig-cuda) 3 | include(cmake/CUDA-Version.cmake) 4 | 5 | 6 | option(WITH_DRIVER_API "Enable CUDA Driver API and NVRTC, required for cn/r and kawpow algorithms" ON) 7 | 8 | # Algorithm selection 9 | option(WITH_CN_R "Enable CryptoNight-R algorithm" ON) 10 | option(WITH_CN_LITE "Enable CryptoNight-Lite algorithms family" ON) 11 | option(WITH_CN_HEAVY "Enable CryptoNight-Heavy algorithms family" ON) 12 | option(WITH_CN_PICO "Enable CryptoNight-Pico algorithm" ON) 13 | option(WITH_CN_FEMTO "Enable CryptoNight-UPX2 algorithm" ON) 14 | option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON) 15 | option(WITH_ARGON2 "Enable Argon2 algorithms family" OFF) #unsupported 16 | 17 | if (CUDA_VERSION VERSION_LESS 9.0) 18 | message(STATUS "CUDA ${CUDA_VERSION}: RandomX and KawPow disabled, they do not work with old CUDA") 19 | option(WITH_RANDOMX "Enable RandomX algorithms family" OFF) 20 | option(WITH_KAWPOW "Enable KawPow algorithms family" OFF) 21 | else() 22 | option(WITH_RANDOMX "Enable RandomX algorithms family" ON) 23 | option(WITH_KAWPOW "Enable KawPow algorithms family" ON) 24 | endif() 25 | 26 | if (WITH_CN_LITE) 27 | add_definitions(/DXMRIG_ALGO_CN_LITE) 28 | endif() 29 | 30 | if 
(WITH_CN_HEAVY) 31 | add_definitions(/DXMRIG_ALGO_CN_HEAVY) 32 | endif() 33 | 34 | if (WITH_CN_PICO) 35 | add_definitions(/DXMRIG_ALGO_CN_PICO) 36 | endif() 37 | 38 | if (WITH_CN_FEMTO) 39 | add_definitions(/DXMRIG_ALGO_CN_FEMTO) 40 | endif() 41 | 42 | if (WITH_CN_GPU) 43 | add_definitions(/DXMRIG_ALGO_CN_GPU) 44 | endif() 45 | 46 | if (WITH_RANDOMX) 47 | add_definitions(/DXMRIG_ALGO_RANDOMX) 48 | endif() 49 | 50 | if (WITH_ARGON2) 51 | add_definitions(/DXMRIG_ALGO_ARGON2) 52 | endif() 53 | 54 | if (WITH_KAWPOW) 55 | if (WITH_DRIVER_API) 56 | add_definitions(/DXMRIG_ALGO_KAWPOW) 57 | else() 58 | set(WITH_KAWPOW OFF) 59 | message(STATUS "CUDA ${CUDA_VERSION}: KawPow disabled, requires WITH_DRIVER_API=ON for CUDA Driver API and NVRTC") 60 | endif() 61 | endif() 62 | 63 | if (WITH_CN_R) 64 | if (WITH_DRIVER_API) 65 | add_definitions(/DXMRIG_ALGO_CN_R) 66 | else() 67 | set(WITH_CN_R OFF) 68 | message(STATUS "CUDA ${CUDA_VERSION}: CryptoNight-R disabled, requires WITH_DRIVER_API=ON for CUDA Driver API and NVRTC") 69 | endif() 70 | endif() 71 | 72 | 73 | include_directories(src) 74 | add_definitions(/DCUB_IGNORE_DEPRECATED_CPP_DIALECT) 75 | 76 | 77 | include(cmake/cpu.cmake) 78 | include(cmake/os.cmake) 79 | include(cmake/flags.cmake) 80 | include(cmake/CUDA.cmake) 81 | 82 | 83 | set(SOURCES 84 | src/crypto/cn/c_blake256.c 85 | src/crypto/common/Algorithm.cpp 86 | src/crypto/common/Algorithm.h 87 | src/version.h 88 | src/xmrig-cuda.cpp 89 | src/xmrig-cuda.h 90 | ) 91 | 92 | 93 | if (WITH_DRIVER_API AND WITH_CN_R) 94 | list(APPEND SOURCES src/CudaCryptonightR_gen.cpp) 95 | endif() 96 | 97 | if (XMRIG_OS_WIN) 98 | list(APPEND SOURCES res/app.rc) 99 | endif() 100 | 101 | if (XMRIG_OS_APPLE) 102 | cmake_policy(SET CMP0042 NEW) 103 | endif() 104 | 105 | add_library(${CMAKE_PROJECT_NAME} SHARED ${SOURCES}) 106 | target_link_libraries(${CMAKE_PROJECT_NAME} xmrig-cu ${LIBS}) 107 | 108 | if (WITH_DRIVER_API AND WIN32) 109 | if (CUDA_VERSION VERSION_LESS 10.0) 110 | file(GLOB 
NVRTCDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc64*.dll") 111 | else() 112 | file(GLOB NVRTCDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc64*_0.dll") 113 | endif() 114 | 115 | add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD 116 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${NVRTCDLL}" $) 117 | 118 | file(GLOB NVRTCBUILTINDLL "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvrtc-builtins64*.dll") 119 | add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD 120 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${NVRTCBUILTINDLL}" $) 121 | endif() 122 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /src/xmrig-cuda.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright 2010 Jeff Garzik 3 | * Copyright 2012-2014 pooler 4 | * Copyright 2014 Lucas Jones 5 | * Copyright 2014-2016 Wolf9466 6 | * Copyright 2016 Jay D Dee 7 | * Copyright 2017-2018 XMR-Stak , 8 | * Copyright 2018-2020 SChernykh 9 | * Copyright 2016-2020 XMRig , 10 | * 11 | * This program is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 
15 | * 16 | * This program is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 | */ 24 | 25 | #ifndef XMRIG_CUDA_H 26 | #define XMRIG_CUDA_H 27 | 28 | 29 | #include 30 | #include 31 | 32 | 33 | #if defined _WIN32 || defined __CYGWIN__ 34 | # define XMRIG_EXPORT __declspec(dllexport) 35 | # define XMRIG_HIDDEN 36 | #else 37 | # define XMRIG_EXPORT __attribute__ ((visibility ("default"))) 38 | # define XMRIG_HIDDEN __attribute__ ((visibility ("hidden"))) 39 | #endif 40 | 41 | 42 | using nvid_ctx = struct nvid_ctx; 43 | 44 | 45 | enum Version : uint32_t 46 | { 47 | ApiVersion, 48 | DriverVersion, 49 | RuntimeVersion 50 | }; 51 | 52 | 53 | enum DeviceProperty : uint32_t 54 | { 55 | DeviceId, 56 | DeviceAlgorithm, 57 | DeviceArchMajor, 58 | DeviceArchMinor, 59 | DeviceSmx, 60 | DeviceBlocks, 61 | DeviceThreads, 62 | DeviceBFactor, 63 | DeviceBSleep, 64 | DeviceClockRate, 65 | DeviceMemoryClockRate, 66 | DeviceMemoryTotal, 67 | DeviceMemoryFree, 68 | DevicePciBusID, 69 | DevicePciDeviceID, 70 | DevicePciDomainID, 71 | DeviceDatasetHost, 72 | }; 73 | 74 | 75 | #if defined(__cplusplus) 76 | extern "C" { 77 | #endif 78 | 79 | 80 | XMRIG_EXPORT bool cnHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t height, uint64_t target, uint32_t *rescount, uint32_t *resnonce); 81 | XMRIG_EXPORT bool deviceInfo(nvid_ctx *ctx, int32_t blocks, int32_t threads, uint32_t algo, int32_t dataset_host); 82 | XMRIG_EXPORT bool deviceInit(nvid_ctx *ctx); 83 | XMRIG_EXPORT bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce); 84 | XMRIG_EXPORT bool rxPrepare(nvid_ctx *ctx, const void *dataset, size_t datasetSize, bool dataset_host, 
uint32_t batchSize); 85 | XMRIG_EXPORT bool rxUpdateDataset(nvid_ctx* ctx, const void* dataset, size_t datasetSize); 86 | XMRIG_EXPORT bool kawPowHash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes); 87 | XMRIG_EXPORT bool kawPowPrepare_v2(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes); 88 | XMRIG_EXPORT bool kawPowStopHash(nvid_ctx *ctx); 89 | XMRIG_EXPORT bool setJob(nvid_ctx *ctx, const void *data, size_t size, uint32_t algo); 90 | XMRIG_EXPORT const char *deviceName(nvid_ctx *ctx); 91 | XMRIG_EXPORT const char *lastError(nvid_ctx *ctx); 92 | XMRIG_EXPORT const char *pluginVersion(); 93 | XMRIG_EXPORT int32_t deviceInt(nvid_ctx *ctx, DeviceProperty property); 94 | XMRIG_EXPORT nvid_ctx *alloc(uint32_t id, int32_t bfactor, int32_t bsleep); 95 | XMRIG_EXPORT uint32_t deviceCount(); 96 | XMRIG_EXPORT uint32_t deviceUint(nvid_ctx *ctx, DeviceProperty property); 97 | XMRIG_EXPORT uint32_t version(Version version); 98 | XMRIG_EXPORT uint64_t deviceUlong(nvid_ctx *ctx, DeviceProperty property); 99 | XMRIG_EXPORT void init(); 100 | XMRIG_EXPORT void release(nvid_ctx *ctx); 101 | 102 | 103 | #if defined(__cplusplus) 104 | } 105 | #endif 106 | 107 | 108 | #endif /* XMRIG_CUDA_H */ 109 | -------------------------------------------------------------------------------- /src/RandomX/hash.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | Copyright (c) 2019 SChernykh 5 | 6 | This file is part of RandomX CUDA. 7 | 8 | RandomX CUDA is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 
12 | 13 | RandomX CUDA is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with RandomX CUDA. If not, see <http://www.gnu.org/licenses/>. 20 | */ 21 | 22 | __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* shares) 23 | { 24 | const uint32_t global_index = blockIdx.x * blockDim.x + threadIdx.x; 25 | const uint64_t* p = (const uint64_t*)hashes; 26 | 27 | if (p[global_index * 4 + 3] < target) { 28 | const uint32_t idx = atomicInc(shares, 0xFFFFFFFF) + 1; 29 | if (idx < 10) { 30 | shares[idx] = global_index; 31 | } 32 | } 33 | } 34 | 35 | void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size) 36 | { 37 | if (ctx->algorithm.id() == xmrig_cuda::Algorithm::RX_XLA) { 38 | // sipesh(tempHash, sizeof(tempHash), input, inputSize, input, inputSize, 0, 0); 39 | // CUDA_CHECK_KERNEL(ctx->device_id, sipesh<<>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 40 | // k12(input, inputSize, tempHash); 41 | // CUDA_CHECK_KERNEL(ctx->device_id, k12<<>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 42 | } else if (ctx->inputlen <= 128) { 43 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 44 | } 45 | else if (ctx->inputlen <= 256) { 46 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce)); 47 | } 48 | else { 49 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << > > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset)); 50 | } 51 | 52 | CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<<>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size)); 53 | 
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_rx_rounding, 0, batch_size * sizeof(uint32_t))); 54 | 55 | for (size_t i = 0; i < RANDOMX_PROGRAM_COUNT; ++i) { 56 | CUDA_CHECK_KERNEL(ctx->device_id, fillAes4Rx4<<>>(ctx->d_rx_hashes, ctx->d_rx_entropy, batch_size)); 57 | 58 | CUDA_CHECK_KERNEL(ctx->device_id, init_vm<8><<>>(ctx->d_rx_entropy, ctx->d_rx_vm_states)); 59 | for (int j = 0, n = 1 << ctx->device_bfactor; j < n; ++j) { 60 | CUDA_CHECK_KERNEL(ctx->device_id, execute_vm<8, false><<>>(ctx->d_rx_vm_states, ctx->d_rx_rounding, ctx->d_long_state, ctx->d_rx_dataset, batch_size, RANDOMX_PROGRAM_ITERATIONS >> ctx->device_bfactor, j == 0, j == n - 1)); 61 | } 62 | 63 | if (i == RANDOMX_PROGRAM_COUNT - 1) { 64 | CUDA_CHECK_KERNEL(ctx->device_id, hashAes1Rx4<<>>(ctx->d_long_state, ctx->d_rx_vm_states, batch_size)); 65 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_hash_registers<<>>(ctx->d_rx_hashes, ctx->d_rx_vm_states)); 66 | } else { 67 | CUDA_CHECK_KERNEL(ctx->device_id, blake2b_hash_registers<<>>(ctx->d_rx_hashes, ctx->d_rx_vm_states)); 68 | } 69 | } 70 | 71 | CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, 10 * sizeof(uint32_t))); 72 | CUDA_CHECK_KERNEL(ctx->device_id, find_shares<<>>(ctx->d_rx_hashes, target, ctx->d_result_nonce)); 73 | CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize()); 74 | 75 | CUDA_CHECK(ctx->device_id, cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 76 | 77 | *rescount = resnonce[0]; 78 | if (*rescount > 9) { 79 | *rescount = 9; 80 | } 81 | 82 | for (uint32_t i = 0; i < *rescount; i++) { 83 | resnonce[i] = resnonce[i + 1] + nonce; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v6.22.1 2 | - [#205](https://github.com/xmrig/xmrig-cuda/pull/205) Fixed RandomX dataset update. Fix works together with the updated XMRig. 
3 | 4 | # v6.22.0 5 | - [#201](https://github.com/xmrig/xmrig-cuda/pull/201) Added support for [Yada](https://yadacoin.io/) (`rx/yada` algorithm). 6 | 7 | # v6.21.1 8 | - The binary downloads now only support the latest version of each major CUDA release. 9 | - Improved build speed with CUDA 11.3 or higher. 10 | 11 | # v6.21.0 12 | - [#167](https://github.com/xmrig/xmrig-cuda/pull/167) Removed deprecated AstroBWTv1 and v2. 13 | - [#176](https://github.com/xmrig/xmrig-cuda/pull/176) Added CUDA 12 support. 14 | - [#191](https://github.com/xmrig/xmrig-cuda/pull/191) Fixed Zephyr mining. 15 | 16 | # v6.17.0 17 | - [#157](https://github.com/xmrig/xmrig-cuda/pull/157) Added Dero HE (`astrobwt/v2`) support. 18 | 19 | # v6.15.1 20 | - [#119](https://github.com/xmrig/xmrig-cuda/issues/119) Fixed compile error on Linux. 21 | - [#124](https://github.com/xmrig/xmrig-cuda/pull/124) Fixed `"out of memory"` error on non-CryptoNight algorithms. 22 | - [#125](https://github.com/xmrig/xmrig-cuda/pull/125) Fixed `"invalid argument"` error. 23 | 24 | # v6.15.0 25 | - **ABI changed, minimum supported XMRig version now is 6.15.0.** 26 | - [#2563](https://github.com/xmrig/xmrig/pull/2563) Added new algorithm RandomX Graft (`rx/graft`). 27 | - [#104](https://github.com/xmrig/xmrig-cuda/pull/104) Fixed build on macOS 10.13 (last supported for CUDA). 28 | 29 | # v6.12.0 30 | - [#95](https://github.com/xmrig/xmrig-cuda/pull/95) Added support for Uplexa (`cn/upx2` algorithm). 31 | 32 | # v6.5.0 33 | - [#74](https://github.com/xmrig/xmrig-cuda/pull/74) Fixed CUDA 8.0 support, RandomX, AstroBWT, and KawPow disabled for this CUDA version. 34 | - [#76](https://github.com/xmrig/xmrig-cuda/pull/76) Fixed high CPU usage on Cryptonight and AstroBWT. 35 | - Removed legacy API and added version information on Windows. 36 | 37 | # v6.4.1 38 | - [#72](https://github.com/xmrig/xmrig-cuda/issues/72) Fixed broken KawPow on Linux. 
39 | 40 | # v6.4.0 41 | - [#70](https://github.com/xmrig/xmrig-cuda/pull/70) RandomX: removed `rx/loki` algorithm. 42 | - Added CMake option `-DWITH_DRIVER_API=OFF` to disable CUDA Driver API and NVRTC, required for `cn/r` and `kawpow` algorithms. 43 | 44 | # v6.3.2 45 | - [#65](https://github.com/xmrig/xmrig-cuda/pull/65) Fixed broken AstroBWT. 46 | 47 | # v6.3.1 48 | - [#62](https://github.com/xmrig/xmrig-cuda/pull/62) Fixed broken RandomX (regression since v6.2.1). 49 | 50 | # v6.3.0 51 | - [#59](https://github.com/xmrig/xmrig-cuda/pull/59) Added support for upcoming Haven offshore fork. 52 | - Fixed build with recent CUDA 11. 53 | 54 | # v6.2.1 55 | - [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization. 56 | - [#55](https://github.com/xmrig/xmrig-cuda/pull/55) Added fast job switching for KawPow, almost zero stale shares. 57 | 58 | # v6.2.0 59 | - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal. 60 | - [#53](https://github.com/xmrig/xmrig-cuda/pull/53) Fixed build with CUDA 11. 61 | 62 | # v6.1.0 63 | - [#48](https://github.com/xmrig/xmrig-cuda/pull/48) Optimized AstroBWT, approximately 3 times faster. 64 | - [#51](https://github.com/xmrig/xmrig-cuda/pull/51) Reduced memory usage for KawPow. 65 | 66 | # v6.0.0 67 | - [#1694](https://github.com/xmrig/xmrig/pull/1694) Added support for KawPow algorithm (Ravencoin) on AMD/NVIDIA. 68 | 69 | # v3.0.0 70 | - **ABI changed, minimum supported XMRig version now is 5.11.0.** 71 | - [#41](https://github.com/xmrig/xmrig-cuda/pull/41) Added AstroBWT algorithm support. 72 | 73 | # v2.2.0 74 | - [#1578](https://github.com/xmrig/xmrig/pull/1578) Added new `rx/keva` algorithm for upcoming Kevacoin fork. 75 | 76 | # v2.1.0 77 | - [#1466](https://github.com/xmrig/xmrig/pull/1466) Added `cn-pico/tlo` algorithm. 78 | - Added alternative relaxed API (algorithm passed as string). 
79 | 80 | # v2.0.2 81 | - [#27](https://github.com/xmrig/xmrig-cuda/pull/27) Added RandomSFX (`rx/sfx`) algorithm for Safex Cash. 82 | - [#28](https://github.com/xmrig/xmrig-cuda/pull/28) Added RandomV (`rx/v`) algorithm for *new* MoneroV. 83 | 84 | # v2.0.1-beta 85 | - [#10](https://github.com/xmrig/xmrig-cuda/pull/10) Fixed compatibility with CUDA 8, RandomX support not tested and potentially broken with this CUDA version. 86 | - [#1276](https://github.com/xmrig/xmrig/issues/1276) Fixed maximum threads count. 87 | 88 | # v2.0.0-beta 89 | - **ABI changed, minimum supported XMRig version now is 4.6.0.** 90 | - [#5](https://github.com/xmrig/xmrig-cuda/pull/5) Optimized RandomX. 91 | - [#6](https://github.com/xmrig/xmrig-cuda/issues/6) Fixed compatibility with some old systems. 92 | - [#7](https://github.com/xmrig/xmrig-cuda/pull/7) Added support for option `dataset_host` for 2 GB GPUs. 93 | - [#8](https://github.com/xmrig/xmrig-cuda/pull/8) RandomX: fixed random kernel launch errors with some configurations. 94 | 95 | # v1.0.0-beta 96 | - Initial version. 97 | -------------------------------------------------------------------------------- /src/RandomX/arqma/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 
13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 1 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomARQ\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 
56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 1024 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 4 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 262144 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define 
RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/defyx/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 131072 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 2 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "DefyXScala\x13" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 2 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 33554432 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 64 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 1024 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 4 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 262144 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 
68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 65536 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 25 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 16 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/keva/configuration.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 
38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomKV\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 1048576 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 131072 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/monero/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomX\x03" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/graft/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 3 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 2 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomX-Graft\x01" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 170 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 280 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 7 100 | #define RANDOMX_FREQ_IROL_R 3 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/yada/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #pragma once 30 | 31 | //Cache size in KiB. Must be a power of 2. 32 | #define RANDOMX_ARGON_MEMORY 262144 33 | 34 | //Number of Argon2d iterations for Cache initialization. 35 | #define RANDOMX_ARGON_ITERATIONS 4 36 | 37 | //Number of parallel lanes for Cache initialization. 38 | #define RANDOMX_ARGON_LANES 1 39 | 40 | //Argon2d salt 41 | #define RANDOMX_ARGON_SALT "RandomXYadaCoin\x03" 42 | 43 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 44 | #define RANDOMX_CACHE_ACCESSES 8 45 | 46 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 47 | #define RANDOMX_SUPERSCALAR_LATENCY 150 48 | 49 | //Dataset base size in bytes. Must be a power of 2. 50 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 51 | 52 | //Dataset extra size. Must be divisible by 64. 53 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 54 | 55 | //Number of instructions in a RandomX program. Must be divisible by 8. 56 | #define RANDOMX_PROGRAM_SIZE 256 57 | 58 | //Number of iterations during VM execution. 59 | #define RANDOMX_PROGRAM_ITERATIONS 2048 60 | 61 | //Number of chained VM executions per hash. 62 | #define RANDOMX_PROGRAM_COUNT 8 63 | 64 | //Scratchpad L3 size in bytes. Must be a power of 2. 65 | #define RANDOMX_SCRATCHPAD_L3 2097152 66 | 67 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 68 | #define RANDOMX_SCRATCHPAD_L2 262144 69 | 70 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 71 | #define RANDOMX_SCRATCHPAD_L1 16384 72 | 73 | //Jump condition mask size in bits. 74 | #define RANDOMX_JUMP_BITS 8 75 | 76 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
77 | #define RANDOMX_JUMP_OFFSET 8 78 | 79 | /* 80 | Instruction frequencies (per 256 opcodes) 81 | Total sum of frequencies must be 256 82 | */ 83 | 84 | //Integer instructions 85 | #define RANDOMX_FREQ_IADD_RS 16 86 | #define RANDOMX_FREQ_IADD_M 7 87 | #define RANDOMX_FREQ_ISUB_R 16 88 | #define RANDOMX_FREQ_ISUB_M 7 89 | #define RANDOMX_FREQ_IMUL_R 16 90 | #define RANDOMX_FREQ_IMUL_M 4 91 | #define RANDOMX_FREQ_IMULH_R 4 92 | #define RANDOMX_FREQ_IMULH_M 1 93 | #define RANDOMX_FREQ_ISMULH_R 4 94 | #define RANDOMX_FREQ_ISMULH_M 1 95 | #define RANDOMX_FREQ_IMUL_RCP 8 96 | #define RANDOMX_FREQ_INEG_R 2 97 | #define RANDOMX_FREQ_IXOR_R 15 98 | #define RANDOMX_FREQ_IXOR_M 5 99 | #define RANDOMX_FREQ_IROR_R 8 100 | #define RANDOMX_FREQ_IROL_R 2 101 | #define RANDOMX_FREQ_ISWAP_R 4 102 | 103 | //Floating point instructions 104 | #define RANDOMX_FREQ_FSWAP_R 4 105 | #define RANDOMX_FREQ_FADD_R 16 106 | #define RANDOMX_FREQ_FADD_M 5 107 | #define RANDOMX_FREQ_FSUB_R 16 108 | #define RANDOMX_FREQ_FSUB_M 5 109 | #define RANDOMX_FREQ_FSCAL_R 6 110 | #define RANDOMX_FREQ_FMUL_R 32 111 | #define RANDOMX_FREQ_FDIV_M 4 112 | #define RANDOMX_FREQ_FSQRT_R 6 113 | 114 | //Control instructions 115 | #define RANDOMX_FREQ_CBRANCH 25 116 | #define RANDOMX_FREQ_CFROUND 1 117 | 118 | //Store instruction 119 | #define RANDOMX_FREQ_ISTORE 16 120 | 121 | //No-op instruction 122 | #define RANDOMX_FREQ_NOP 0 123 | /* ------ 124 | 256 125 | */ 126 | -------------------------------------------------------------------------------- /src/RandomX/wownero/configuration.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2019, tevador 3 | Copyright (c) 2019, Wownero Inc., a Monero Enterprise Alliance partner company 4 | 5 | All rights reserved. 
6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | * Neither the name of the copyright holder nor the 15 | names of its contributors may be used to endorse or promote products 16 | derived from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #pragma once 31 | 32 | //Cache size in KiB. Must be a power of 2. 33 | #define RANDOMX_ARGON_MEMORY 262144 34 | 35 | //Number of Argon2d iterations for Cache initialization. 36 | #define RANDOMX_ARGON_ITERATIONS 3 37 | 38 | //Number of parallel lanes for Cache initialization. 39 | #define RANDOMX_ARGON_LANES 1 40 | 41 | //Argon2d salt 42 | #define RANDOMX_ARGON_SALT "RandomWOW\x01" 43 | 44 | //Number of random Cache accesses per Dataset item. 
Minimum is 2. 45 | #define RANDOMX_CACHE_ACCESSES 8 46 | 47 | //Target latency for SuperscalarHash (in cycles of the reference CPU). 48 | #define RANDOMX_SUPERSCALAR_LATENCY 170 49 | 50 | //Dataset base size in bytes. Must be a power of 2. 51 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 52 | 53 | //Dataset extra size. Must be divisible by 64. 54 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 55 | 56 | //Number of instructions in a RandomX program. Must be divisible by 8. 57 | #define RANDOMX_PROGRAM_SIZE 256 58 | 59 | //Number of iterations during VM execution. 60 | #define RANDOMX_PROGRAM_ITERATIONS 1024 61 | 62 | //Number of chained VM executions per hash. 63 | #define RANDOMX_PROGRAM_COUNT 16 64 | 65 | //Scratchpad L3 size in bytes. Must be a power of 2. 66 | #define RANDOMX_SCRATCHPAD_L3 1048576 67 | 68 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 69 | #define RANDOMX_SCRATCHPAD_L2 131072 70 | 71 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 72 | #define RANDOMX_SCRATCHPAD_L1 16384 73 | 74 | //Jump condition mask size in bits. 75 | #define RANDOMX_JUMP_BITS 8 76 | 77 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
78 | #define RANDOMX_JUMP_OFFSET 8 79 | 80 | /* 81 | Instruction frequencies (per 256 opcodes) 82 | Total sum of frequencies must be 256 83 | */ 84 | 85 | //Integer instructions 86 | #define RANDOMX_FREQ_IADD_RS 25 87 | #define RANDOMX_FREQ_IADD_M 7 88 | #define RANDOMX_FREQ_ISUB_R 16 89 | #define RANDOMX_FREQ_ISUB_M 7 90 | #define RANDOMX_FREQ_IMUL_R 16 91 | #define RANDOMX_FREQ_IMUL_M 4 92 | #define RANDOMX_FREQ_IMULH_R 4 93 | #define RANDOMX_FREQ_IMULH_M 1 94 | #define RANDOMX_FREQ_ISMULH_R 4 95 | #define RANDOMX_FREQ_ISMULH_M 1 96 | #define RANDOMX_FREQ_IMUL_RCP 8 97 | #define RANDOMX_FREQ_INEG_R 2 98 | #define RANDOMX_FREQ_IXOR_R 15 99 | #define RANDOMX_FREQ_IXOR_M 5 100 | #define RANDOMX_FREQ_IROR_R 10 101 | #define RANDOMX_FREQ_IROL_R 0 102 | #define RANDOMX_FREQ_ISWAP_R 4 103 | 104 | //Floating point instructions 105 | #define RANDOMX_FREQ_FSWAP_R 8 106 | #define RANDOMX_FREQ_FADD_R 20 107 | #define RANDOMX_FREQ_FADD_M 5 108 | #define RANDOMX_FREQ_FSUB_R 20 109 | #define RANDOMX_FREQ_FSUB_M 5 110 | #define RANDOMX_FREQ_FSCAL_R 6 111 | #define RANDOMX_FREQ_FMUL_R 20 112 | #define RANDOMX_FREQ_FDIV_M 4 113 | #define RANDOMX_FREQ_FSQRT_R 6 114 | 115 | //Control instructions 116 | #define RANDOMX_FREQ_CBRANCH 16 117 | #define RANDOMX_FREQ_CFROUND 1 118 | 119 | //Store instruction 120 | #define RANDOMX_FREQ_ISTORE 16 121 | 122 | //No-op instruction 123 | #define RANDOMX_FREQ_NOP 0 124 | /* ------ 125 | 256 126 | */ 127 | -------------------------------------------------------------------------------- /src/3rdparty/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../config.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief A "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief A "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. 
Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /src/KawPow/raven/KawPow_dag.h: -------------------------------------------------------------------------------- 1 | #define ETHASH_HASH_BYTES 64 2 | #define ETHASH_DATASET_PARENTS 512 3 | 4 | #if (__CUDACC_VER_MAJOR__ > 8) 5 | #define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z)) 6 | #else 7 | #define SHFL(x, y, z) __shfl((x), (y), (z)) 8 | #endif 9 | 10 | typedef union { 11 | uint32_t words[64 / sizeof(uint32_t)]; 12 | uint2 uint2s[64 / sizeof(uint2)]; 13 | uint4 uint4s[64 / sizeof(uint4)]; 14 | } hash64_t; 15 | 16 | typedef union { 17 | uint32_t words[200 / sizeof(uint32_t)]; 18 | uint64_t uint64s[200 / sizeof(uint64_t)]; 19 | uint2 uint2s[200 / sizeof(uint2)]; 20 | uint4 uint4s[200 / sizeof(uint4)]; 21 | } hash200_t; 22 | 23 | // Implementation 
based on: 24 | // https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c 25 | // converted from 64->32 bit words 26 | 27 | __device__ __constant__ const uint64_t keccakf_rndc[24] = { 28 | 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 29 | 0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, 30 | 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, 31 | 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, 32 | 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 33 | 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, 34 | 0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, 35 | 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL 36 | }; 37 | 38 | __device__ __forceinline__ uint64_t ROTL64(const uint64_t x, const int offset) 39 | { 40 | uint64_t result; 41 | asm("{\n\t" 42 | ".reg .b64 lhs;\n\t" 43 | ".reg .u32 roff;\n\t" 44 | "shl.b64 lhs, %1, %2;\n\t" 45 | "sub.u32 roff, 64, %2;\n\t" 46 | "shr.b64 %0, %1, roff;\n\t" 47 | "add.u64 %0, lhs, %0;\n\t" 48 | "}\n" 49 | : "=l"(result) 50 | : "l"(x), "r"(offset)); 51 | return result; 52 | } 53 | 54 | __device__ __forceinline__ void keccak_f1600_round(uint64_t st[25], const int r) 55 | { 56 | const uint32_t keccakf_rotc[24] = { 57 | 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 58 | 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 59 | }; 60 | const uint32_t keccakf_piln[24] = { 61 | 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 62 | 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 63 | }; 64 | 65 | uint64_t t, bc[5]; 66 | // Theta 67 | for (int i = 0; i < 5; i++) 68 | bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; 69 | 70 | for (int i = 0; i < 5; i++) { 71 | t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); 72 | for (uint32_t j = 0; j < 25; j += 5) 73 | st[j + i] ^= t; 74 | } 75 | 76 | // Rho Pi 77 | t = st[1]; 78 | for (int i = 0; i < 24; i++) { 79 | uint32_t j = 
keccakf_piln[i]; 80 | bc[0] = st[j]; 81 | st[j] = ROTL64(t, keccakf_rotc[i]); 82 | t = bc[0]; 83 | } 84 | 85 | // Chi 86 | for (uint32_t j = 0; j < 25; j += 5) { 87 | for (int i = 0; i < 5; i++) 88 | bc[i] = st[j + i]; 89 | for (int i = 0; i < 5; i++) 90 | st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; 91 | } 92 | 93 | // Iota 94 | st[0] ^= keccakf_rndc[r]; 95 | } 96 | 97 | __device__ __forceinline__ void keccak_f1600(uint64_t st[25]) 98 | { 99 | for (int i = 8; i < 25; i++) { 100 | st[i] = 0; 101 | } 102 | st[8] = 0x8000000000000001; 103 | 104 | for (int r = 0; r < 24; r++) { 105 | keccak_f1600_round(st, r); 106 | } 107 | } 108 | 109 | #define FNV_PRIME 0x01000193U 110 | #define fnv(x,y) ((uint32_t(x) * (FNV_PRIME)) ^uint32_t(y)) 111 | __device__ uint4 fnv4(uint4 a, uint4 b) 112 | { 113 | uint4 c; 114 | c.x = a.x * FNV_PRIME ^ b.x; 115 | c.y = a.y * FNV_PRIME ^ b.y; 116 | c.z = a.z * FNV_PRIME ^ b.z; 117 | c.w = a.w * FNV_PRIME ^ b.w; 118 | return c; 119 | } 120 | 121 | #define NODE_WORDS (ETHASH_HASH_BYTES/sizeof(uint32_t)) 122 | 123 | __device__ __forceinline__ uint32_t fast_mod(uint32_t a, uint4 d) 124 | { 125 | const uint64_t t = a; 126 | const uint32_t q = ((t + d.y) * d.x) >> d.z; 127 | return a - q * d.w; 128 | } 129 | 130 | __global__ void ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint64_t dag_bytes, hash64_t* g_light, uint4 light_words) 131 | { 132 | uint64_t const node_index = start + uint64_t(blockIdx.x) * blockDim.x + threadIdx.x; 133 | 134 | uint64_t num_nodes = dag_bytes / sizeof(hash64_t); 135 | uint64_t num_nodes_rounded = ((num_nodes + 3) / 4) * 4; 136 | 137 | if (node_index >= num_nodes_rounded) return; // None of the threads from this quad have valid node_index 138 | 139 | hash200_t dag_node; 140 | for(int i = 0; i < 4; ++i) { 141 | dag_node.uint4s[i] = g_light[fast_mod(node_index, light_words)].uint4s[i]; 142 | } 143 | 144 | dag_node.words[0] ^= node_index; 145 | keccak_f1600(dag_node.uint64s); 146 | 147 | const int 
thread_id = threadIdx.x & 3; 148 | 149 | #pragma unroll(4) 150 | for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) { 151 | uint32_t parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words); 152 | 153 | #pragma unroll 154 | for (uint32_t t = 0; t < 4; ++t) { 155 | 156 | const uint32_t shuffle_index = SHFL(parent_index, t, 4); 157 | 158 | const uint4 p4 = g_light[shuffle_index].uint4s[thread_id]; 159 | 160 | #pragma unroll 161 | for (int w = 0; w < 4; ++w) { 162 | 163 | const uint4 s4 = make_uint4(SHFL(p4.x, w, 4), SHFL(p4.y, w, 4), SHFL(p4.z, w, 4), SHFL(p4.w, w, 4)); 164 | if (t == thread_id) { 165 | dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4); 166 | } 167 | } 168 | } 169 | } 170 | 171 | keccak_f1600(dag_node.uint64s); 172 | 173 | for (uint32_t t = 0; t < 4; ++t) { 174 | uint32_t shuffle_index = SHFL(node_index, t, 4); 175 | uint4 s[4]; 176 | for (uint32_t w = 0; w < 4; w++) { 177 | s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4), 178 | SHFL(dag_node.uint4s[w].y, t, 4), 179 | SHFL(dag_node.uint4s[w].z, t, 4), 180 | SHFL(dag_node.uint4s[w].w, t, 4)); 181 | } 182 | if (shuffle_index * sizeof(hash64_t) < dag_bytes) { 183 | g_dag[shuffle_index].uint4s[thread_id] = s[thread_id]; 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/cuda_blake.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | uint32_t h[8], s[4], t[2]; 5 | uint32_t buflen; 6 | int nullt; 7 | uint8_t buf[64]; 8 | } blake_state; 9 | 10 | #define U8TO32(p) \ 11 | (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ 12 | ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) 13 | 14 | #define U32TO8(p, v) \ 15 | (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ 16 | (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); 17 | 18 | #define BLAKE_ROT(x,n) ROTR32(x, n) 19 | #define BLAKE_G(a,b,c,d,e) \ 20 | 
v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ 21 | v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ 22 | v[c] += v[d]; \ 23 | v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ 24 | v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ 25 | v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ 26 | v[c] += v[d]; \ 27 | v[b] = BLAKE_ROT(v[b] ^ v[c], 7); 28 | 29 | __constant__ uint8_t d_blake_sigma[14][16] = 30 | { 31 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 32 | {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, 33 | {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, 34 | {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, 35 | {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, 36 | {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, 37 | {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, 38 | {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, 39 | {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, 40 | {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, 41 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 42 | {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, 43 | {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, 44 | {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} 45 | }; 46 | __constant__ uint32_t d_blake_cst[16] 47 | = { 48 | 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 49 | 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 50 | 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, 51 | 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 52 | }; 53 | 54 | __device__ void cn_blake_compress(blake_state *S, const uint8_t *block) 55 | { 56 | uint32_t v[16], m[16], i; 57 | 58 | for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); 59 | for (i = 0; i < 8; ++i) v[i] = S->h[i]; 60 | v[ 8] = S->s[0] ^ 0x243F6A88; 61 | v[ 9] = S->s[1] ^ 0x85A308D3; 62 | v[10] = S->s[2] ^ 0x13198A2E; 63 | v[11] = S->s[3] ^ 0x03707344; 64 | v[12] = 0xA4093822; 65 | v[13] = 0x299F31D0; 66 | v[14] = 
0x082EFA98; 67 | v[15] = 0xEC4E6C89; 68 | 69 | if (S->nullt == 0) 70 | { 71 | v[12] ^= S->t[0]; 72 | v[13] ^= S->t[0]; 73 | v[14] ^= S->t[1]; 74 | v[15] ^= S->t[1]; 75 | } 76 | 77 | for (i = 0; i < 14; ++i) 78 | { 79 | BLAKE_G(0, 4, 8, 12, 0); 80 | BLAKE_G(1, 5, 9, 13, 2); 81 | BLAKE_G(2, 6, 10, 14, 4); 82 | BLAKE_G(3, 7, 11, 15, 6); 83 | BLAKE_G(3, 4, 9, 14, 14); 84 | BLAKE_G(2, 7, 8, 13, 12); 85 | BLAKE_G(0, 5, 10, 15, 8); 86 | BLAKE_G(1, 6, 11, 12, 10); 87 | } 88 | 89 | for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; 90 | for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; 91 | } 92 | 93 | __device__ void cn_blake_update(blake_state *S, const uint8_t *data, uint64_t datalen) 94 | { 95 | uint32_t left = S->buflen >> 3; 96 | uint32_t fill = 64 - left; 97 | 98 | if (left && (((datalen >> 3) & 0x3F) >= fill)) 99 | { 100 | memcpy((void *) (S->buf + left), (void *) data, fill); 101 | S->t[0] += 512; 102 | if (S->t[0] == 0) S->t[1]++; 103 | cn_blake_compress(S, S->buf); 104 | data += fill; 105 | datalen -= (fill << 3); 106 | left = 0; 107 | } 108 | 109 | while (datalen >= 512) 110 | { 111 | S->t[0] += 512; 112 | if (S->t[0] == 0) S->t[1]++; 113 | cn_blake_compress(S, data); 114 | data += 64; 115 | datalen -= 512; 116 | } 117 | 118 | if (datalen > 0) 119 | { 120 | memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); 121 | S->buflen = (left << 3) + datalen; 122 | } 123 | else 124 | { 125 | S->buflen = 0; 126 | } 127 | } 128 | 129 | __device__ void cn_blake_final(blake_state *S, uint8_t *digest) 130 | { 131 | const uint8_t padding[] = 132 | { 133 | 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 134 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | }; 136 | 137 | uint8_t pa = 0x81, pb = 0x01; 138 | uint8_t msglen[8]; 139 | uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; 140 | if (lo < (unsigned) S->buflen) hi++; 141 | U32TO8(msglen + 0, hi); 142 | U32TO8(msglen + 4, lo); 143 | 144 | if (S->buflen == 440) 145 | { 146 | S->t[0] 
-= 8; 147 | cn_blake_update(S, &pa, 8); 148 | } 149 | else 150 | { 151 | if (S->buflen < 440) 152 | { 153 | if (S->buflen == 0) S->nullt = 1; 154 | S->t[0] -= 440 - S->buflen; 155 | cn_blake_update(S, padding, 440 - S->buflen); 156 | } 157 | else 158 | { 159 | S->t[0] -= 512 - S->buflen; 160 | cn_blake_update(S, padding, 512 - S->buflen); 161 | S->t[0] -= 440; 162 | cn_blake_update(S, padding + 1, 440); 163 | S->nullt = 1; 164 | } 165 | cn_blake_update(S, &pb, 8); 166 | S->t[0] -= 8; 167 | } 168 | S->t[0] -= 64; 169 | cn_blake_update(S, msglen, 64); 170 | 171 | U32TO8(digest + 0, S->h[0]); 172 | U32TO8(digest + 4, S->h[1]); 173 | U32TO8(digest + 8, S->h[2]); 174 | U32TO8(digest + 12, S->h[3]); 175 | U32TO8(digest + 16, S->h[4]); 176 | U32TO8(digest + 20, S->h[5]); 177 | U32TO8(digest + 24, S->h[6]); 178 | U32TO8(digest + 28, S->h[7]); 179 | } 180 | 181 | __device__ void cn_blake(const uint8_t *in, uint64_t inlen, uint8_t *out) 182 | { 183 | blake_state bs; 184 | blake_state *S = (blake_state *)&bs; 185 | 186 | S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; 187 | S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; 188 | S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; 189 | S->t[0] = S->t[1] = S->buflen = S->nullt = 0; 190 | S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; 191 | 192 | cn_blake_update(S, in, inlen * 8); 193 | cn_blake_final(S, out); 194 | } 195 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_cpp_dialect.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file 29 | * \brief Detect the version of the C++ standard used by the compiler. 30 | */ 31 | 32 | #pragma once 33 | 34 | #include "util_compiler.cuh" 35 | 36 | // Deprecation warnings may be silenced by defining the following macros. These 37 | // may be combined. 
38 | // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: 39 | // Ignore all deprecated C++ dialects and outdated compilers. 40 | // - CUB_IGNORE_DEPRECATED_CPP_11: 41 | // Ignore deprecation warnings when compiling with C++11. C++03 and outdated 42 | // compilers will still issue warnings. 43 | // - CUB_IGNORE_DEPRECATED_COMPILER 44 | // Ignore deprecation warnings when using deprecated compilers. Compiling 45 | // with C++03 and C++11 will still issue warnings. 46 | 47 | // Check for the thrust opt-outs as well: 48 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && \ 49 | defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) 50 | # define CUB_IGNORE_DEPRECATED_CPP_DIALECT 51 | #endif 52 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && \ 53 | defined(THRUST_IGNORE_DEPRECATED_CPP_11) 54 | # define CUB_IGNORE_DEPRECATED_CPP_11 55 | #endif 56 | #if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && \ 57 | defined(THRUST_IGNORE_DEPRECATED_COMPILER) 58 | # define CUB_IGNORE_DEPRECATED_COMPILER 59 | #endif 60 | 61 | #ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT 62 | # define CUB_IGNORE_DEPRECATED_CPP_11 63 | # define CUB_IGNORE_DEPRECATED_COMPILER 64 | #endif 65 | 66 | // Define this to override the built-in detection. 67 | #ifndef CUB_CPP_DIALECT 68 | 69 | // MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead. 70 | // This macro is only defined in MSVC 2015U3+. 71 | # ifdef _MSVC_LANG // Do not replace with CUB_HOST_COMPILER test (see above) 72 | // MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11. 73 | # if CUB_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */ 74 | # define CUB_CPLUSPLUS 201103L /* Fix to 2011 */ 75 | # else 76 | # define CUB_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. 
*/ 77 | # endif // MSVC 2015 C++14 fix 78 | # else 79 | # define CUB_CPLUSPLUS __cplusplus 80 | # endif 81 | 82 | // Detect current dialect: 83 | # if CUB_CPLUSPLUS < 201103L 84 | # define CUB_CPP_DIALECT 2003 85 | # elif CUB_CPLUSPLUS < 201402L 86 | # define CUB_CPP_DIALECT 2011 87 | # elif CUB_CPLUSPLUS < 201703L 88 | # define CUB_CPP_DIALECT 2014 89 | # elif CUB_CPLUSPLUS == 201703L 90 | # define CUB_CPP_DIALECT 2017 91 | # elif CUB_CPLUSPLUS > 201703L // unknown, but is higher than 2017. 92 | # define CUB_CPP_DIALECT 2020 93 | # endif 94 | 95 | # undef CUB_CPLUSPLUS // cleanup 96 | 97 | #endif // !CUB_CPP_DIALECT 98 | 99 | // Define CUB_COMPILER_DEPRECATION macro: 100 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 101 | # define CUB_COMP_DEPR_IMPL(msg) \ 102 | __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg)) 103 | # define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x) 104 | # define CUB_COMP_DEPR_IMPL1(x) #x 105 | #else // clang / gcc: 106 | # define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg) 107 | # define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr) 108 | # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ 109 | #endif 110 | 111 | #define CUB_COMPILER_DEPRECATION(REQ, FIX) \ 112 | CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
113 | 114 | // Minimum required compiler checks: 115 | #ifndef CUB_IGNORE_DEPRECATED_COMPILER 116 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000 117 | CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler); 118 | # endif 119 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000 120 | CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler); 121 | # endif 122 | # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 123 | CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler); 124 | # endif 125 | #endif 126 | 127 | #if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && CUB_CPP_DIALECT < 2014 && \ 128 | (CUB_CPP_DIALECT != 2011 || !defined(CUB_IGNORE_DEPRECATED_CPP_11)) 129 | CUB_COMPILER_DEPRECATION(C++14, pass -std=c++14 to your compiler); 130 | #endif 131 | 132 | #undef CUB_COMPILER_DEPRECATION 133 | #undef CUB_COMP_DEPR_IMPL 134 | #undef CUB_COMP_DEPR_IMPL0 135 | #undef CUB_COMP_DEPR_IMPL1 136 | -------------------------------------------------------------------------------- /src/3rdparty/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | 76 | #ifdef CUB_RUNTIME_ENABLED 77 | // Clear the global CUDA error state which may have been set by the last 78 | // call. Otherwise, errors may "leak" to unrelated kernel launches. 
79 | cudaGetLastError(); 80 | #endif 81 | 82 | #ifdef CUB_STDERR 83 | if (error) 84 | { 85 | if (CUB_IS_HOST_CODE) { 86 | #if CUB_INCLUDE_HOST_CODE 87 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 88 | fflush(stderr); 89 | #endif 90 | } else { 91 | #if CUB_INCLUDE_DEVICE_CODE 92 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 93 | #endif 94 | } 95 | } 96 | #endif 97 | return error; 98 | } 99 | 100 | 101 | /** 102 | * \brief Debug macro 103 | */ 104 | #ifndef CubDebug 105 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 106 | #endif 107 | 108 | 109 | /** 110 | * \brief Debug macro with exit 111 | */ 112 | #ifndef CubDebugExit 113 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 114 | #endif 115 | 116 | 117 | /** 118 | * \brief Log macro for printf statements. 119 | */ 120 | #if !defined(_CubLog) 121 | #if defined(__NVCOMPILER_CUDA__) 122 | #define _CubLog(format, ...) (__builtin_is_device_code() \ 123 | ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ 124 | blockIdx.z, blockIdx.y, blockIdx.x, \ 125 | threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ 126 | : printf(format, __VA_ARGS__)); 127 | #elif !(defined(__clang__) && defined(__CUDA__)) 128 | #if (CUB_PTX_ARCH == 0) 129 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 130 | #elif (CUB_PTX_ARCH >= 200) 131 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 132 | #endif 133 | #else 134 | // XXX shameless hack for clang around variadic printf... 
135 | // Compiles w/o supplying -std=c++11 but shows warning, 136 | // so we silence them :) 137 | #pragma clang diagnostic ignored "-Wc++11-extensions" 138 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 139 | template 140 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 141 | { 142 | #ifdef __CUDA_ARCH__ 143 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 144 | #else 145 | printf(format, args...); 146 | #endif 147 | } 148 | #ifndef __CUDA_ARCH__ 149 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 150 | #else 151 | #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 152 | #endif 153 | #endif 154 | #endif 155 | 156 | 157 | 158 | 159 | /** @} */ // end group UtilMgmt 160 | 161 | } // CUB namespace 162 | CUB_NS_POSTFIX // Optional outer namespace(s) 163 | -------------------------------------------------------------------------------- /src/crypto/cn/CnAlgo.h: -------------------------------------------------------------------------------- 1 | /* XMRig 2 | * Copyright (c) 2018-2021 SChernykh 3 | * Copyright (c) 2016-2021 XMRig , 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see .
17 | */ 18 | 19 | #ifndef XMRIG_CN_ALGO_H 20 | #define XMRIG_CN_ALGO_H 21 | 22 | 23 | #include 24 | #include 25 | 26 | 27 | #include "crypto/common/Algorithm.h" 28 | 29 | 30 | namespace xmrig_cuda 31 | { 32 | 33 | 34 | template 35 | class CnAlgo 36 | { 37 | public: 38 | constexpr CnAlgo() {}; 39 | 40 | constexpr inline Algorithm::Id base() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return Algorithm::base(ALGO); } 41 | constexpr inline bool isHeavy() const { return Algorithm::family(ALGO) == Algorithm::CN_HEAVY; } 42 | constexpr inline bool isR() const { return ALGO == Algorithm::CN_R; } 43 | constexpr inline size_t memory() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return Algorithm::l3(ALGO); } 44 | constexpr inline uint32_t iterations() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return CN_ITER; } 45 | constexpr inline uint32_t mask() const { return static_cast(((memory() - 1) / 16) * 16); } 46 | 47 | inline static uint32_t iterations(Algorithm::Id algo) 48 | { 49 | switch (algo) { 50 | case Algorithm::CN_0: 51 | case Algorithm::CN_1: 52 | case Algorithm::CN_2: 53 | case Algorithm::CN_R: 54 | case Algorithm::CN_RTO: 55 | return CN_ITER; 56 | 57 | case Algorithm::CN_FAST: 58 | case Algorithm::CN_HALF: 59 | # ifdef XMRIG_ALGO_CN_LITE 60 | case Algorithm::CN_LITE_0: 61 | case Algorithm::CN_LITE_1: 62 | # endif 63 | # ifdef XMRIG_ALGO_CN_HEAVY 64 | case Algorithm::CN_HEAVY_0: 65 | case Algorithm::CN_HEAVY_TUBE: 66 | case Algorithm::CN_HEAVY_XHV: 67 | # endif 68 | case Algorithm::CN_CCX: 69 | return CN_ITER / 2; 70 | 71 | case Algorithm::CN_RWZ: 72 | case Algorithm::CN_ZLS: 73 | return 0x60000; 74 | 75 | case Algorithm::CN_XAO: 76 | case Algorithm::CN_DOUBLE: 77 | return CN_ITER * 2; 78 | 79 | # ifdef XMRIG_ALGO_CN_PICO 80 | case Algorithm::CN_PICO_0: 81 | case Algorithm::CN_PICO_TLO: 82 | return CN_ITER / 8; 83 | # endif 84 | 85 | # ifdef XMRIG_ALGO_CN_FEMTO 86 
| case Algorithm::CN_UPX2: 87 | return CN_ITER / 32; 88 | # endif 89 | 90 | # ifdef XMRIG_ALGO_CN_GPU 91 | case Algorithm::CN_GPU: 92 | return 0xC000; 93 | # endif 94 | 95 | default: 96 | break; 97 | } 98 | 99 | return 0; 100 | } 101 | 102 | inline static uint32_t mask(Algorithm::Id algo) 103 | { 104 | # ifdef XMRIG_ALGO_CN_PICO 105 | if (algo == Algorithm::CN_PICO_0) { 106 | return 0x1FFF0; 107 | } 108 | # endif 109 | 110 | # ifdef XMRIG_ALGO_CN_FEMTO 111 | if (algo == Algorithm::CN_UPX2) { 112 | return 0x1FFF0; 113 | } 114 | # endif 115 | 116 | # ifdef XMRIG_ALGO_CN_GPU 117 | if (algo == Algorithm::CN_GPU) { 118 | return 0x1FFFC0; 119 | } 120 | # endif 121 | 122 | return ((Algorithm::l3(algo) - 1) / 16) * 16; 123 | } 124 | 125 | private: 126 | constexpr const static uint32_t CN_ITER = 0x80000; 127 | }; 128 | 129 | 130 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 131 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 132 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 133 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 134 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 135 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 136 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 137 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER * 2; } 138 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER * 2; } 139 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; } 140 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; } 141 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; } 142 | template<> constexpr inline uint32_t 
CnAlgo::iterations() const { return CN_ITER / 8; } 143 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; } 144 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 32; } 145 | template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; } 146 | 147 | 148 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; } 149 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; } 150 | template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; } 151 | 152 | 153 | } /* namespace xmrig_cuda */ 154 | 155 | 156 | #endif /* XMRIG_CN_ALGO_H */ 157 | -------------------------------------------------------------------------------- /src/3rdparty/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../config.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a 
sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 
116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /src/3rdparty/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
*/


#pragma once

#include "../config.cuh"
#include "../util_type.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png)
 * \ingroup BlockModule
 *
 * \par Overview
 * This type facilitates a shared memory usage pattern where a block of CUDA
 * threads places elements into shared memory and then reduces the active
 * parallelism to one "raking" warp of threads for serially aggregating consecutive
 * sequences of shared items. Padding is inserted to eliminate bank conflicts
 * (for most data types).
 *
 * All members are compile-time constants; the only device code is the two
 * pointer-arithmetic helpers at the bottom.
 *
 * \tparam T The data type to be exchanged.
 * \tparam BLOCK_THREADS The thread block size in threads.
 * \tparam PTX_ARCH [optional] \ptxversion
 */
template <
    typename T,
    int BLOCK_THREADS,
    int PTX_ARCH = CUB_PTX_ARCH>
struct BlockRakingLayout
{
    //---------------------------------------------------------------------
    // Constants and type definitions
    //---------------------------------------------------------------------

    enum
    {
        /// The total number of elements that need to be cooperatively reduced
        SHARED_ELEMENTS = BLOCK_THREADS,

        /// Maximum number of warp-synchronous raking threads
        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),

        /// Number of raking elements per warp-synchronous raking thread (rounded up)
        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,

        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,

        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),

        /// Degree of bank conflicts (e.g., 4-way)
        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
            1,

        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),

        /// Total number of elements in the raking grid
        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),

        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
        /// (note: UNGUARDED is true when raking may proceed WITHOUT bounds checks, i.e., when the division is exact)
        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
    };


    /**
     * \brief Shared memory storage type
     */
    struct __align__(16) _TempStorage
    {
        T buff[BlockRakingLayout::GRID_ELEMENTS];
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    /**
     * \brief Returns the location for the calling thread to place data into the grid
     */
    static __device__ __forceinline__ T* PlacementPtr(
        TempStorage &temp_storage,
        unsigned int linear_tid)
    {
        // Offset for partial
        unsigned int offset = linear_tid;

        // Add in one padding element for every segment
        if (USE_SEGMENT_PADDING > 0)
        {
            offset += offset / SEGMENT_LENGTH;
        }

        // Incorporating a block of padding partials every shared memory segment
        return temp_storage.Alias().buff + offset;
    }


    /**
     * \brief Returns the location for the calling thread to begin sequential raking
     */
    static __device__ __forceinline__ T* RakingPtr(
        TempStorage &temp_storage,
        unsigned int linear_tid)
    {
        // Each raking thread owns one (optionally padded) contiguous segment.
        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
    }
};

} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

--------------------------------------------------------------------------------
/src/KawPow/raven/KawPow.cu:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright 2010      Jeff Garzik
 * Copyright 2012-2014 pooler
 * Copyright 2014      Lucas Jones
 * Copyright 2014-2016 Wolf9466
 * Copyright 2016      Jay D Dee
 * Copyright 2017-2018 XMR-Stak
 * Copyright 2018-2020 SChernykh
 * Copyright 2016-2020 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/


// NOTE(review): extraction dropped the <...> targets of the three system
// includes below (likely <cstdint>, <cstring>, <string> or similar) — restore
// from the upstream file before compiling.
#include 
#include 
#include 

#include "cryptonight.h"
#include "cuda_device.hpp"
#include "KawPow_dag.h"
#include "CudaKawPow_gen.h"


// Prepares a device context for KawPow hashing:
//  1. uploads the light cache (or reuses a host-precalculated partial DAG),
//  2. (re)builds the full DAG on-device via ethash_calculate_dag_item,
//  3. JIT-compiles the per-period kernel via NVRTC-generated PTX and
//     pre-warms the next period's program in the background,
//  4. lazily allocates the pinned host/device "stop" flag pair.
//
// cache / cache_size   : ethash light cache bytes (host memory).
// dag_precalc          : optional host-precalculated first DAG items; when set,
//                        the light cache is staged at the start of DAG memory,
//                        items past cache_items are computed on-device reading
//                        the DAG itself, and the precalc bytes then overwrite
//                        the staged region.
// dag_size / height    : full DAG size in bytes and block height (period = height/3).
// dag_sizes            : per-epoch DAG size table handed to the program generator.
void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes)
{
    // Allocations are rounded up to 1 MB so small epoch growth reuses buffers.
    constexpr size_t MEM_ALIGN = 1024 * 1024;

    if (cache_size != ctx->kawpow_cache_size) {
        ctx->kawpow_cache_size = cache_size;

        if (!dag_precalc) {
            if (cache_size > ctx->kawpow_cache_capacity) {
                CUDA_CHECK(ctx->device_id, cudaFree(ctx->kawpow_cache));

                ctx->kawpow_cache_capacity = ((cache_size + MEM_ALIGN - 1) / MEM_ALIGN) * MEM_ALIGN;
                CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_cache, ctx->kawpow_cache_capacity));
            }

            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_cache), cache, cache_size, cudaMemcpyHostToDevice));
        }
    }

    if (dag_size != ctx->kawpow_dag_size) {
        ctx->kawpow_dag_size = dag_size;

        if (dag_size > ctx->kawpow_dag_capacity) {
            CUDA_CHECK(ctx->device_id, cudaFree(ctx->kawpow_dag));

            ctx->kawpow_dag_capacity = ((dag_size + MEM_ALIGN - 1) / MEM_ALIGN) * MEM_ALIGN;
            CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_dag, ctx->kawpow_dag_capacity));
        }

        if (dag_precalc) {
            // Intentional: stage the light cache at the START of DAG memory so
            // the build loop below can read it in place; it is overwritten by
            // the host-precalculated items after the loop.
            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_dag), cache, cache_size, cudaMemcpyHostToDevice));
        }

        constexpr int blocks = 8192;
        constexpr int threads = 32;

        // cache_items: light-cache size rounded up to 256 bytes, in 64-byte hash units.
        const size_t cache_items = ((cache_size + 255) / 256) * 256 / sizeof(hash64_t);
        const size_t dag_items = dag_size / sizeof(hash64_t);

        // light_words packs the fast-modulus constants: .w = divisor (number of
        // light-cache items), .x/.y/.z = reciprocal/increment/shift.
        uint4 light_words;
        light_words.w = ctx->kawpow_cache_size / sizeof(hash64_t);
        calculate_fast_mod_data(light_words.w, light_words.x, light_words.y, light_words.z);

        // Build the DAG in slices of blocks*threads items, synchronizing after
        // each slice so the driver stays responsive.
        // When dag_precalc is set, skip the first cache_items (supplied by host).
        for (size_t i = dag_precalc ? cache_items : 0; i < dag_items; i += blocks * threads) {
            // NOTE(review): extraction dropped the <<<...>>> launch configuration
            // here (presumably <<<blocks, threads>>>) — restore from upstream.
            CUDA_CHECK_KERNEL(ctx->device_id, ethash_calculate_dag_item<<>>(
                i,
                (hash64_t*) ctx->kawpow_dag,
                ctx->kawpow_dag_size,
                (hash64_t*)(dag_precalc ? ctx->kawpow_dag : ctx->kawpow_cache),
                light_words
            ));
            CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());
        }

        if (dag_precalc) {
            // Replace the staged light cache with the real precalculated DAG head.
            CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->kawpow_dag), dag_precalc, cache_items * sizeof(hash64_t), cudaMemcpyHostToDevice));
        }
    }

    // KawPow changes its program every PERIOD_LENGTH blocks.
    constexpr uint32_t PERIOD_LENGTH = 3;
    const uint32_t period = height / PERIOD_LENGTH;

    if (ctx->kawpow_period != period) {
        if (ctx->kawpow_module) {
            cuModuleUnload(ctx->kawpow_module);
        }

        // NOTE(review): extraction dropped the template argument of std::vector
        // here (presumably std::vector<char>) — restore from upstream.
        std::vector ptx;
        std::string lowered_name;
        KawPow_get_program(ptx, lowered_name, period, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);

        CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->kawpow_module, ptx.data(), 0, 0, 0));
        CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kawpow_kernel, ctx->kawpow_module, lowered_name.c_str()));

        ctx->kawpow_period = period;

        // Kick off background compilation of the NEXT period's program so the
        // period rollover does not stall hashing (last arg = background).
        KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
    }

    if (!ctx->kawpow_stop_host) {
        // Pinned, device-mapped pair of words: [0] = stop flag, [1] = skipped-hash counter.
        CUDA_CHECK(ctx->device_id, cudaMallocHost(&ctx->kawpow_stop_host, sizeof(uint32_t) * 2));
        CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->kawpow_stop_device, ctx->kawpow_stop_host, 0));
    }
}


// Raises the host-side stop flag; the running kernel observes it through the
// mapped device pointer and aborts early.
void kawpow_stop_hash(nvid_ctx *ctx)
{
    if (ctx->kawpow_stop_host) {
        *ctx->kawpow_stop_host = 1;
    }
}


namespace KawPow_Raven {

// Runs one batch of the JIT-compiled KawPow search kernel.
// job_blob: first 40 bytes are copied to d_input (header hash + start nonce).
// On return: *rescount (clamped to 15), resnonce[0..rescount), and
// *skipped_hashes = hashes skipped after a stop request.
void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes)
{
    dim3 grid(ctx->device_blocks);
    dim3 block(ctx->device_threads);

    uint32_t hack_false = 0;
    void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop_device };

    CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, job_blob, 40, cudaMemcpyHostToDevice));
    CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, sizeof(uint32_t)));
    // kawpow_stop_host is pinned host memory, so plain memset is correct here.
    memset(ctx->kawpow_stop_host, 0, sizeof(uint32_t) * 2);

    CU_CHECK(ctx->device_id, cuLaunchKernel(
        ctx->kawpow_kernel,
        grid.x, grid.y, grid.z,
        block.x, block.y, block.z,
        0, nullptr, args, 0
    ));
    CU_CHECK(ctx->device_id, cuCtxSynchronize());

    *skipped_hashes = ctx->kawpow_stop_host[1];

    // results[0] = found-nonce count, results[1..15] = nonces.
    uint32_t results[16];
    CUDA_CHECK(ctx->device_id, cudaMemcpy(results, ctx->d_result_nonce, sizeof(results), cudaMemcpyDeviceToHost));

    if (results[0] > 15) {
        results[0] = 15;
    }

    *rescount = results[0];
    memcpy(resnonce, results + 1, results[0] * sizeof(uint32_t));
}

}
--------------------------------------------------------------------------------
/src/cryptonight.h:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright 2010      Jeff Garzik
 * Copyright 2012-2014 pooler
 * Copyright 2014      Lucas Jones
 * Copyright 2014-2016 Wolf9466
 * Copyright 2016      Jay D Dee
 * Copyright 2017-2018 XMR-Stak
 * Copyright 2018      Lee Clagett
 * Copyright 2019      Spudz76
 * Copyright 2018-2020 SChernykh
 * Copyright 2016-2020 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */


#pragma once

#include "crypto/common/Algorithm.h"


// NOTE(review): extraction dropped the <...> targets of the system includes
// below (likely <cstddef>/<cstdint> and <cuda.h>) — restore from upstream.
#include 


#if defined(XMRIG_ALGO_KAWPOW) || defined(XMRIG_ALGO_CN_R)
#   include 
#endif


// Per-GPU mining context: device properties, launch configuration and every
// device-side buffer shared by the CryptoNight / RandomX / KawPow backends.
struct nvid_ctx {
#   ifdef XMRIG_ALGO_CN_R
    // Driver-API handles for the JIT-compiled CryptonightR program.
    CUdevice cuDevice       = -1;
    CUmodule module         = nullptr;
    CUfunction kernel       = nullptr;
#   endif

    xmrig_cuda::Algorithm algorithm = xmrig_cuda::Algorithm::INVALID;
    uint64_t kernel_height  = 0;     // height the current JIT kernel was built for

    // Static device properties and user-tunable launch parameters.
    int device_id           = 0;
    const char *device_name = nullptr;
    int device_arch[2]      { 0,};   // [0]=major, [1]=minor compute capability
    int device_mpcount      = 0;
    int device_blocks       = 0;
    int device_threads      = 0;
    int device_bfactor      = 0;
    int device_bsleep       = 0;
    int device_clockRate    = 0;
    int device_memoryClockRate = 0;
    size_t device_memoryTotal  = 0;
    size_t device_memoryFree   = 0;
    int device_pciBusID     = 0;
    int device_pciDeviceID  = 0;
    int device_pciDomainID  = 0;
    uint32_t syncMode       = 3;
    bool ready              = false;

    // CryptoNight device buffers (d_ prefix = device memory).
    uint32_t *d_input        = nullptr;
    int inputlen             = 0;
    uint32_t *d_result_count = nullptr;
    uint32_t *d_result_nonce = nullptr;
    uint32_t *d_long_state   = nullptr;
    uint64_t d_scratchpads_size = 0;
    uint32_t *d_ctx_state    = nullptr;
    uint32_t *d_ctx_state2   = nullptr;
    uint32_t *d_ctx_a        = nullptr;
    uint32_t *d_ctx_b        = nullptr;
    uint32_t *d_ctx_key1     = nullptr;
    uint32_t *d_ctx_key2     = nullptr;
    uint32_t *d_ctx_text     = nullptr;

    // RandomX device buffers.
    uint32_t rx_batch_size   = 0;
    int32_t rx_dataset_host  = -1;
    uint32_t *d_rx_dataset   = nullptr;
    uint32_t *d_rx_hashes    = nullptr;
    uint32_t *d_rx_entropy   = nullptr;
    uint32_t *d_rx_vm_states = nullptr;
    uint32_t *d_rx_rounding  = nullptr;

#   ifdef XMRIG_ALGO_KAWPOW
    // KawPow light cache / DAG buffers (capacity is the rounded-up allocation,
    // size is the currently valid payload — see kawpow_prepare()).
    void* kawpow_cache           = nullptr;
    size_t kawpow_cache_size     = 0;
    size_t kawpow_cache_capacity = 0;

    void* kawpow_dag             = nullptr;
    size_t kawpow_dag_size       = 0;
    size_t kawpow_dag_capacity   = 0;

    // Pinned host word pair + its device mapping: [0]=stop flag, [1]=skipped hashes.
    uint32_t* kawpow_stop_host   = nullptr;
    uint32_t* kawpow_stop_device = nullptr;

    uint32_t kawpow_period       = 0;

    CUmodule kawpow_module       = nullptr;
    CUfunction kawpow_kernel     = nullptr;
#   endif
};


int cuda_get_devicecount();
int cuda_get_runtime_version();
int cuda_get_driver_version();
int cuda_get_deviceinfo(nvid_ctx *ctx);
int cryptonight_gpu_init(nvid_ctx *ctx);
void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
void cryptonight_extra_cpu_prepare(nvid_ctx *ctx, uint32_t startNonce, const xmrig_cuda::Algorithm &algorithm);
void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm, uint64_t height, uint32_t startNonce);
void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, const xmrig_cuda::Algorithm &algorithm);

void cuda_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size);
void randomx_update_dataset(nvid_ctx* ctx, const void* dataset, size_t dataset_size);

// One hash() entry point per RandomX coin variant (separate translation units).
namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Keva { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Yada { void hash(nvid_ctx* ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t* rescount, uint32_t* resnonce, uint32_t batch_size); }

#ifdef XMRIG_ALGO_KAWPOW
void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes);
void kawpow_stop_hash(nvid_ctx *ctx);

namespace KawPow_Raven { void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t *skipped_hashes); }
#endif
--------------------------------------------------------------------------------
/src/crypto/common/Algorithm.h:
--------------------------------------------------------------------------------
/* XMRig
 * Copyright (c) 2018      Lee Clagett
 * Copyright (c) 2018-2021 SChernykh
 * Copyright (c) 2016-2021 XMRig
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef XMRIG_ALGORITHM_H
#define XMRIG_ALGORITHM_H


// NOTE(review): extraction dropped the <...> targets of the three system
// includes below — restore from upstream.
#include 
#include 
#include 


namespace xmrig_cuda {


// Mining algorithm identifier. Each Id packs its family, memory requirements
// and variant into the 32-bit value (see the constexpr helpers below, which
// decode family and L2/L3 sizes straight from the bit pattern).
class Algorithm
{
public:
    enum Id : uint32_t {
        INVALID       = 0,
        CN_0          = 0x63150000, // "cn/0"             CryptoNight (original).
        CN_1          = 0x63150100, // "cn/1"             CryptoNight variant 1 also known as Monero7 and CryptoNightV7.
        CN_2          = 0x63150200, // "cn/2"             CryptoNight variant 2.
        CN_R          = 0x63150272, // "cn/r"             CryptoNightR (Monero's variant 4).
        CN_FAST       = 0x63150166, // "cn/fast"          CryptoNight variant 1 with half iterations.
        CN_HALF       = 0x63150268, // "cn/half"          CryptoNight variant 2 with half iterations (Masari/Torque).
        CN_XAO        = 0x63150078, // "cn/xao"           CryptoNight variant 0 (modified, Alloy only).
        CN_RTO        = 0x63150172, // "cn/rto"           CryptoNight variant 1 (modified, Arto only).
        CN_RWZ        = 0x63150277, // "cn/rwz"           CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft).
        CN_ZLS        = 0x6315027a, // "cn/zls"           CryptoNight variant 2 with 3/4 iterations (Zelerius).
        CN_DOUBLE     = 0x63150264, // "cn/double"        CryptoNight variant 2 with double iterations (X-CASH).
        CN_CCX        = 0x63150063, // "cn/ccx"           Conceal (CCX)
        CN_LITE_0     = 0x63140000, // "cn-lite/0"        CryptoNight-Lite variant 0.
        CN_LITE_1     = 0x63140100, // "cn-lite/1"        CryptoNight-Lite variant 1.
        CN_HEAVY_0    = 0x63160000, // "cn-heavy/0"       CryptoNight-Heavy (4 MB).
        CN_HEAVY_TUBE = 0x63160172, // "cn-heavy/tube"    CryptoNight-Heavy (modified, TUBE only).
        CN_HEAVY_XHV  = 0x63160068, // "cn-heavy/xhv"     CryptoNight-Heavy (modified, Haven Protocol only).
        CN_PICO_0     = 0x63120200, // "cn-pico"          CryptoNight-Pico
        CN_PICO_TLO   = 0x63120274, // "cn-pico/tlo"      CryptoNight-Pico (TLO)
        CN_UPX2       = 0x63110200, // "cn/upx2"          Uplexa (UPX2)
        CN_GPU        = 0x63150300, // "cn/gpu"           CryptoNight-GPU (Ryo).
        RX_0          = 0x72151200, // "rx/0"             RandomX (reference configuration).
        RX_WOW        = 0x72141177, // "rx/wow"           RandomWOW (Wownero).
        RX_ARQ        = 0x72121061, // "rx/arq"           RandomARQ (Arqma).
        RX_GRAFT      = 0x72151267, // "rx/graft"         RandomGRAFT (Graft).
        RX_SFX        = 0x72151273, // "rx/sfx"           RandomSFX (Safex Cash).
        RX_KEVA       = 0x7214116b, // "rx/keva"          RandomKEVA (Keva).
        RX_YADA       = 0x72151279, // "rx/yada"          RandomYada (YadaCoin).
        AR2_CHUKWA    = 0x61130000, // "argon2/chukwa"    Argon2id (Chukwa).
        AR2_CHUKWA_V2 = 0x61140000, // "argon2/chukwav2"  Argon2id (Chukwa v2).
        AR2_WRKZ      = 0x61120000, // "argon2/wrkz"      Argon2id (WRKZ)
        KAWPOW_RVN    = 0x6b0f0000, // "kawpow/rvn"       KawPow (RVN)

        RX_XLA        = 0x721211ff, // "panthera"         Panthera (Scala2).
    };

    // Family = top byte (+ variant bits for CryptoNight); values match the
    // masks applied by family(Id) below.
    enum Family : uint32_t {
        UNKNOWN  = 0,
        CN_ANY   = 0x63000000,
        CN       = 0x63150000,
        CN_LITE  = 0x63140000,
        CN_HEAVY = 0x63160000,
        CN_PICO  = 0x63120000,
        CN_FEMTO = 0x63110000,
        RANDOM_X = 0x72000000,
        ARGON2   = 0x61000000,
        KAWPOW   = 0x6b000000
    };

    inline Algorithm() = default;
    inline Algorithm(Id id) : m_id(id) {}
    Algorithm(uint32_t id) : m_id(parse(id)) {}

    // Bit-pattern decoders: l2/l3 extract log2-encoded cache sizes from the id.
    static inline constexpr bool isCN(Id id)        { return (id & 0xff000000) == CN_ANY; }
    static inline constexpr Id base(Id id)          { return isCN(id) ? static_cast<Id>(CN_0 | (id & 0xff00)) : INVALID; }
    static inline constexpr size_t l2(Id id)        { return family(id) == RANDOM_X ? (1U << ((id >> 8) & 0xff)) : 0U; }
    static inline constexpr size_t l3(Id id)        { return 1ULL << ((id >> 16) & 0xff); }
    static inline constexpr uint32_t family(Id id)  { return id & (isCN(id) ? 0xffff0000 : 0xff000000); }

    inline bool isCN() const                        { return isCN(m_id); }
    inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
    inline bool isValid() const                     { return m_id != INVALID && family() > UNKNOWN; }
    inline Id base() const                          { return base(m_id); }
    inline Id id() const                            { return m_id; }
    inline size_t l2() const                        { return l2(m_id); }
    inline size_t l3() const                        { return l3(m_id); }
    inline uint32_t family() const                  { return family(m_id); }
    inline uint32_t maxIntensity() const            { return isCN() ? 5 : 1; };

    inline bool operator!=(Algorithm::Id id) const        { return m_id != id; }
    inline bool operator!=(const Algorithm &other) const  { return !isEqual(other); }
    inline bool operator==(Algorithm::Id id) const        { return m_id == id; }
    inline bool operator==(const Algorithm &other) const  { return isEqual(other); }
    inline operator Algorithm::Id() const                 { return m_id; }

    // Validates/normalizes a raw 32-bit id (defined in Algorithm.cpp).
    static Id parse(uint32_t id);

private:
    Id m_id = INVALID;
};


} // namespace xmrig_cuda


#endif /* XMRIG_ALGORITHM_H */
--------------------------------------------------------------------------------
/src/3rdparty/cub/util_arch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Static architectural properties by SM version.
 */

#pragma once

#include "util_cpp_dialect.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
    !defined(CUB_USE_COOPERATIVE_GROUPS)
    #define CUB_USE_COOPERATIVE_GROUPS
#endif

/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
#ifndef CUB_PTX_ARCH
    #if defined(__NVCOMPILER_CUDA__)
        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
        // when compiling both host code and device code. Currently, only one
        // PTX version can be targeted.
        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
    #elif !defined(__CUDA_ARCH__)
        #define CUB_PTX_ARCH 0
    #else
        #define CUB_PTX_ARCH __CUDA_ARCH__
    #endif
#endif

#ifndef CUB_IS_DEVICE_CODE
    #if defined(__NVCOMPILER_CUDA__)
        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
        #define CUB_INCLUDE_DEVICE_CODE 1
        #define CUB_INCLUDE_HOST_CODE 1
    #elif CUB_PTX_ARCH > 0
        #define CUB_IS_DEVICE_CODE 1
        #define CUB_IS_HOST_CODE 0
        #define CUB_INCLUDE_DEVICE_CODE 1
        #define CUB_INCLUDE_HOST_CODE 0
    #else
        #define CUB_IS_DEVICE_CODE 0
        #define CUB_IS_HOST_CODE 1
        #define CUB_INCLUDE_DEVICE_CODE 0
        #define CUB_INCLUDE_HOST_CODE 1
    #endif
#endif

/// Maximum number of devices supported.
#ifndef CUB_MAX_DEVICES
    #define CUB_MAX_DEVICES 128
#endif

#if CUB_CPP_DIALECT >= 2011
    static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
#endif

/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
#ifndef CUB_RUNTIME_FUNCTION
    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
        #define CUB_RUNTIME_ENABLED
        #define CUB_RUNTIME_FUNCTION __host__ __device__
    #else
        #define CUB_RUNTIME_FUNCTION __host__
    #endif
#endif


/// Number of threads per warp
#ifndef CUB_LOG_WARP_THREADS
    #define CUB_LOG_WARP_THREADS(arch)                      \
        (5)
    #define CUB_WARP_THREADS(arch)                          \
        (1 << CUB_LOG_WARP_THREADS(arch))

    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
#endif


/// Number of smem banks
#ifndef CUB_LOG_SMEM_BANKS
    #define CUB_LOG_SMEM_BANKS(arch)                        \
        ((arch >= 200) ?                                    \
            (5) :                                           \
            (4))
    #define CUB_SMEM_BANKS(arch)                            \
        (1 << CUB_LOG_SMEM_BANKS(arch))

    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
#endif


/// Oversubscription factor
#ifndef CUB_SUBSCRIPTION_FACTOR
    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
        ((arch >= 300) ?                                    \
            (5) :                                           \
            ((arch >= 200) ?                                \
                (3) :                                       \
                (10)))
    #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
#endif


/// Prefer padding overhead vs X-way conflicts greater than this threshold
#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
        ((arch >= 300) ?                                    \
            (1) :                                           \
            (4))
    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
#endif


// Scales per-thread work so register usage stays bounded for large T:
// items-per-thread shrinks with sizeof(T); block size is capped so the tile
// fits in 48 KB, rounded up to a warp multiple.
template <
    int NOMINAL_4B_BLOCK_THREADS,
    int NOMINAL_4B_ITEMS_PER_THREAD,
    typename T>
struct RegBoundScaling
{
    enum {
        ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
    };
};


// Like RegBoundScaling, but for memory-bound kernels: items-per-thread is
// additionally capped at twice the nominal value.
template <
    int NOMINAL_4B_BLOCK_THREADS,
    int NOMINAL_4B_ITEMS_PER_THREAD,
    typename T>
struct MemBoundScaling
{
    enum {
        ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
        BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
    };
};




#endif  // Do not document

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
--------------------------------------------------------------------------------
/src/cuda_keccak.hpp:
--------------------------------------------------------------------------------
// Keccak-f[1600] round constants (iota step); device __constant__ memory when
// compiled by nvcc, plain const for host compilers.
#ifdef __CUDACC__
__constant__
#else
const
#endif
uint64_t keccakf_rndc[24] ={
    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
    0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

#if __CUDA_ARCH__ >= 350
__forceinline__
__device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) 19 | { 20 | uint2 result; 21 | if(offset >= 32) 22 | { 23 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 24 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 25 | } 26 | else 27 | { 28 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); 29 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); 30 | } 31 | return __double_as_longlong(__hiloint2double(result.y, result.x)); 32 | } 33 | #define rotl64_1(x, y) (cuda_rotl64((x), (y))) 34 | #else 35 | #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) 36 | #endif 37 | 38 | #define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) 39 | #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) 40 | 41 | __device__ __forceinline__ void cn_keccakf2(uint64_t *s) 42 | { 43 | uint8_t i; 44 | 45 | for(i = 0; i < 24; ++i) 46 | { 47 | uint64_t bc[5], tmpxor[5], tmp1, tmp2; 48 | 49 | tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; 50 | tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; 51 | tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; 52 | tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; 53 | tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; 54 | 55 | bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); 56 | bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); 57 | bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); 58 | bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); 59 | bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); 60 | 61 | tmp1 = s[1] ^ bc[0]; 62 | 63 | s[0] ^= bc[4]; 64 | s[1] = rotl64_2(s[6] ^ bc[0], 
12); 65 | s[6] = rotl64_1(s[9] ^ bc[3], 20); 66 | s[9] = rotl64_2(s[22] ^ bc[1], 29); 67 | s[22] = rotl64_2(s[14] ^ bc[3], 7); 68 | s[14] = rotl64_1(s[20] ^ bc[4], 18); 69 | s[20] = rotl64_2(s[2] ^ bc[1], 30); 70 | s[2] = rotl64_2(s[12] ^ bc[1], 11); 71 | s[12] = rotl64_1(s[13] ^ bc[2], 25); 72 | s[13] = rotl64_1(s[19] ^ bc[3], 8); 73 | s[19] = rotl64_2(s[23] ^ bc[2], 24); 74 | s[23] = rotl64_2(s[15] ^ bc[4], 9); 75 | s[15] = rotl64_1(s[4] ^ bc[3], 27); 76 | s[4] = rotl64_1(s[24] ^ bc[3], 14); 77 | s[24] = rotl64_1(s[21] ^ bc[0], 2); 78 | s[21] = rotl64_2(s[8] ^ bc[2], 23); 79 | s[8] = rotl64_2(s[16] ^ bc[0], 13); 80 | s[16] = rotl64_2(s[5] ^ bc[4], 4); 81 | s[5] = rotl64_1(s[3] ^ bc[2], 28); 82 | s[3] = rotl64_1(s[18] ^ bc[2], 21); 83 | s[18] = rotl64_1(s[17] ^ bc[1], 15); 84 | s[17] = rotl64_1(s[11] ^ bc[0], 10); 85 | s[11] = rotl64_1(s[7] ^ bc[1], 6); 86 | s[7] = rotl64_1(s[10] ^ bc[4], 3); 87 | s[10] = rotl64_1(tmp1, 1); 88 | 89 | tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); 90 | tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); 91 | tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); 92 | tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); 93 | tmp1 = s[20]; tmp2 
= s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); 94 | s[0] ^= keccakf_rndc[i]; 95 | } 96 | } 97 |
// Full Keccak-f[1600] permutation over the 25-word state `s`: 24 rounds (loop
// below), each applying the column-parity step (tmpxor/bc), a fused
// rotate-and-permute chain (the rotl64_1/rotl64_2 assignments), the bitselect
// rows, and the round-constant XOR with keccakf_rndc[i].
// NOTE(review): the rotate chain is strictly order-sensitive -- every
// assignment consumes the pre-rotation value of another lane, with tmp1
// carrying s[1] around the cycle; do not reorder these statements.
98 | __device__ __forceinline__ void cn_keccakf(uint64_t *s) 99 | { 100 | uint64_t bc[5], tmpxor[5], tmp1, tmp2; 101 | 102 | for(int i = 0; i < 24; ++i) 103 | { 104 | tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; 105 | tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; 106 | tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; 107 | tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; 108 | tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; 109 | 110 | bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); 111 | bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); 112 | bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); 113 | bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); 114 | bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); 115 | 116 | tmp1 = s[1] ^ bc[0]; 117 | 118 | s[0] ^= bc[4]; 119 | s[1] = rotl64_2(s[6] ^ bc[0], 12); 120 | s[6] = rotl64_1(s[9] ^ bc[3], 20); 121 | s[9] = rotl64_2(s[22] ^ bc[1], 29); 122 | s[22] = rotl64_2(s[14] ^ bc[3], 7); 123 | s[14] = rotl64_1(s[20] ^ bc[4], 18); 124 | s[20] = rotl64_2(s[2] ^ bc[1], 30); 125 | s[2] = rotl64_2(s[12] ^ bc[1], 11); 126 | s[12] = rotl64_1(s[13] ^ bc[2], 25); 127 | s[13] = rotl64_1(s[19] ^ bc[3], 8); 128 | s[19] = rotl64_2(s[23] ^ bc[2], 24); 129 | s[23] = rotl64_2(s[15] ^ bc[4], 9); 130 | s[15] = rotl64_1(s[4] ^ bc[3], 27); 131 | s[4] = rotl64_1(s[24] ^ bc[3], 14); 132 | s[24] = rotl64_1(s[21] ^ bc[0], 2); 133 | s[21] = rotl64_2(s[8] ^ bc[2], 23); 134 | s[8] = rotl64_2(s[16] ^ bc[0], 13); 135 | s[16] = rotl64_2(s[5] ^ bc[4], 4); 136 | s[5] = rotl64_1(s[3] ^ bc[2], 28); 137 | s[3] = rotl64_1(s[18] ^ bc[2], 21); 138 | s[18] = rotl64_1(s[17] ^ bc[1], 15); 139 | s[17] = rotl64_1(s[11] ^ bc[0], 10); 140 | s[11] = rotl64_1(s[7] ^ bc[1], 6); 141 | s[7] = rotl64_1(s[10] ^ bc[4], 3); 142 | s[10] = rotl64_1(tmp1, 1); 143 | 144 | tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); 145 | tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); 146 | tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); 147 | tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); 148 | tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); 149 | s[0] ^= keccakf_rndc[i]; 150 | } 151 | } 152 |
// Absorb-only Keccak hash used by CryptoNight: zero-initialises the 25-word
// state, XORs the input in 17-word (136-byte) rate blocks, permuting after
// each block, then writes the whole state out to `md`.
// NOTE(review): MEMCPY8(md, st, 25) presumably copies 25 eight-byte words
// (200 bytes) -- confirm MEMCPY8's semantics where it is defined; `md` must be
// at least that large. As the in-code comment below states, no padding is
// applied here: the caller must supply input already padded to a multiple of
// 136 bytes, otherwise st[j] reads past the final block.
153 | __device__ __forceinline__ void cn_keccak(const uint64_t * __restrict__ input, int inlen, uint8_t * __restrict__ md) 154 | { 155 | uint64_t st[25]; 156 | 157 | #pragma unroll 158 | for (int i = 0; i < 25; ++i) { 159 | st[i] = 0; 160 | } 161 | 162 | // Input length must be a multiple of 136 and padded on the host side 163 | for (int i = 0; inlen > 0; i += 17, inlen -= 136) { 164 | #pragma unroll 165 | for (int j = 0; j < 17; ++j) { 166 | st[j] ^= input[i + j]; 167 | } 168 | cn_keccakf(st); 169 | } 170 | 171 | MEMCPY8(md, st, 25); 172 | return; 173 | } 174 | 
-------------------------------------------------------------------------------- /src/CudaCryptonightR_gen.cpp: --------------------------------------------------------------------------------
// NOTE(review): this dump stripped all text inside angle brackets -- the bare
// `#include` lines below and every `std::vector`, `std::lock_guard`,
// `template` and `BackgroundTask` usage in this file lost their `<...>`
// arguments. Restore them from the upstream file before attempting to compile.
1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include "crypto/cn/CryptoNight_monero.h" 10 | #include "CudaCryptonightR_gen.h" 11 | #include "cuda_device.hpp" 12 | 13 |
// Renders one line of CUDA source text per CryptonightR random-math
// instruction (MUL/ADD/SUB/ROR/ROL/XOR) operating on registers r<dst>/r<src>;
// ADD additionally folds in the instruction's C constant as an unsigned literal.
14 | static std::string get_code(const V4_Instruction* code, int code_size) 15 | { 16 | std::stringstream s; 17 | 18 | for (int i = 0; i < code_size; ++i) 19 | { 20 | const V4_Instruction inst = code[i]; 21 | 22 | const uint32_t a = inst.dst_index; 23 | const uint32_t b = inst.src_index; 24 | 25 | switch (inst.opcode) 26 | { 27 | case MUL: 28 | s << 'r' << a << "*=r" << b << ';'; 29 | break; 30 | 31 | case ADD: 32 | s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; 33 | break; 34 | 35 | case SUB: 36 | s << 'r' << a << "-=r" << b << ';'; 37 | break; 38 | 39 | case ROR: 40 | s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; 41 | break; 42 | 43 | case ROL: 44 | s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; 45 | break; 46 | 47 | case XOR: 48 | s << 'r' << a << "^=r" << b << ';'; 49 | break; 50 | } 51 | 52 | s << '\n'; 53 | } 54 | 55 | return s.str(); 56 | } 57 |
// One compiled NVRTC program cached per (height, arch_major, arch_minor):
// stores the generated PTX plus the lowered (mangled) kernel name.
58 | struct CacheEntry 59 | { 60 | CacheEntry(uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : 61 | height(height), 62 | arch_major(arch_major), 63 | arch_minor(arch_minor), 64 | ptx(ptx), 65 | lowered_name(lowered_name) 66 | {} 67 | 68 | uint64_t height; 69 | int arch_major; 70 | int arch_minor; 71 | std::vector ptx; 72 | std::string lowered_name; 73 | }; 74 |
// Minimal type-erased task queue: BackgroundTask wraps an arbitrary callable
// behind the virtual exec() of BackgroundTaskBase so the worker thread can run
// heterogeneous jobs (used for background pre-compilation of future heights).
75 | struct BackgroundTaskBase 76 | { 77 | virtual ~BackgroundTaskBase() = default; 78 | virtual void exec() = 0; 79 | }; 80 | 81 | template 82 | struct BackgroundTask : public BackgroundTaskBase 83 | { 84 | BackgroundTask(T&& func) : m_func(std::move(func)) {} 85 | void exec() override { m_func(); } 86 | 87 | T m_func; 88 | }; 89 | 90 | static std::mutex CryptonightR_cache_mutex; 91 | static std::mutex CryptonightR_build_mutex; 92 | static std::vector CryptonightR_cache; 93 | 94 | static std::mutex background_tasks_mutex; 95 | static std::vector background_tasks; 96 | static std::thread* background_thread = nullptr; 97 |
// Worker loop: every 500 ms, swaps the pending queue out under the lock,
// then executes and deletes each task outside the lock.
// NOTE(review): this loops forever and `background_thread` is never joined or
// deleted -- presumably an intentional process-lifetime worker; confirm that
// teardown order at exit is acceptable.
98 | static void background_thread_proc() 99 | { 100 | std::vector tasks; 101 | for (;;) { 102 | tasks.clear(); 103 | { 104 | std::lock_guard g(background_tasks_mutex); 105 | background_tasks.swap(tasks); 106 | } 107 | 108 | for (BackgroundTaskBase* task : tasks) { 109 | task->exec(); 110 | delete task; 111 | } 112 | 113 | std::this_thread::sleep_for(std::chrono::milliseconds(500)); 114 | } 115 | } 116 |
// Enqueues `func` for the worker thread, lazily starting the thread on first
// use; both the queue push and the thread creation are serialized by
// background_tasks_mutex.
117 | template 118 | static void background_exec(T&& func) 119 | { 120 | BackgroundTaskBase* task = new BackgroundTask(std::move(func)); 121 | 122 | std::lock_guard g(background_tasks_mutex); 123 | background_tasks.push_back(task); 124 | if (!background_thread) { 125 | background_thread = new std::thread(background_thread_proc); 126 | } 127 | } 128 | 129 |
// Compiles `source` with NVRTC for compute_<arch_major><arch_minor>, returning
// the PTX and the lowered name of kernel "CryptonightR_phase2" via the output
// parameters and appending the result to the cache. Stale entries (more than 2
// blocks older than `height`) are evicted first via swap-with-back removal.
// Concurrency: CryptonightR_build_mutex serializes the (slow) NVRTC build while
// CryptonightR_cache_mutex guards the cache; after taking the build mutex the
// cache is re-checked in case another thread finished the same build first.
// On any NVRTC failure the program handle is destroyed and CUDA_THROW is
// invoked with the NVRTC error string (compile failures also print the log).
130 | static void CryptonightR_build_program( 131 | std::vector& ptx, 132 | std::string& lowered_name, 133 | uint64_t height, 134 | int arch_major, 135 | int arch_minor, 136 | std::string source) 137 | { 138 | { 139 | std::lock_guard g(CryptonightR_cache_mutex); 140 | 141 | // Remove old programs from cache 142 | for (size_t i = 0; i < CryptonightR_cache.size();) { 143 | const CacheEntry& entry = CryptonightR_cache[i]; 144 | if (entry.height + 2 < height) { 145 | CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); 146 | CryptonightR_cache.pop_back(); 147 | } 148 | else { 149 | ++i; 150 | } 151 | } 152 | } 153 | 154 | ptx.clear(); 155 | ptx.reserve(65536); 156 | 157 | std::lock_guard g1(CryptonightR_build_mutex); 158 | { 159 | std::lock_guard g(CryptonightR_cache_mutex); 160 | 161 | // Check if the cache already has this program (some other thread might have added it first) 162 | for (const CacheEntry& entry : CryptonightR_cache) 163 | { 164 | if ((entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) 165 | { 166 | ptx = entry.ptx; 167 | lowered_name = entry.lowered_name; 168 | return; 169 | } 170 | } 171 | } 172 | 173 | nvrtcProgram prog; 174 | nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.cu", 0, nullptr, nullptr); 175 | if (result != NVRTC_SUCCESS) { 176 | CUDA_THROW(nvrtcGetErrorString(result)); 177 | } 178 | 179 | result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); 180 | if (result != NVRTC_SUCCESS) { 181 | nvrtcDestroyProgram(&prog); 182 | 183 | CUDA_THROW(nvrtcGetErrorString(result)); 184 | } 185 | 186 | char opt0[64]; 187 | sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); 188 | 189 | const char* opts[2] = { opt0, "-DVARIANT=13" }; 190 | result = nvrtcCompileProgram(prog, 2, opts); 191 | if (result != NVRTC_SUCCESS) { 192 | size_t logSize; 193 | if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { 194 | char *log = new char[logSize](); 195 | if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { 196 | printf("Program compile log: %s\n", log); 197 | } 198 | 199 | delete[] log; 200 | } 201 | 202 | nvrtcDestroyProgram(&prog); 203 | 204 | CUDA_THROW(nvrtcGetErrorString(result)); 205 | } 206 | 207 | 208 | const char* name; 209 | result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); 210 | if (result != NVRTC_SUCCESS) { 211 | nvrtcDestroyProgram(&prog); 212 | 213 | CUDA_THROW(nvrtcGetErrorString(result)); 214 | } 215 | 216 | size_t ptxSize; 217 | result = nvrtcGetPTXSize(prog, &ptxSize); 218 | if (result != NVRTC_SUCCESS) { 219 | nvrtcDestroyProgram(&prog); 220 | 221 | CUDA_THROW(nvrtcGetErrorString(result)); 222 | } 223 | 224 | ptx.resize(ptxSize); 225 | result = nvrtcGetPTX(prog, ptx.data()); 226 | if (result != NVRTC_SUCCESS) { 227 | nvrtcDestroyProgram(&prog); 228 | 229 | CUDA_THROW(nvrtcGetErrorString(result)); 230 | } 231 | 232 | lowered_name = name; 233 | 234 | nvrtcDestroyProgram(&prog); 235 | 236 | { 237 | std::lock_guard g(CryptonightR_cache_mutex); 238 | CryptonightR_cache.emplace_back(height, arch_major, arch_minor, ptx, lowered_name); 239 | } 240 | }
241 |
// Public entry point (declared in CudaCryptonightR_gen.h). In background mode
// it just schedules a warm-up compile on the worker thread (results discarded;
// they land in the cache) and returns immediately. Otherwise it splices the
// generated random-math source into the CryptonightR.cu template at the
// XMRIG_INCLUDE_RANDOM_MATH marker, serves from cache when possible, and falls
// back to a fresh NVRTC build. If the marker is missing, it returns silently
// with `ptx` left empty -- NOTE(review): callers must handle that empty result.
242 | void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, uint64_t height, int arch_major, int arch_minor, bool background) 243 | { 244 | if (background) { 245 | background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, height, arch_major, arch_minor, false); }); 246 | return; 247 | } 248 | 249 | ptx.clear(); 250 | 251 | const char* source_code_template = 252 | #include "CryptonightR.cu" 253 | ; 254 | 255 | const char include_name[] = "XMRIG_INCLUDE_RANDOM_MATH"; 256 | const char *offset = strstr(source_code_template, include_name); 257 | if (!offset){ 258 | return; 259 | } 260 | 261 | V4_Instruction code[256]; 262 | const int code_size = v4_random_math_init(code, height); 263 | 264 | std::string source_code(source_code_template, offset); 265 | source_code.append(get_code(code, code_size)); 266 | source_code.append(offset + sizeof(include_name) - 1); 267 | 268 | { 269 | std::lock_guard g(CryptonightR_cache_mutex); 270 | 271 | // Check if the cache has this program 272 | for (const CacheEntry& entry : CryptonightR_cache) { 273 | if ((entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) { 274 | ptx = entry.ptx; 275 | lowered_name = entry.lowered_name; 276 | 277 | return; 278 | } 279 | } 280 | } 281 | 282 | CryptonightR_build_program(ptx, lowered_name, height, arch_major, arch_minor, source_code); 283 | } 284 | -------------------------------------------------------------------------------- /src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../config.cuh" 40 | #include "../thread/thread_load.cuh" 41 | #include "../thread/thread_store.cuh" 42 | #include "../util_device.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 102 | */
// NOTE(review): vendored NVIDIA CUB header -- keep byte-identical to upstream
// rather than editing locally. This dump has stripped `<...>` template
// arguments throughout the class (e.g. the constructor's `template` parameter
// list, the `ThreadLoad(...)` calls that upstream qualifies with <MODIFIER>,
// and the `typedef CacheModifiedInputIterator self_type` argument list);
// consult the upstream cub source when reconciling.
103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename OffsetT = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | public: 132 | 133 | /// Wrapped native pointer 134 | ValueType* ptr; 135 | 136 | /// Constructor 137 | template 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | QualifiedValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(const_cast::Type *>(ptr)) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection (device-only: the modified load has no host equivalent) 160 | __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 199 | { 200 | return ptr - other.ptr; 201 | } 202 | 203 | /// Array subscript 204 | template 205 | __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference
// NOTE(review): takes the address of ThreadLoad's return value -- this matches
// the upstream cub source, but verify the temporary's lifetime before relying
// on the returned pointer beyond the full expression.
211 | __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | --------------------------------------------------------------------------------