├── .gitattributes ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── RandomX_OpenCL.sln └── RandomX_OpenCL ├── CL ├── aes.cl ├── blake2b.cl ├── blake2b_double_block.cl ├── fillAes1Rx4.cl ├── randomx_constants.h ├── randomx_constants_jit.h ├── randomx_init.cl ├── randomx_run.cl └── randomx_vm.cl ├── GCNASM ├── randomx_run_gfx1010.asm ├── randomx_run_gfx803.asm └── randomx_run_gfx900.asm ├── RandomX_OpenCL.cpp ├── RandomX_OpenCL.vcxproj ├── RandomX_OpenCL.vcxproj.filters ├── RandomX_OpenCL.vcxproj.user ├── definitions.h ├── makefile ├── miner.cpp ├── miner.h ├── opencl_helpers.cpp ├── opencl_helpers.h ├── tests.cpp └── tests.h /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs 2 | x64 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "RandomX"] 2 | path = RandomX 3 | url = https://github.com/SChernykh/RandomX 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RandomX OpenCL implementation 2 | 3 | This repository contains a full RandomX OpenCL implementation (portable code for all GPUs and optimized code for AMD Vega GPUs). The latest version of RandomX (1.1.0 as of August 30th, 2019) is supported. 4 | 5 | Note: it's only a benchmark/testing tool, not an actual miner. RandomX hashrate is expected to improve somewhat in the future thanks to further optimizations. 
6 | 7 | GPUs tested so far: 8 | 9 | Model|CryptonightR H/S|RandomX H/S|Relative speed|Comment 10 | -----|---------------|-----------|---------------|------- 11 | AMD Radeon VII (stock)|3125|1500|48%|JIT compiled mode, 150W 12 | AMD Vega 64 (1700/1100 MHz)|2200|1225|55.7%|JIT compiled mode, 285W 13 | AMD Vega 64 (1100/800 MHz)|1023|845|82.6%|JIT compiled mode, 115W 14 | AMD Vega 64 (1700/1100 MHz)|2200|163|7.4%|VM interpreted mode 15 | AMD Vega FE (stock)|2150|980|45.6%|JIT compiled mode (intensity 4096) 16 | AMD Radeon RX 560 4GB (1400/2200 MHz)|495|260|52.5%|JIT compiled mode (intensity 896) 17 | AMD Radeon RX 470/570 4GB|930-950|400-410|43%|JIT compiled mode, 50W 18 | AMD Radeon RX 480/580 4GB|960-1000|470|47%|JIT compiled mode, 60W 19 | GeForce GTX 1080 Ti (2037/11800 MHz)|927|601|64.8%|VM interpreted mode 20 | 21 | ## Building on Windows 22 | 23 | - Install Visual Studio 2017 Community and [CLRadeonExtender](https://github.com/CLRX/CLRX-mirror/releases) 24 | - Add CLRadeonExtender's bin directory to PATH environment variable 25 | - Open .sln file in Visual Studio and build it 26 | 27 | ## Building on Ubuntu 28 | 29 | - Install prerequisites `sudo apt install git cmake build-essential` 30 | - If you want to try JIT compiled code for Vega or Polaris GPUs, install amdgpu-pro drivers with OpenCL enabled (run the install script like this `./amdgpu-pro-install --opencl=pal`) 31 | - Download [CLRadeonExtender](https://github.com/CLRX/CLRX-mirror/releases) and copy `clrxasm` to `/usr/local/bin` 32 | - Then run commands: 33 | ``` 34 | git clone --recursive https://github.com/SChernykh/RandomX_OpenCL 35 | cd RandomX_OpenCL/RandomX 36 | mkdir build && cd build 37 | cmake -DARCH=native .. 
38 | make 39 | cd ../../RandomX_OpenCL 40 | make 41 | ``` 42 | 43 | ## Donations 44 | 45 | If you'd like to support further development/optimization of RandomX miners (both CPU and AMD/NVIDIA), you're welcome to send any amount of XMR to the following address: 46 | 47 | ``` 48 | 44MnN1f3Eto8DZYUWuE5XZNUtE3vcRzt2j6PzqWpPau34e6Cf4fAxt6X2MBmrm6F9YMEiMNjN6W4Shn4pLcfNAja621jwyg 49 | ``` 50 | -------------------------------------------------------------------------------- /RandomX_OpenCL.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.645 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RandomX_OpenCL", "RandomX_OpenCL\RandomX_OpenCL.vcxproj", "{32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | {3346A4AD-C438-4324-8B77-47A16452954B} = {3346A4AD-C438-4324-8B77-47A16452954B} 9 | EndProjectSection 10 | EndProject 11 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "randomx", "RandomX\vcxproj\randomx.vcxproj", "{3346A4AD-C438-4324-8B77-47A16452954B}" 12 | EndProject 13 | Global 14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 15 | Debug|x64 = Debug|x64 16 | Debug|x86 = Debug|x86 17 | Release|x64 = Release|x64 18 | Release|x86 = Release|x86 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x64.ActiveCfg = Debug|x64 22 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x64.Build.0 = Debug|x64 23 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x86.ActiveCfg = Debug|x64 24 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x64.ActiveCfg = Release|x64 25 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x64.Build.0 = Release|x64 26 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x86.ActiveCfg = Release|x64 27 | 
{3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.ActiveCfg = Release|x64 28 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.Build.0 = Release|x64 29 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.ActiveCfg = Debug|Win32 30 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.Build.0 = Debug|Win32 31 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.ActiveCfg = Release|x64 32 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.Build.0 = Release|x64 33 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.ActiveCfg = Release|Win32 34 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.Build.0 = Release|Win32 35 | EndGlobalSection 36 | GlobalSection(SolutionProperties) = preSolution 37 | HideSolutionNode = FALSE 38 | EndGlobalSection 39 | GlobalSection(ExtensibilityGlobals) = postSolution 40 | SolutionGuid = {D7CE6C55-7FD7-4C3E-A52E-E3128C74A127} 41 | EndGlobalSection 42 | EndGlobal 43 | -------------------------------------------------------------------------------- /RandomX_OpenCL/CL/aes.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>. 
18 | */ 19 | 20 | __constant static const uint AES_TABLE[2048] = 21 | { 22 | 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, 23 | 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, 24 | 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, 25 | 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, 26 | 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, 27 | 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, 28 | 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, 29 | 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, 30 | 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, 31 | 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, 32 | 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, 33 | 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, 34 | 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, 35 | 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, 36 | 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, 37 | 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, 38 | 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, 39 | 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, 40 | 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, 41 | 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, 42 | 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, 43 | 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, 44 | 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, 45 | 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, 46 | 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, 47 | 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, 48 | 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, 49 | 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, 50 | 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, 51 | 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, 52 | 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, 53 | 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, 54 | 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, 55 | 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 
0x3917172eU, 56 | 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, 57 | 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, 58 | 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, 59 | 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, 60 | 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, 61 | 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, 62 | 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, 63 | 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, 64 | 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, 65 | 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, 66 | 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, 67 | 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, 68 | 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, 69 | 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, 70 | 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, 71 | 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, 72 | 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, 73 | 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, 74 | 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, 75 | 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, 76 | 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, 77 | 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, 78 | 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, 79 | 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, 80 | 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, 81 | 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, 82 | 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, 83 | 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, 84 | 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, 85 | 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, 86 | 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, 87 | 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, 88 | 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, 89 | 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, 90 | 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 
0x7d7dfa87U, 91 | 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, 92 | 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, 93 | 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, 94 | 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, 95 | 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, 96 | 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, 97 | 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, 98 | 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, 99 | 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, 100 | 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, 101 | 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, 102 | 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, 103 | 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, 104 | 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, 105 | 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, 106 | 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, 107 | 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, 108 | 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, 109 | 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, 110 | 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, 111 | 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, 112 | 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, 113 | 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, 114 | 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, 115 | 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, 116 | 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, 117 | 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, 118 | 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, 119 | 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, 120 | 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, 121 | 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, 122 | 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, 123 | 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, 124 | 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, 125 | 0xdedea779U, 
0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, 126 | 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, 127 | 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, 128 | 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, 129 | 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, 130 | 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, 131 | 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, 132 | 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, 133 | 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, 134 | 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, 135 | 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, 136 | 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, 137 | 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, 138 | 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, 139 | 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, 140 | 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, 141 | 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, 142 | 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, 143 | 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, 144 | 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, 145 | 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, 146 | 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, 147 | 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, 148 | 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, 149 | 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, 150 | 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, 151 | 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, 152 | 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, 153 | 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, 154 | 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, 155 | 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, 156 | 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, 157 | 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, 158 | 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, 159 | 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 
0xcc834fccU, 160 | 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, 161 | 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, 162 | 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, 163 | 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, 164 | 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, 165 | 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, 166 | 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, 167 | 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, 168 | 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, 169 | 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, 170 | 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, 171 | 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, 172 | 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, 173 | 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, 174 | 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, 175 | 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, 176 | 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, 177 | 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, 178 | 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, 179 | 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, 180 | 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, 181 | 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, 182 | 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, 183 | 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, 184 | 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, 185 | 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, 186 | 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, 187 | 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, 188 | 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, 189 | 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, 190 | 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, 191 | 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, 192 | 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, 193 | 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, 194 | 
0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, 195 | 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, 196 | 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, 197 | 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, 198 | 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, 199 | 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, 200 | 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, 201 | 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, 202 | 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, 203 | 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, 204 | 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, 205 | 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, 206 | 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, 207 | 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, 208 | 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, 209 | 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, 210 | 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, 211 | 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, 212 | 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, 213 | 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, 214 | 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, 215 | 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, 216 | 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, 217 | 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, 218 | 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, 219 | 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, 220 | 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, 221 | 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, 222 | 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, 223 | 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, 224 | 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, 225 | 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, 226 | 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, 227 | 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, 228 | 0x0e090707U, 0x24361212U, 
0x1b9b8080U, 0xdf3de2e2U, 229 | 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, 230 | 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, 231 | 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, 232 | 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, 233 | 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, 234 | 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, 235 | 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, 236 | 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, 237 | 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, 238 | 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, 239 | 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, 240 | 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, 241 | 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, 242 | 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, 243 | 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, 244 | 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, 245 | 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, 246 | 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, 247 | 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, 248 | 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, 249 | 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, 250 | 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, 251 | 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, 252 | 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, 253 | 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, 254 | 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, 255 | 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, 256 | 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, 257 | 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, 258 | 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, 259 | 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, 260 | 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, 261 | 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, 262 | 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, 
263 | 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, 264 | 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, 265 | 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, 266 | 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, 267 | 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, 268 | 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, 269 | 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, 270 | 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, 271 | 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, 272 | 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, 273 | 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, 274 | 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, 275 | 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, 276 | 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, 277 | 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U, 278 | 0x50a7f451U, 0x5365417eU, 0xc3a4171aU, 0x965e273aU, 279 | 0xcb6bab3bU, 0xf1459d1fU, 0xab58faacU, 0x9303e34bU, 280 | 0x55fa3020U, 0xf66d76adU, 0x9176cc88U, 0x254c02f5U, 281 | 0xfcd7e54fU, 0xd7cb2ac5U, 0x80443526U, 0x8fa362b5U, 282 | 0x495ab1deU, 0x671bba25U, 0x980eea45U, 0xe1c0fe5dU, 283 | 0x02752fc3U, 0x12f04c81U, 0xa397468dU, 0xc6f9d36bU, 284 | 0xe75f8f03U, 0x959c9215U, 0xeb7a6dbfU, 0xda595295U, 285 | 0x2d83bed4U, 0xd3217458U, 0x2969e049U, 0x44c8c98eU, 286 | 0x6a89c275U, 0x78798ef4U, 0x6b3e5899U, 0xdd71b927U, 287 | 0xb64fe1beU, 0x17ad88f0U, 0x66ac20c9U, 0xb43ace7dU, 288 | 0x184adf63U, 0x82311ae5U, 0x60335197U, 0x457f5362U, 289 | 0xe07764b1U, 0x84ae6bbbU, 0x1ca081feU, 0x942b08f9U, 290 | 0x58684870U, 0x19fd458fU, 0x876cde94U, 0xb7f87b52U, 291 | 0x23d373abU, 0xe2024b72U, 0x578f1fe3U, 0x2aab5566U, 292 | 0x0728ebb2U, 0x03c2b52fU, 0x9a7bc586U, 0xa50837d3U, 293 | 0xf2872830U, 0xb2a5bf23U, 0xba6a0302U, 0x5c8216edU, 294 | 0x2b1ccf8aU, 0x92b479a7U, 0xf0f207f3U, 0xa1e2694eU, 295 | 0xcdf4da65U, 0xd5be0506U, 0x1f6234d1U, 0x8afea6c4U, 296 | 0x9d532e34U, 0xa055f3a2U, 0x32e18a05U, 0x75ebf6a4U, 297 | 0x39ec830bU, 
0xaaef6040U, 0x069f715eU, 0x51106ebdU, 298 | 0xf98a213eU, 0x3d06dd96U, 0xae053eddU, 0x46bde64dU, 299 | 0xb58d5491U, 0x055dc471U, 0x6fd40604U, 0xff155060U, 300 | 0x24fb9819U, 0x97e9bdd6U, 0xcc434089U, 0x779ed967U, 301 | 0xbd42e8b0U, 0x888b8907U, 0x385b19e7U, 0xdbeec879U, 302 | 0x470a7ca1U, 0xe90f427cU, 0xc91e84f8U, 0x00000000U, 303 | 0x83868009U, 0x48ed2b32U, 0xac70111eU, 0x4e725a6cU, 304 | 0xfbff0efdU, 0x5638850fU, 0x1ed5ae3dU, 0x27392d36U, 305 | 0x64d90f0aU, 0x21a65c68U, 0xd1545b9bU, 0x3a2e3624U, 306 | 0xb1670a0cU, 0x0fe75793U, 0xd296eeb4U, 0x9e919b1bU, 307 | 0x4fc5c080U, 0xa220dc61U, 0x694b775aU, 0x161a121cU, 308 | 0x0aba93e2U, 0xe52aa0c0U, 0x43e0223cU, 0x1d171b12U, 309 | 0x0b0d090eU, 0xadc78bf2U, 0xb9a8b62dU, 0xc8a91e14U, 310 | 0x8519f157U, 0x4c0775afU, 0xbbdd99eeU, 0xfd607fa3U, 311 | 0x9f2601f7U, 0xbcf5725cU, 0xc53b6644U, 0x347efb5bU, 312 | 0x7629438bU, 0xdcc623cbU, 0x68fcedb6U, 0x63f1e4b8U, 313 | 0xcadc31d7U, 0x10856342U, 0x40229713U, 0x2011c684U, 314 | 0x7d244a85U, 0xf83dbbd2U, 0x1132f9aeU, 0x6da129c7U, 315 | 0x4b2f9e1dU, 0xf330b2dcU, 0xec52860dU, 0xd0e3c177U, 316 | 0x6c16b32bU, 0x99b970a9U, 0xfa489411U, 0x2264e947U, 317 | 0xc48cfca8U, 0x1a3ff0a0U, 0xd82c7d56U, 0xef903322U, 318 | 0xc74e4987U, 0xc1d138d9U, 0xfea2ca8cU, 0x360bd498U, 319 | 0xcf81f5a6U, 0x28de7aa5U, 0x268eb7daU, 0xa4bfad3fU, 320 | 0xe49d3a2cU, 0x0d927850U, 0x9bcc5f6aU, 0x62467e54U, 321 | 0xc2138df6U, 0xe8b8d890U, 0x5ef7392eU, 0xf5afc382U, 322 | 0xbe805d9fU, 0x7c93d069U, 0xa92dd56fU, 0xb31225cfU, 323 | 0x3b99acc8U, 0xa77d1810U, 0x6e639ce8U, 0x7bbb3bdbU, 324 | 0x097826cdU, 0xf418596eU, 0x01b79aecU, 0xa89a4f83U, 325 | 0x656e95e6U, 0x7ee6ffaaU, 0x08cfbc21U, 0xe6e815efU, 326 | 0xd99be7baU, 0xce366f4aU, 0xd4099feaU, 0xd67cb029U, 327 | 0xafb2a431U, 0x31233f2aU, 0x3094a5c6U, 0xc066a235U, 328 | 0x37bc4e74U, 0xa6ca82fcU, 0xb0d090e0U, 0x15d8a733U, 329 | 0x4a9804f1U, 0xf7daec41U, 0x0e50cd7fU, 0x2ff69117U, 330 | 0x8dd64d76U, 0x4db0ef43U, 0x544daaccU, 0xdf0496e4U, 331 | 0xe3b5d19eU, 0x1b886a4cU, 0xb81f2cc1U, 
0x7f516546U, 332 | 0x04ea5e9dU, 0x5d358c01U, 0x737487faU, 0x2e410bfbU, 333 | 0x5a1d67b3U, 0x52d2db92U, 0x335610e9U, 0x1347d66dU, 334 | 0x8c61d79aU, 0x7a0ca137U, 0x8e14f859U, 0x893c13ebU, 335 | 0xee27a9ceU, 0x35c961b7U, 0xede51ce1U, 0x3cb1477aU, 336 | 0x59dfd29cU, 0x3f73f255U, 0x79ce1418U, 0xbf37c773U, 337 | 0xeacdf753U, 0x5baafd5fU, 0x146f3ddfU, 0x86db4478U, 338 | 0x81f3afcaU, 0x3ec468b9U, 0x2c342438U, 0x5f40a3c2U, 339 | 0x72c31d16U, 0x0c25e2bcU, 0x8b493c28U, 0x41950dffU, 340 | 0x7101a839U, 0xdeb30c08U, 0x9ce4b4d8U, 0x90c15664U, 341 | 0x6184cb7bU, 0x70b632d5U, 0x745c6c48U, 0x4257b8d0U, 342 | 0xa7f45150U, 0x65417e53U, 0xa4171ac3U, 0x5e273a96U, 343 | 0x6bab3bcbU, 0x459d1ff1U, 0x58faacabU, 0x03e34b93U, 344 | 0xfa302055U, 0x6d76adf6U, 0x76cc8891U, 0x4c02f525U, 345 | 0xd7e54ffcU, 0xcb2ac5d7U, 0x44352680U, 0xa362b58fU, 346 | 0x5ab1de49U, 0x1bba2567U, 0x0eea4598U, 0xc0fe5de1U, 347 | 0x752fc302U, 0xf04c8112U, 0x97468da3U, 0xf9d36bc6U, 348 | 0x5f8f03e7U, 0x9c921595U, 0x7a6dbfebU, 0x595295daU, 349 | 0x83bed42dU, 0x217458d3U, 0x69e04929U, 0xc8c98e44U, 350 | 0x89c2756aU, 0x798ef478U, 0x3e58996bU, 0x71b927ddU, 351 | 0x4fe1beb6U, 0xad88f017U, 0xac20c966U, 0x3ace7db4U, 352 | 0x4adf6318U, 0x311ae582U, 0x33519760U, 0x7f536245U, 353 | 0x7764b1e0U, 0xae6bbb84U, 0xa081fe1cU, 0x2b08f994U, 354 | 0x68487058U, 0xfd458f19U, 0x6cde9487U, 0xf87b52b7U, 355 | 0xd373ab23U, 0x024b72e2U, 0x8f1fe357U, 0xab55662aU, 356 | 0x28ebb207U, 0xc2b52f03U, 0x7bc5869aU, 0x0837d3a5U, 357 | 0x872830f2U, 0xa5bf23b2U, 0x6a0302baU, 0x8216ed5cU, 358 | 0x1ccf8a2bU, 0xb479a792U, 0xf207f3f0U, 0xe2694ea1U, 359 | 0xf4da65cdU, 0xbe0506d5U, 0x6234d11fU, 0xfea6c48aU, 360 | 0x532e349dU, 0x55f3a2a0U, 0xe18a0532U, 0xebf6a475U, 361 | 0xec830b39U, 0xef6040aaU, 0x9f715e06U, 0x106ebd51U, 362 | 0x8a213ef9U, 0x06dd963dU, 0x053eddaeU, 0xbde64d46U, 363 | 0x8d5491b5U, 0x5dc47105U, 0xd406046fU, 0x155060ffU, 364 | 0xfb981924U, 0xe9bdd697U, 0x434089ccU, 0x9ed96777U, 365 | 0x42e8b0bdU, 0x8b890788U, 0x5b19e738U, 0xeec879dbU, 366 | 
0x0a7ca147U, 0x0f427ce9U, 0x1e84f8c9U, 0x00000000U, 367 | 0x86800983U, 0xed2b3248U, 0x70111eacU, 0x725a6c4eU, 368 | 0xff0efdfbU, 0x38850f56U, 0xd5ae3d1eU, 0x392d3627U, 369 | 0xd90f0a64U, 0xa65c6821U, 0x545b9bd1U, 0x2e36243aU, 370 | 0x670a0cb1U, 0xe757930fU, 0x96eeb4d2U, 0x919b1b9eU, 371 | 0xc5c0804fU, 0x20dc61a2U, 0x4b775a69U, 0x1a121c16U, 372 | 0xba93e20aU, 0x2aa0c0e5U, 0xe0223c43U, 0x171b121dU, 373 | 0x0d090e0bU, 0xc78bf2adU, 0xa8b62db9U, 0xa91e14c8U, 374 | 0x19f15785U, 0x0775af4cU, 0xdd99eebbU, 0x607fa3fdU, 375 | 0x2601f79fU, 0xf5725cbcU, 0x3b6644c5U, 0x7efb5b34U, 376 | 0x29438b76U, 0xc623cbdcU, 0xfcedb668U, 0xf1e4b863U, 377 | 0xdc31d7caU, 0x85634210U, 0x22971340U, 0x11c68420U, 378 | 0x244a857dU, 0x3dbbd2f8U, 0x32f9ae11U, 0xa129c76dU, 379 | 0x2f9e1d4bU, 0x30b2dcf3U, 0x52860decU, 0xe3c177d0U, 380 | 0x16b32b6cU, 0xb970a999U, 0x489411faU, 0x64e94722U, 381 | 0x8cfca8c4U, 0x3ff0a01aU, 0x2c7d56d8U, 0x903322efU, 382 | 0x4e4987c7U, 0xd138d9c1U, 0xa2ca8cfeU, 0x0bd49836U, 383 | 0x81f5a6cfU, 0xde7aa528U, 0x8eb7da26U, 0xbfad3fa4U, 384 | 0x9d3a2ce4U, 0x9278500dU, 0xcc5f6a9bU, 0x467e5462U, 385 | 0x138df6c2U, 0xb8d890e8U, 0xf7392e5eU, 0xafc382f5U, 386 | 0x805d9fbeU, 0x93d0697cU, 0x2dd56fa9U, 0x1225cfb3U, 387 | 0x99acc83bU, 0x7d1810a7U, 0x639ce86eU, 0xbb3bdb7bU, 388 | 0x7826cd09U, 0x18596ef4U, 0xb79aec01U, 0x9a4f83a8U, 389 | 0x6e95e665U, 0xe6ffaa7eU, 0xcfbc2108U, 0xe815efe6U, 390 | 0x9be7bad9U, 0x366f4aceU, 0x099fead4U, 0x7cb029d6U, 391 | 0xb2a431afU, 0x233f2a31U, 0x94a5c630U, 0x66a235c0U, 392 | 0xbc4e7437U, 0xca82fca6U, 0xd090e0b0U, 0xd8a73315U, 393 | 0x9804f14aU, 0xdaec41f7U, 0x50cd7f0eU, 0xf691172fU, 394 | 0xd64d768dU, 0xb0ef434dU, 0x4daacc54U, 0x0496e4dfU, 395 | 0xb5d19ee3U, 0x886a4c1bU, 0x1f2cc1b8U, 0x5165467fU, 396 | 0xea5e9d04U, 0x358c015dU, 0x7487fa73U, 0x410bfb2eU, 397 | 0x1d67b35aU, 0xd2db9252U, 0x5610e933U, 0x47d66d13U, 398 | 0x61d79a8cU, 0x0ca1377aU, 0x14f8598eU, 0x3c13eb89U, 399 | 0x27a9ceeeU, 0xc961b735U, 0xe51ce1edU, 0xb1477a3cU, 400 | 0xdfd29c59U, 0x73f2553fU, 
0xce141879U, 0x37c773bfU, 401 | 0xcdf753eaU, 0xaafd5f5bU, 0x6f3ddf14U, 0xdb447886U, 402 | 0xf3afca81U, 0xc468b93eU, 0x3424382cU, 0x40a3c25fU, 403 | 0xc31d1672U, 0x25e2bc0cU, 0x493c288bU, 0x950dff41U, 404 | 0x01a83971U, 0xb30c08deU, 0xe4b4d89cU, 0xc1566490U, 405 | 0x84cb7b61U, 0xb632d570U, 0x5c6c4874U, 0x57b8d042U, 406 | 0xf45150a7U, 0x417e5365U, 0x171ac3a4U, 0x273a965eU, 407 | 0xab3bcb6bU, 0x9d1ff145U, 0xfaacab58U, 0xe34b9303U, 408 | 0x302055faU, 0x76adf66dU, 0xcc889176U, 0x02f5254cU, 409 | 0xe54ffcd7U, 0x2ac5d7cbU, 0x35268044U, 0x62b58fa3U, 410 | 0xb1de495aU, 0xba25671bU, 0xea45980eU, 0xfe5de1c0U, 411 | 0x2fc30275U, 0x4c8112f0U, 0x468da397U, 0xd36bc6f9U, 412 | 0x8f03e75fU, 0x9215959cU, 0x6dbfeb7aU, 0x5295da59U, 413 | 0xbed42d83U, 0x7458d321U, 0xe0492969U, 0xc98e44c8U, 414 | 0xc2756a89U, 0x8ef47879U, 0x58996b3eU, 0xb927dd71U, 415 | 0xe1beb64fU, 0x88f017adU, 0x20c966acU, 0xce7db43aU, 416 | 0xdf63184aU, 0x1ae58231U, 0x51976033U, 0x5362457fU, 417 | 0x64b1e077U, 0x6bbb84aeU, 0x81fe1ca0U, 0x08f9942bU, 418 | 0x48705868U, 0x458f19fdU, 0xde94876cU, 0x7b52b7f8U, 419 | 0x73ab23d3U, 0x4b72e202U, 0x1fe3578fU, 0x55662aabU, 420 | 0xebb20728U, 0xb52f03c2U, 0xc5869a7bU, 0x37d3a508U, 421 | 0x2830f287U, 0xbf23b2a5U, 0x0302ba6aU, 0x16ed5c82U, 422 | 0xcf8a2b1cU, 0x79a792b4U, 0x07f3f0f2U, 0x694ea1e2U, 423 | 0xda65cdf4U, 0x0506d5beU, 0x34d11f62U, 0xa6c48afeU, 424 | 0x2e349d53U, 0xf3a2a055U, 0x8a0532e1U, 0xf6a475ebU, 425 | 0x830b39ecU, 0x6040aaefU, 0x715e069fU, 0x6ebd5110U, 426 | 0x213ef98aU, 0xdd963d06U, 0x3eddae05U, 0xe64d46bdU, 427 | 0x5491b58dU, 0xc471055dU, 0x06046fd4U, 0x5060ff15U, 428 | 0x981924fbU, 0xbdd697e9U, 0x4089cc43U, 0xd967779eU, 429 | 0xe8b0bd42U, 0x8907888bU, 0x19e7385bU, 0xc879dbeeU, 430 | 0x7ca1470aU, 0x427ce90fU, 0x84f8c91eU, 0x00000000U, 431 | 0x80098386U, 0x2b3248edU, 0x111eac70U, 0x5a6c4e72U, 432 | 0x0efdfbffU, 0x850f5638U, 0xae3d1ed5U, 0x2d362739U, 433 | 0x0f0a64d9U, 0x5c6821a6U, 0x5b9bd154U, 0x36243a2eU, 434 | 0x0a0cb167U, 0x57930fe7U, 0xeeb4d296U, 0x9b1b9e91U, 
435 | 0xc0804fc5U, 0xdc61a220U, 0x775a694bU, 0x121c161aU, 436 | 0x93e20abaU, 0xa0c0e52aU, 0x223c43e0U, 0x1b121d17U, 437 | 0x090e0b0dU, 0x8bf2adc7U, 0xb62db9a8U, 0x1e14c8a9U, 438 | 0xf1578519U, 0x75af4c07U, 0x99eebbddU, 0x7fa3fd60U, 439 | 0x01f79f26U, 0x725cbcf5U, 0x6644c53bU, 0xfb5b347eU, 440 | 0x438b7629U, 0x23cbdcc6U, 0xedb668fcU, 0xe4b863f1U, 441 | 0x31d7cadcU, 0x63421085U, 0x97134022U, 0xc6842011U, 442 | 0x4a857d24U, 0xbbd2f83dU, 0xf9ae1132U, 0x29c76da1U, 443 | 0x9e1d4b2fU, 0xb2dcf330U, 0x860dec52U, 0xc177d0e3U, 444 | 0xb32b6c16U, 0x70a999b9U, 0x9411fa48U, 0xe9472264U, 445 | 0xfca8c48cU, 0xf0a01a3fU, 0x7d56d82cU, 0x3322ef90U, 446 | 0x4987c74eU, 0x38d9c1d1U, 0xca8cfea2U, 0xd498360bU, 447 | 0xf5a6cf81U, 0x7aa528deU, 0xb7da268eU, 0xad3fa4bfU, 448 | 0x3a2ce49dU, 0x78500d92U, 0x5f6a9bccU, 0x7e546246U, 449 | 0x8df6c213U, 0xd890e8b8U, 0x392e5ef7U, 0xc382f5afU, 450 | 0x5d9fbe80U, 0xd0697c93U, 0xd56fa92dU, 0x25cfb312U, 451 | 0xacc83b99U, 0x1810a77dU, 0x9ce86e63U, 0x3bdb7bbbU, 452 | 0x26cd0978U, 0x596ef418U, 0x9aec01b7U, 0x4f83a89aU, 453 | 0x95e6656eU, 0xffaa7ee6U, 0xbc2108cfU, 0x15efe6e8U, 454 | 0xe7bad99bU, 0x6f4ace36U, 0x9fead409U, 0xb029d67cU, 455 | 0xa431afb2U, 0x3f2a3123U, 0xa5c63094U, 0xa235c066U, 456 | 0x4e7437bcU, 0x82fca6caU, 0x90e0b0d0U, 0xa73315d8U, 457 | 0x04f14a98U, 0xec41f7daU, 0xcd7f0e50U, 0x91172ff6U, 458 | 0x4d768dd6U, 0xef434db0U, 0xaacc544dU, 0x96e4df04U, 459 | 0xd19ee3b5U, 0x6a4c1b88U, 0x2cc1b81fU, 0x65467f51U, 460 | 0x5e9d04eaU, 0x8c015d35U, 0x87fa7374U, 0x0bfb2e41U, 461 | 0x67b35a1dU, 0xdb9252d2U, 0x10e93356U, 0xd66d1347U, 462 | 0xd79a8c61U, 0xa1377a0cU, 0xf8598e14U, 0x13eb893cU, 463 | 0xa9ceee27U, 0x61b735c9U, 0x1ce1ede5U, 0x477a3cb1U, 464 | 0xd29c59dfU, 0xf2553f73U, 0x141879ceU, 0xc773bf37U, 465 | 0xf753eacdU, 0xfd5f5baaU, 0x3ddf146fU, 0x447886dbU, 466 | 0xafca81f3U, 0x68b93ec4U, 0x24382c34U, 0xa3c25f40U, 467 | 0x1d1672c3U, 0xe2bc0c25U, 0x3c288b49U, 0x0dff4195U, 468 | 0xa8397101U, 0x0c08deb3U, 0xb4d89ce4U, 0x566490c1U, 469 | 0xcb7b6184U, 
0x32d570b6U, 0x6c48745cU, 0xb8d04257U, 470 | 0x5150a7f4U, 0x7e536541U, 0x1ac3a417U, 0x3a965e27U, 471 | 0x3bcb6babU, 0x1ff1459dU, 0xacab58faU, 0x4b9303e3U, 472 | 0x2055fa30U, 0xadf66d76U, 0x889176ccU, 0xf5254c02U, 473 | 0x4ffcd7e5U, 0xc5d7cb2aU, 0x26804435U, 0xb58fa362U, 474 | 0xde495ab1U, 0x25671bbaU, 0x45980eeaU, 0x5de1c0feU, 475 | 0xc302752fU, 0x8112f04cU, 0x8da39746U, 0x6bc6f9d3U, 476 | 0x03e75f8fU, 0x15959c92U, 0xbfeb7a6dU, 0x95da5952U, 477 | 0xd42d83beU, 0x58d32174U, 0x492969e0U, 0x8e44c8c9U, 478 | 0x756a89c2U, 0xf478798eU, 0x996b3e58U, 0x27dd71b9U, 479 | 0xbeb64fe1U, 0xf017ad88U, 0xc966ac20U, 0x7db43aceU, 480 | 0x63184adfU, 0xe582311aU, 0x97603351U, 0x62457f53U, 481 | 0xb1e07764U, 0xbb84ae6bU, 0xfe1ca081U, 0xf9942b08U, 482 | 0x70586848U, 0x8f19fd45U, 0x94876cdeU, 0x52b7f87bU, 483 | 0xab23d373U, 0x72e2024bU, 0xe3578f1fU, 0x662aab55U, 484 | 0xb20728ebU, 0x2f03c2b5U, 0x869a7bc5U, 0xd3a50837U, 485 | 0x30f28728U, 0x23b2a5bfU, 0x02ba6a03U, 0xed5c8216U, 486 | 0x8a2b1ccfU, 0xa792b479U, 0xf3f0f207U, 0x4ea1e269U, 487 | 0x65cdf4daU, 0x06d5be05U, 0xd11f6234U, 0xc48afea6U, 488 | 0x349d532eU, 0xa2a055f3U, 0x0532e18aU, 0xa475ebf6U, 489 | 0x0b39ec83U, 0x40aaef60U, 0x5e069f71U, 0xbd51106eU, 490 | 0x3ef98a21U, 0x963d06ddU, 0xddae053eU, 0x4d46bde6U, 491 | 0x91b58d54U, 0x71055dc4U, 0x046fd406U, 0x60ff1550U, 492 | 0x1924fb98U, 0xd697e9bdU, 0x89cc4340U, 0x67779ed9U, 493 | 0xb0bd42e8U, 0x07888b89U, 0xe7385b19U, 0x79dbeec8U, 494 | 0xa1470a7cU, 0x7ce90f42U, 0xf8c91e84U, 0x00000000U, 495 | 0x09838680U, 0x3248ed2bU, 0x1eac7011U, 0x6c4e725aU, 496 | 0xfdfbff0eU, 0x0f563885U, 0x3d1ed5aeU, 0x3627392dU, 497 | 0x0a64d90fU, 0x6821a65cU, 0x9bd1545bU, 0x243a2e36U, 498 | 0x0cb1670aU, 0x930fe757U, 0xb4d296eeU, 0x1b9e919bU, 499 | 0x804fc5c0U, 0x61a220dcU, 0x5a694b77U, 0x1c161a12U, 500 | 0xe20aba93U, 0xc0e52aa0U, 0x3c43e022U, 0x121d171bU, 501 | 0x0e0b0d09U, 0xf2adc78bU, 0x2db9a8b6U, 0x14c8a91eU, 502 | 0x578519f1U, 0xaf4c0775U, 0xeebbdd99U, 0xa3fd607fU, 503 | 0xf79f2601U, 0x5cbcf572U, 0x44c53b66U, 
0x5b347efbU, 504 | 0x8b762943U, 0xcbdcc623U, 0xb668fcedU, 0xb863f1e4U, 505 | 0xd7cadc31U, 0x42108563U, 0x13402297U, 0x842011c6U, 506 | 0x857d244aU, 0xd2f83dbbU, 0xae1132f9U, 0xc76da129U, 507 | 0x1d4b2f9eU, 0xdcf330b2U, 0x0dec5286U, 0x77d0e3c1U, 508 | 0x2b6c16b3U, 0xa999b970U, 0x11fa4894U, 0x472264e9U, 509 | 0xa8c48cfcU, 0xa01a3ff0U, 0x56d82c7dU, 0x22ef9033U, 510 | 0x87c74e49U, 0xd9c1d138U, 0x8cfea2caU, 0x98360bd4U, 511 | 0xa6cf81f5U, 0xa528de7aU, 0xda268eb7U, 0x3fa4bfadU, 512 | 0x2ce49d3aU, 0x500d9278U, 0x6a9bcc5fU, 0x5462467eU, 513 | 0xf6c2138dU, 0x90e8b8d8U, 0x2e5ef739U, 0x82f5afc3U, 514 | 0x9fbe805dU, 0x697c93d0U, 0x6fa92dd5U, 0xcfb31225U, 515 | 0xc83b99acU, 0x10a77d18U, 0xe86e639cU, 0xdb7bbb3bU, 516 | 0xcd097826U, 0x6ef41859U, 0xec01b79aU, 0x83a89a4fU, 517 | 0xe6656e95U, 0xaa7ee6ffU, 0x2108cfbcU, 0xefe6e815U, 518 | 0xbad99be7U, 0x4ace366fU, 0xead4099fU, 0x29d67cb0U, 519 | 0x31afb2a4U, 0x2a31233fU, 0xc63094a5U, 0x35c066a2U, 520 | 0x7437bc4eU, 0xfca6ca82U, 0xe0b0d090U, 0x3315d8a7U, 521 | 0xf14a9804U, 0x41f7daecU, 0x7f0e50cdU, 0x172ff691U, 522 | 0x768dd64dU, 0x434db0efU, 0xcc544daaU, 0xe4df0496U, 523 | 0x9ee3b5d1U, 0x4c1b886aU, 0xc1b81f2cU, 0x467f5165U, 524 | 0x9d04ea5eU, 0x015d358cU, 0xfa737487U, 0xfb2e410bU, 525 | 0xb35a1d67U, 0x9252d2dbU, 0xe9335610U, 0x6d1347d6U, 526 | 0x9a8c61d7U, 0x377a0ca1U, 0x598e14f8U, 0xeb893c13U, 527 | 0xceee27a9U, 0xb735c961U, 0xe1ede51cU, 0x7a3cb147U, 528 | 0x9c59dfd2U, 0x553f73f2U, 0x1879ce14U, 0x73bf37c7U, 529 | 0x53eacdf7U, 0x5f5baafdU, 0xdf146f3dU, 0x7886db44U, 530 | 0xca81f3afU, 0xb93ec468U, 0x382c3424U, 0xc25f40a3U, 531 | 0x1672c31dU, 0xbc0c25e2U, 0x288b493cU, 0xff41950dU, 532 | 0x397101a8U, 0x08deb30cU, 0xd89ce4b4U, 0x6490c156U, 533 | 0x7b6184cbU, 0xd570b632U, 0x48745c6cU, 0xd04257b8U, 534 | }; 535 | 536 | __constant static const uint AES_KEY_FILL[16] = { 537 | 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 538 | 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 539 | 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345, 540 | 
0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154,
};

// Initial 4 x 128-bit state used by hashAes1Rx4 (one row of four words per lane).
__constant static const uint AES_STATE_HASH[16] = {
	0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad,
	0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057,
	0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4,
	0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948,
};

// Extracts the 8-bit field of `a` that starts at bit `start_bit`.
uint get_byte(uint a, uint start_bit) { return (a >> start_bit) & 0xFF; }

#include "randomx_constants.h"

// Instantiate the fill kernel template: 1 round per 16 bytes, scratchpad-sized output.
#define fillAes_name fillAes1Rx4_scratchpad
#define outputSize RANDOMX_SCRATCHPAD_L3
#define outputSize0 (outputSize + 64)
#define unroll_factor 8
#define num_rounds 1
	#include "fillAes1Rx4.cl"
#undef num_rounds
#undef unroll_factor
#undef outputSize
#undef outputSize0
#undef fillAes_name

// Instantiate the fill kernel template: 4 rounds per 16 bytes, entropy-sized output.
#define fillAes_name fillAes4Rx4_entropy
#define outputSize ENTROPY_SIZE
#define outputSize0 outputSize
#define unroll_factor 2
#define num_rounds 4
	#include "fillAes1Rx4.cl"
#undef num_rounds
#undef unroll_factor
#undef outputSize
#undef outputSize0
#undef fillAes_name

#define inputSize RANDOMX_SCRATCHPAD_L3

// Folds `inputSize` bytes of input per hash into a 64-byte AES-based digest.
//
// Four consecutive work-items cooperate on one hash (idx = global id / 4); each one
// owns a 16-byte lane (sub = global id % 4) of the state and of every 64-byte input row.
//
//   input            - per-hash buffers laid out with a stride of (inputSize + 64) bytes
//   hash             - output buffer; the lane result is stored at
//                      hashOffsetBytes + idx * hashStrideBytes + sub * 16
//   hashOffsetBytes  - byte offset of the digest inside each output record
//   hashStrideBytes  - byte stride between output records
//   batch_size       - number of hashes; work-items with global id >= batch_size * 4 do no hashing
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void hashAes1Rx4(__global const void* input, __global void* hash, uint hashOffsetBytes, uint hashStrideBytes, uint batch_size)
{
	__local uint T[2048];

	const uint global_index = get_global_id(0);

	// Every work-item of the group helps to copy the lookup table into local memory...
	for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
		T[i] = AES_TABLE[i];

	// ...and every work-item must reach the barrier. The bounds check therefore comes
	// AFTER it: returning earlier would leave part of T unfilled for the remaining
	// work-items of a partially used group and would make the barrier call divergent.
	barrier(CLK_LOCAL_MEM_FENCE);

	if (global_index >= batch_size * 4)
		return;

	const uint idx = global_index / 4;
	const uint sub = global_index % 4;

	uint x[4] = { AES_STATE_HASH[sub * 4], AES_STATE_HASH[sub * 4 + 1], AES_STATE_HASH[sub * 4 + 2], AES_STATE_HASH[sub * 4 + 3] };

	// Even and odd lanes use mirrored byte positions and a different group of
	// four 256-entry sub-tables inside T.
	const uint s1 = ((sub & 1) == 0) ? 8 : 24;
	const uint s3 = ((sub & 1) == 0) ? 24 : 8;

	__global const uint4* p = ((__global uint4*) input) + idx * ((inputSize + 64) / sizeof(uint4)) + sub;

	__local const uint* const t0 = ((sub & 1) == 0) ? T : (T + 1024);
	__local const uint* const t1 = ((sub & 1) == 0) ? (T + 256) : (T + 1792);
	__local const uint* const t2 = ((sub & 1) == 0) ? (T + 512) : (T + 1536);
	__local const uint* const t3 = ((sub & 1) == 0) ? (T + 768) : (T + 1280);

	// One table-driven round per 64-byte input row; the lane's 16 input bytes act as the round key.
	#pragma unroll(8)
	for (uint i = 0; i < inputSize / sizeof(uint4); i += 4, p += 4)
	{
		uint k[4], y[4];
		*(uint4*)(k) = *p;
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[3];
		x[0] = y[0];
		x[1] = y[1];
		x[2] = y[2];
		x[3] = y[3];
	}

	uint y[4];

	// Two finalisation rounds with fixed round keys (identical for all four lanes).
	y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0xf6fa8389;
	y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0x8b24949f;
	y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0x90dc56bf;
	y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0x06890201;

	x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x61b263d1;
	x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0x51f4e03c;
	x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0xee1043c6;
	x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0xed18f99b;

	*((__global uint4*)(hash) + idx * (hashStrideBytes / sizeof(uint4)) + sub + (hashOffsetBytes / sizeof(uint4))) = *(uint4*)(x);
}
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/blake2b.cl:
--------------------------------------------------------------------------------
/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see .
*/

// Length in bytes of the block template hashed by the single-block kernels.
#define BLOCK_TEMPLATE_SIZE 76

// Message word permutation schedule, 12 rounds x 16 indices.
__constant static const uchar blake2b_sigma[12 * 16] = {
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3,
	11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4,
	 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8,
	 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13,
	 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9,
	12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11,
	13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10,
	 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5,
	10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3,
};

// The eight 64-bit initialisation words.
enum Blake2b_IV
{
	iv0 = 0x6a09e667f3bcc908ul,
	iv1 = 0xbb67ae8584caa73bul,
	iv2 = 0x3c6ef372fe94f82bul,
	iv3 = 0xa54ff53a5f1d36f1ul,
	iv4 = 0x510e527fade682d1ul,
	iv5 = 0x9b05688c2b3e6c1ful,
	iv6 = 0x1f83d9abfb41bd6bul,
	iv7 = 0x5be0cd19137e2179ul,
};

// Rotates `a` right by `shift` bits, expressed through the built-in left rotate.
ulong rotr64(ulong a, ulong shift)
{
	return rotate(a, 64 - shift);
}

// Mixing step. Reads the message words from an array named `m` that must be in scope.
#define G(r, i, a, b, c, d) \
	do { \
		a += b + m[blake2b_sigma[(r) * 16 + 2 * (i) + 0]]; \
		d = rotr64(d ^ a, 32); \
		c += d; \
		b = rotr64(b ^ c, 24); \
		a += b + m[blake2b_sigma[(r) * 16 + 2 * (i) + 1]]; \
		d = rotr64(d ^ a, 16); \
		c += d; \
		b = rotr64(b ^ c, 63); \
	} while (0)

// One full round: four column steps followed by four diagonal steps.
// Operates on a working vector named `v` that must be in scope.
#define ROUND(r) \
	do { \
		G(r, 0, v[0], v[4], v[ 8], v[12]); \
		G(r, 1, v[1], v[5], v[ 9], v[13]); \
		G(r, 2, v[2], v[6], v[10], v[14]); \
		G(r, 3, v[3], v[7], v[11], v[15]); \
		G(r, 4, v[0], v[5], v[10], v[15]); \
		G(r, 5, v[1], v[6], v[11], v[12]); \
		G(r, 6, v[2], v[7], v[ 8], v[13]); \
		G(r, 7, v[3], v[4], v[ 9], v[14]); \
	} while (0)

// All twelve rounds, applied to the `v` / `m` pair of the enclosing function.
#define BLAKE2B_ROUNDS() ROUND(0);ROUND(1);ROUND(2);ROUND(3);ROUND(4);ROUND(5);ROUND(6);ROUND(7);ROUND(8);ROUND(9);ROUND(10);ROUND(11);

// Word `i` of a BLOCK_TEMPLATE_SIZE-byte message read through `src`;
// words that start at or beyond the end of the template are zero and are never read.
#define BLAKE2B_TEMPLATE_WORD(src, i) ((BLOCK_TEMPLATE_SIZE > (i) * 8) ? (src)[i] : 0)

// Compresses one (final) message block of BLOCK_TEMPLATE_SIZE bytes and writes
// the eight resulting state words to `h`.
void blake2b_512_process_single_block(ulong *h, const ulong* m)
{
	// First state word: iv0 combined with the parameter word 0x01010040.
	const ulong h0 = iv0 ^ 0x01010040;

	ulong v[16] =
	{
		h0,  iv1, iv2, iv3, iv4,                       iv5, iv6,  iv7,
		iv0, iv1, iv2, iv3, iv4 ^ BLOCK_TEMPLATE_SIZE, iv5, ~iv6, iv7,
	};

	BLAKE2B_ROUNDS();

	// Feed-forward: fold both halves of the working vector into the initial state.
	h[0] = v[0] ^ v[ 8] ^ h0;
	h[1] = v[1] ^ v[ 9] ^ iv1;
	h[2] = v[2] ^ v[10] ^ iv2;
	h[3] = v[3] ^ v[11] ^ iv3;
	h[4] = v[4] ^ v[12] ^ iv4;
	h[5] = v[5] ^ v[13] ^ iv5;
	h[6] = v[6] ^ v[14] ^ iv6;
	h[7] = v[7] ^ v[15] ^ iv7;
}

// Hashes the block template once per work-item, each with its own nonce
// (start_nonce + global id), and stores the 64-byte digest at out + global id * 64.
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_initial_hash(__global void *out, __global const void* blockTemplate, uint start_nonce)
{
	const uint gid = get_global_id(0);
	__global const ulong* src = (__global const ulong*) blockTemplate;

	ulong m[16] = {
		BLAKE2B_TEMPLATE_WORD(src,  0), BLAKE2B_TEMPLATE_WORD(src,  1),
		BLAKE2B_TEMPLATE_WORD(src,  2), BLAKE2B_TEMPLATE_WORD(src,  3),
		BLAKE2B_TEMPLATE_WORD(src,  4), BLAKE2B_TEMPLATE_WORD(src,  5),
		BLAKE2B_TEMPLATE_WORD(src,  6), BLAKE2B_TEMPLATE_WORD(src,  7),
		BLAKE2B_TEMPLATE_WORD(src,  8), BLAKE2B_TEMPLATE_WORD(src,  9),
		BLAKE2B_TEMPLATE_WORD(src, 10), BLAKE2B_TEMPLATE_WORD(src, 11),
		BLAKE2B_TEMPLATE_WORD(src, 12), BLAKE2B_TEMPLATE_WORD(src, 13),
		BLAKE2B_TEMPLATE_WORD(src, 14), BLAKE2B_TEMPLATE_WORD(src, 15),
	};

	// Clear the bytes of the last, partially used word that lie past the template end.
	if (BLOCK_TEMPLATE_SIZE % sizeof(ulong))
	{
		const ulong tail_mask = (ulong)(-1) >> (64 - (BLOCK_TEMPLATE_SIZE % sizeof(ulong)) * 8);
		m[BLOCK_TEMPLATE_SIZE / sizeof(ulong)] &= tail_mask;
	}

	// Splice the 32-bit nonce into the message: its lowest byte replaces the top
	// byte of m[4], its upper three bytes replace the low three bytes of m[5].
	const ulong nonce = start_nonce + gid;
	const ulong keep_low_56 = (ulong)(-1) >> 8;
	const ulong keep_high_40 = (ulong)(-1) << 24;
	m[4] = (m[4] & keep_low_56) | (nonce << 56);
	m[5] = (m[5] & keep_high_40) | (nonce >> 8);

	ulong hash[8];
	blake2b_512_process_single_block(hash, m);

	__global ulong* dst = ((__global ulong*) out) + gid * 8;
	#pragma unroll(8)
	for (uint i = 0; i < 8; ++i)
		dst[i] = hash[i];
}

// Benchmark kernel: hashes a template whose first word is start_nonce + global id
// and writes that word to *out when the last 32-bit word of the digest is zero.
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_512_single_block_bench(__global ulong *out, __global const void* in, ulong start_nonce)
{
	const uint gid = get_global_id(0);
	__global const ulong* src = (__global const ulong*) in;

	ulong m[16] = {
		start_nonce + gid,              BLAKE2B_TEMPLATE_WORD(src,  1),
		BLAKE2B_TEMPLATE_WORD(src,  2), BLAKE2B_TEMPLATE_WORD(src,  3),
		BLAKE2B_TEMPLATE_WORD(src,  4), BLAKE2B_TEMPLATE_WORD(src,  5),
		BLAKE2B_TEMPLATE_WORD(src,  6), BLAKE2B_TEMPLATE_WORD(src,  7),
		BLAKE2B_TEMPLATE_WORD(src,  8), BLAKE2B_TEMPLATE_WORD(src,  9),
		BLAKE2B_TEMPLATE_WORD(src, 10), BLAKE2B_TEMPLATE_WORD(src, 11),
		BLAKE2B_TEMPLATE_WORD(src, 12), BLAKE2B_TEMPLATE_WORD(src, 13),
		BLAKE2B_TEMPLATE_WORD(src, 14), BLAKE2B_TEMPLATE_WORD(src, 15),
	};

	// Clear the bytes of the last, partially used word that lie past the template end.
	if (BLOCK_TEMPLATE_SIZE % sizeof(ulong))
	{
		const ulong tail_mask = (ulong)(-1) >> (64 - (BLOCK_TEMPLATE_SIZE % sizeof(ulong)) * 8);
		m[BLOCK_TEMPLATE_SIZE / sizeof(ulong)] &= tail_mask;
	}

	ulong hash[8];
	blake2b_512_process_single_block(hash, m);

	const uint last_word = ((uint*) hash)[15];
	if (last_word == 0)
		*out = m[0];
}

// Total input length in bytes for the two-block variants instantiated below.
#define in_len 256

// Two-block hash with a 32-byte digest.
#define out_len 32
#define blake2b_512_process_double_block_name blake2b_512_process_double_block_32
#define blake2b_hash_registers_name blake2b_hash_registers_32
#include "blake2b_double_block.cl"
#undef blake2b_hash_registers_name
#undef blake2b_512_process_double_block_name
#undef out_len

// Two-block hash with a 64-byte digest.
#define out_len 64
#define blake2b_512_process_double_block_name blake2b_512_process_double_block_64
#define blake2b_hash_registers_name blake2b_hash_registers_64
#include "blake2b_double_block.cl"
#undef blake2b_hash_registers_name
#undef blake2b_512_process_double_block_name
#undef out_len

// Benchmark kernel for the two-block variant: the first message word is replaced by
// start_nonce + global id, and that value is written to *out when the last 32-bit
// word of the digest is zero.
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_512_double_block_bench(__global ulong *out, __global const void* in, ulong start_nonce)
{
	const uint gid = get_global_id(0);
	__global const ulong* src = (__global const ulong*) in;

	const ulong candidate = start_nonce + gid;
	ulong m[16] = {
		candidate, src[ 1], src[ 2], src[ 3], src[ 4], src[ 5], src[ 6], src[ 7],
		src[ 8],   src[ 9], src[10], src[11], src[12], src[13], src[14], src[15],
	};

	ulong hash[8];
	blake2b_512_process_double_block_64(hash, m, src);

	const uint last_word = ((uint*) hash)[15];
	if (last_word == 0)
		*out = candidate;
}
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/blake2b_double_block.cl:
--------------------------------------------------------------------------------
/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.
RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see .
*/

// Template file: the including file must define `in_len`, `out_len`,
// `blake2b_512_process_double_block_name` and `blake2b_hash_registers_name`,
// and must provide iv0..iv7 and BLAKE2B_ROUNDS().

// Word `i` of the second 128-byte block of `in`; words that start at or beyond
// in_len are zero and are never read.
#define BLAKE2B_SECOND_BLOCK_WORD(i) ((in_len > 128 + (i) * 8) ? in[16 + (i)] : 0)

// Hashes an `in_len`-byte message that spans two 128-byte blocks.
//   out - receives the first ceil(out_len / 8) digest words
//   m   - on entry the first block (16 words); overwritten with the second block
//   in  - the whole message; only words 16..31 are read here
void blake2b_512_process_double_block_name(ulong *out, ulong* m, __global const ulong* in)
{
	// First state word: iv0 combined with the parameter word (0x01010000 | out_len).
	const ulong h0 = iv0 ^ (0x01010000u | out_len);

	ulong v[16] =
	{
		h0,  iv1, iv2, iv3, iv4,       iv5, iv6, iv7,
		iv0, iv1, iv2, iv3, iv4 ^ 128, iv5, iv6, iv7,
	};

	// ---- first block (byte counter = 128) ----
	BLAKE2B_ROUNDS();

	// Chaining value after the first block; it also seeds the lower half of v.
	ulong h[8];
	h[0] = v[0] ^ v[ 8] ^ h0;
	h[1] = v[1] ^ v[ 9] ^ iv1;
	h[2] = v[2] ^ v[10] ^ iv2;
	h[3] = v[3] ^ v[11] ^ iv3;
	h[4] = v[4] ^ v[12] ^ iv4;
	h[5] = v[5] ^ v[13] ^ iv5;
	h[6] = v[6] ^ v[14] ^ iv6;
	h[7] = v[7] ^ v[15] ^ iv7;

	#pragma unroll(8)
	for (uint i = 0; i < 8; ++i)
		v[i] = h[i];

	// Upper half for the second block: byte counter = in_len, iv6 word inverted.
	v[ 8] = iv0;
	v[ 9] = iv1;
	v[10] = iv2;
	v[11] = iv3;
	v[12] = iv4 ^ in_len;
	v[13] = iv5;
	v[14] = ~iv6;
	v[15] = iv7;

	// ---- second block ----
	m[ 0] = BLAKE2B_SECOND_BLOCK_WORD( 0);
	m[ 1] = BLAKE2B_SECOND_BLOCK_WORD( 1);
	m[ 2] = BLAKE2B_SECOND_BLOCK_WORD( 2);
	m[ 3] = BLAKE2B_SECOND_BLOCK_WORD( 3);
	m[ 4] = BLAKE2B_SECOND_BLOCK_WORD( 4);
	m[ 5] = BLAKE2B_SECOND_BLOCK_WORD( 5);
	m[ 6] = BLAKE2B_SECOND_BLOCK_WORD( 6);
	m[ 7] = BLAKE2B_SECOND_BLOCK_WORD( 7);
	m[ 8] = BLAKE2B_SECOND_BLOCK_WORD( 8);
	m[ 9] = BLAKE2B_SECOND_BLOCK_WORD( 9);
	m[10] = BLAKE2B_SECOND_BLOCK_WORD(10);
	m[11] = BLAKE2B_SECOND_BLOCK_WORD(11);
	m[12] = BLAKE2B_SECOND_BLOCK_WORD(12);
	m[13] = BLAKE2B_SECOND_BLOCK_WORD(13);
	m[14] = BLAKE2B_SECOND_BLOCK_WORD(14);
	m[15] = BLAKE2B_SECOND_BLOCK_WORD(15);

	// Clear the bytes of the last, partially used word that lie past in_len.
	if (in_len % sizeof(ulong))
	{
		const ulong tail_mask = (ulong)(-1) >> (64 - (in_len % sizeof(ulong)) * 8);
		m[(in_len - 128) / sizeof(ulong)] &= tail_mask;
	}

	BLAKE2B_ROUNDS();

	// Emit only the digest words that out_len asks for.
	#pragma unroll(8)
	for (uint i = 0; i < 8; ++i)
	{
		if (out_len > i * 8)
			out[i] = h[i] ^ v[i] ^ v[i + 8];
	}
}

#undef BLAKE2B_SECOND_BLOCK_WORD

// One hash per work-item: reads the message at in + global id * inStrideBytes and
// writes out_len bytes of digest at out + global id * out_len.
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_hash_registers_name(__global void *out, __global const void* in, uint inStrideBytes)
{
	const uint gid = get_global_id(0);
	__global const ulong* src = ((__global const ulong*) in) + gid * (inStrideBytes / sizeof(ulong));
	__global ulong* dst = ((__global ulong*) out) + gid * (out_len / sizeof(ulong));

	// The first block is loaded here; the callee fetches the second one itself.
	ulong m[16] = {
		src[ 0], src[ 1], src[ 2], src[ 3], src[ 4], src[ 5], src[ 6], src[ 7],
		src[ 8], src[ 9], src[10], src[11], src[12], src[13], src[14], src[15],
	};

	ulong hash[8];
	blake2b_512_process_double_block_name(hash, m, src);

	#pragma unroll(8)
	for (uint i = 0; i < 8; ++i)
	{
		if (out_len > i * 8)
			dst[i] = hash[i];
	}
}
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/fillAes1Rx4.cl:
--------------------------------------------------------------------------------
/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.
RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see .
*/

// Template kernel: the including file must define `fillAes_name`, `outputSize`,
// `outputSize0`, `unroll_factor` and `num_rounds`, and must provide AES_TABLE,
// AES_KEY_FILL and get_byte().
//
// Expands a 64-byte generator state into `outputSize` bytes of output.
// Four consecutive work-items cooperate on one state (idx = global id / 4); each one
// owns a 16-byte lane (sub = global id % 4) of the state and of every 64-byte output row.
//
//   state      - 64 bytes per hash; the final lane state is written back on exit
//   out        - per-hash output buffers laid out with a stride of outputSize0 bytes
//   batch_size - number of hashes; work-items with global id >= batch_size * 4 produce nothing
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void fillAes_name(__global void* state, __global void* out, uint batch_size)
{
	__local uint T[2048];

	const uint global_index = get_global_id(0);

	// Every work-item of the group helps to copy the lookup table into local memory...
	for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
		T[i] = AES_TABLE[i];

	// ...and every work-item must reach the barrier. The bounds check therefore comes
	// AFTER it: returning earlier would leave part of T unfilled for the remaining
	// work-items of a partially used group and would make the barrier call divergent.
	barrier(CLK_LOCAL_MEM_FENCE);

	if (global_index >= batch_size * 4)
		return;

	const uint idx = global_index / 4;
	const uint sub = global_index % 4;

#if num_rounds != 4
	// Single-round variant: one 128-bit key per lane.
	const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
#else
	// Four-round variant: lanes 0-1 share one set of four 128-bit keys, lanes 2-3 the other.
	const bool b = (sub < 2);
	uint k[16];
	k[ 0] = b ? 0x6421aaddu : 0xb5826f73u;
	k[ 1] = b ? 0xd1833ddbu : 0xe3d6a7a6u;
	k[ 2] = b ? 0x2f546d2bu : 0x3d518b6du;
	k[ 3] = b ? 0x99e5d23fu : 0x229effb4u;
	k[ 4] = b ? 0xb20e3450u : 0xc7566bf3u;
	k[ 5] = b ? 0xb6913f55u : 0x9c10b3d9u;
	k[ 6] = b ? 0x06f79d53u : 0xe9024d4eu;
	k[ 7] = b ? 0xa5dfcde5u : 0xb272b7d2u;
	k[ 8] = b ? 0x5c3ed904u : 0xf273c9e7u;
	k[ 9] = b ? 0x515e7bafu : 0xf765a38bu;
	k[10] = b ? 0x0aa4679fu : 0x2ba9660au;
	k[11] = b ? 0x171c02bfu : 0xf63befa7u;
	k[12] = b ? 0x85623763u : 0x7a7cd609u;
	k[13] = b ? 0xe78f5d08u : 0x915839deu;
	k[14] = b ? 0xcd673785u : 0x0c06d1fdu;
	k[15] = b ? 0xd8ded291u : 0xc0b0762du;
#endif

	__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
	uint x[4] = { s[0], s[1], s[2], s[3] };

	// Odd and even lanes use mirrored byte positions and a different group of
	// four 256-entry sub-tables inside T (note: the parity test is the opposite
	// of the one in hashAes1Rx4).
	const uint s1 = (sub & 1) ? 8 : 24;
	const uint s3 = (sub & 1) ? 24 : 8;

	__global uint4* p = ((__global uint4*) out) + idx * (outputSize0 / sizeof(uint4)) + sub;

	const __local uint* const t0 = (sub & 1) ? T : (T + 1024);
	const __local uint* const t1 = (sub & 1) ? (T + 256) : (T + 1792);
	const __local uint* const t2 = (sub & 1) ? (T + 512) : (T + 1536);
	const __local uint* const t3 = (sub & 1) ? (T + 768) : (T + 1280);

	// Each iteration produces one 64-byte output row (16 bytes per lane).
	#pragma unroll(unroll_factor)
	for (uint i = 0; i < outputSize / sizeof(uint4); i += 4, p += 4)
	{
		uint y[4];

#if num_rounds != 4
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[3];

		*p = *(uint4*)(y);

		x[0] = y[0];
		x[1] = y[1];
		x[2] = y[2];
		x[3] = y[3];
#else
		// Round 1: x -> y
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[ 2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[ 3];

		// Round 2: y -> x
		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[ 4];
		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[ 5];
		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[ 6];
		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[ 7];

		// Round 3: x -> y
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 8];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 9];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[10];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[11];

		// Round 4: y -> x
		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[12];
		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[13];
		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[14];
		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[15];

		*p = *(uint4*)(x);
#endif
	}

	// Persist the lane state so that a later call continues the stream.
	*(__global uint4*)(s) = *(uint4*)(x);
}
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/randomx_constants.h:
--------------------------------------------------------------------------------
/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #pragma once 21 | 22 | //Dataset base size in bytes. Must be a power of 2. 23 | #define RANDOMX_DATASET_BASE_SIZE 2147483648 24 | 25 | //Dataset extra size. Must be divisible by 64. 26 | #define RANDOMX_DATASET_EXTRA_SIZE 33554368 27 | 28 | //Scratchpad L3 size in bytes. Must be a power of 2. 29 | #define RANDOMX_SCRATCHPAD_L3 2097152 30 | 31 | //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. 32 | #define RANDOMX_SCRATCHPAD_L2 262144 33 | 34 | //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. 35 | #define RANDOMX_SCRATCHPAD_L1 16384 36 | 37 | //Jump condition mask size in bits. 38 | #define RANDOMX_JUMP_BITS 8 39 | 40 | //Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
41 | #define RANDOMX_JUMP_OFFSET 8 42 | 43 | //Integer instructions 44 | #define RANDOMX_FREQ_IADD_RS 16 45 | #define RANDOMX_FREQ_IADD_M 7 46 | #define RANDOMX_FREQ_ISUB_R 16 47 | #define RANDOMX_FREQ_ISUB_M 7 48 | #define RANDOMX_FREQ_IMUL_R 16 49 | #define RANDOMX_FREQ_IMUL_M 4 50 | #define RANDOMX_FREQ_IMULH_R 4 51 | #define RANDOMX_FREQ_IMULH_M 1 52 | #define RANDOMX_FREQ_ISMULH_R 4 53 | #define RANDOMX_FREQ_ISMULH_M 1 54 | #define RANDOMX_FREQ_IMUL_RCP 8 55 | #define RANDOMX_FREQ_INEG_R 2 56 | #define RANDOMX_FREQ_IXOR_R 15 57 | #define RANDOMX_FREQ_IXOR_M 5 58 | #define RANDOMX_FREQ_IROR_R 8 59 | #define RANDOMX_FREQ_IROL_R 2 60 | #define RANDOMX_FREQ_ISWAP_R 4 61 | 62 | //Floating point instructions 63 | #define RANDOMX_FREQ_FSWAP_R 4 64 | #define RANDOMX_FREQ_FADD_R 16 65 | #define RANDOMX_FREQ_FADD_M 5 66 | #define RANDOMX_FREQ_FSUB_R 16 67 | #define RANDOMX_FREQ_FSUB_M 5 68 | #define RANDOMX_FREQ_FSCAL_R 6 69 | #define RANDOMX_FREQ_FMUL_R 32 70 | #define RANDOMX_FREQ_FDIV_M 4 71 | #define RANDOMX_FREQ_FSQRT_R 6 72 | 73 | //Control instructions 74 | #define RANDOMX_FREQ_CBRANCH 25 75 | #define RANDOMX_FREQ_CFROUND 1 76 | 77 | //Store instruction 78 | #define RANDOMX_FREQ_ISTORE 16 79 | 80 | //No-op instruction 81 | #define RANDOMX_FREQ_NOP 0 82 | 83 | #define RANDOMX_DATASET_ITEM_SIZE 64 84 | 85 | #define RANDOMX_PROGRAM_SIZE 256 86 | 87 | #define HASH_SIZE 64 88 | #define ENTROPY_SIZE (128 + RANDOMX_PROGRAM_SIZE * 8) 89 | #define REGISTERS_SIZE 256 90 | #define IMM_BUF_SIZE (RANDOMX_PROGRAM_SIZE * 4 - REGISTERS_SIZE) 91 | #define IMM_INDEX_COUNT ((IMM_BUF_SIZE / 4) - 2) 92 | #define VM_STATE_SIZE (REGISTERS_SIZE + IMM_BUF_SIZE + RANDOMX_PROGRAM_SIZE * 4) 93 | #define ROUNDING_MODE (RANDOMX_FREQ_CFROUND ? 
-1 : 0) 94 | 95 | // Scratchpad L1/L2/L3 bits 96 | #define LOC_L1 (32 - 14) 97 | #define LOC_L2 (32 - 18) 98 | #define LOC_L3 (32 - 21) 99 | -------------------------------------------------------------------------------- /RandomX_OpenCL/CL/randomx_constants_jit.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #pragma once 21 | 22 | #define INITIAL_HASH_SIZE 64 23 | #define INTERMEDIATE_PROGRAM_SIZE (RANDOMX_PROGRAM_SIZE * 16) 24 | #define COMPILED_PROGRAM_SIZE 10048 25 | #define NUM_VGPR_REGISTERS 128 26 | -------------------------------------------------------------------------------- /RandomX_OpenCL/CL/randomx_run.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | Portions Copyright (c) 2018-2019 tevador 4 | 5 | This file is part of RandomX OpenCL. 6 | 7 | RandomX OpenCL is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | RandomX OpenCL is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with RandomX OpenCL. If not, see . 19 | */ 20 | 21 | #include "randomx_constants.h" 22 | #include "randomx_constants_jit.h" 23 | 24 | #define REGISTERS_COUNT (REGISTERS_SIZE / 8) 25 | // CacheLineAlignMask keeps bits 6..30 (64-byte aligned, below 2^31); it is fully parenthesized so it expands safely inside larger expressions 26 | #define CacheLineSize 64U 27 | #define CacheLineAlignMask (((1U << 31) - 1) & ~(CacheLineSize - 1)) 28 | 29 | #define mantissaSize 52 30 | #define dynamicExponentBits 4 31 | #define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1) 32 | // Converts a signed 32-bit value to double, then applies andMask and orMask to the raw 64-bit pattern 33 | double load_F_E_groups(int value, ulong andMask, ulong orMask) 34 | { 35 | ulong x = as_ulong(convert_double_rte(value)); 36 | x &= andMask; 37 | x |= orMask; 38 | return as_double(x); 39 | } 40 | 41 | // This kernel is only used to dump binary and disassemble it into randomx_run.asm 42 | __attribute__((reqd_work_group_size(32, 1, 1))) 43 | __kernel void randomx_run(__global const uchar* dataset, __global uchar* scratchpad, __global ulong* registers, __global uint* rounding_modes, __global uint* programs, uint batch_size, uint rx_parameters) 44 | { 45 | __local ulong2 R_buf[REGISTERS_COUNT / 2]; 46 | 47 | const uint idx = get_group_id(0); 48 | const uint sub = get_local_id(0); 49 | // rx_parameters packs log2 values: bits 10..14 = log2(scratchpad L3 size), bits 15 and up = log2(program iterations) 50 | const uint program_iterations = 1U << (rx_parameters >> 15); 51 | const uint ScratchpadL3Size = 1U << ((rx_parameters >> 10) & 31); 52 | const uint ScratchpadL3Mask64 = ScratchpadL3Size - 64; 53 | 54 | __local ulong* R = (__local ulong*)(R_buf); 55 | 56 | __local double* F = (__local double*)(R + 8); 57 | __local double* E = (__local double*)(R + 16); 58 | 59 | registers += idx * REGISTERS_COUNT; 60 | scratchpad += idx * (ulong)(ScratchpadL3Size + 64); 61 | rounding_modes += idx; 62 | programs += 
get_group_id(0) * (COMPILED_PROGRAM_SIZE / sizeof(uint)); 63 | 64 | // Copy registers (256 bytes) into shared memory: 32 workers, 8 bytes for each worker 65 | ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub]; 66 | barrier(CLK_LOCAL_MEM_FENCE); 67 | // Only work-items 0..7 continue past this point 68 | if (sub >= 8) 69 | return; 70 | 71 | uint mx = ((__local uint*)(R + 16))[1]; 72 | uint ma = ((__local uint*)(R + 16))[0]; 73 | 74 | const uint readReg0 = ((__local uint*)(R + 17))[0]; 75 | const uint readReg1 = ((__local uint*)(R + 17))[1]; 76 | const uint readReg2 = ((__local uint*)(R + 17))[2]; 77 | const uint readReg3 = ((__local uint*)(R + 17))[3]; 78 | 79 | const uint datasetOffset = ((__local uint*)(R + 19))[0]; 80 | dataset += datasetOffset; 81 | 82 | uint spAddr0 = mx; 83 | uint spAddr1 = ma; 84 | // Work-items 0..3 fill two doubles each in the F group, work-items 4..7 do the same in the E group 85 | const bool f_group = (sub < 4); 86 | __local double* fe = f_group ? (F + sub * 2) : (E + (sub - 4) * 2); 87 | 88 | const ulong andMask = f_group ? (ulong)(-1) : dynamicMantissaMask; 89 | const ulong orMask1 = f_group ? 0 : R[20]; 90 | const ulong orMask2 = f_group ? 
0 : R[21]; 91 | 92 | #pragma unroll(1) 93 | for (uint ic = 0; ic < program_iterations; ++ic) 94 | { 95 | const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]); 96 | spAddr0 ^= spMix.x; 97 | spAddr0 &= ScratchpadL3Mask64; 98 | spAddr1 ^= spMix.y; 99 | spAddr1 &= ScratchpadL3Mask64; 100 | 101 | __global ulong* p0 = (__global ulong*)(scratchpad + (spAddr0 + sub * 8)); 102 | __global ulong* p1 = (__global ulong*)(scratchpad + (spAddr1 + sub * 8)); 103 | 104 | R[sub] ^= *p0; 105 | 106 | const int2 q = as_int2(*p1); 107 | fe[0] = load_F_E_groups(q.x, andMask, orMask1); 108 | fe[1] = load_F_E_groups(q.y, andMask, orMask2); 109 | 110 | barrier(CLK_LOCAL_MEM_FENCE); 111 | 112 | // TODO: 113 | // 114 | // 1) Compile with atomic_inc uncommented 115 | // 2) clrxdisasm -C randomx.bin > randomx.asm 116 | // 3) Replace GLOBAL_ATOMIC_ADD in randomx.asm with a call to JIT code (S_SWAPPC_B64 to call, S_SETPC_B64 to return) 117 | // 4) clrxasm randomx.asm -o randomx.bin 118 | // 5) ??? 119 | // 6) PROFIT!!! 120 | // Placeholder instruction that step 3 of the TODO above replaces with the JIT call 121 | atomic_inc(programs); 122 | 123 | mx ^= R[readReg2] ^ R[readReg3]; 124 | mx &= CacheLineAlignMask; 125 | 126 | const ulong data = *(__global const ulong*)(dataset + ma + sub * 8); 127 | 128 | const ulong next_r = R[sub] ^ data; 129 | R[sub] = next_r; 130 | 131 | *p1 = next_r; 132 | *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]); 133 | 134 | uint tmp = ma; 135 | ma = mx; 136 | mx = tmp; 137 | 138 | spAddr0 = 0; 139 | spAddr1 = 0; 140 | } 141 | // Write back R[sub], F[sub] ^ E[sub] and E[sub] for this work-item 142 | registers[sub] = R[sub]; 143 | registers[sub + 8] = as_ulong(F[sub]) ^ as_ulong(E[sub]); 144 | registers[sub + 16] = as_ulong(E[sub]); 145 | } 146 | -------------------------------------------------------------------------------- /RandomX_OpenCL/GCNASM/randomx_run_gfx1010.asm: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019-2020 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | .rocm 21 | .gpu GFX1010 22 | .arch_minor 1 23 | .arch_stepping 0 24 | .eflags 53 25 | .llvm10binfmt 26 | .metadatav3 27 | .md_version 1, 0 28 | .globaldata 29 | .fill 64, 1, 0 30 | .kernel randomx_run 31 | .config 32 | .dims x 33 | .sgprsnum 96 34 | .vgprsnum 128 35 | .shared_vgprs 0 36 | .dx10clamp 37 | .ieeemode 38 | .floatmode 0xf0 39 | .priority 0 40 | .exceptions 0 41 | .userdatanum 6 42 | 43 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc1-gfx6-gfx10-table 44 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx10-table 45 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc3-gfx10-table 46 | .pgmrsrc1 0x40af0105 47 | .pgmrsrc2 0x0000008c 48 | .pgmrsrc3 0x00000000 49 | 50 | .group_segment_fixed_size 256 51 | .private_segment_fixed_size 0 52 | .kernel_code_entry_offset 0x10c0 53 | .use_private_segment_buffer 54 | .use_kernarg_segment_ptr 55 | .use_wave32 56 | .config 57 | .md_symname "randomx_run.kd" 58 | .md_language "OpenCL C", 1, 2 59 | .reqd_work_group_size 32, 1, 1 60 | .md_kernarg_segment_size 104 61 | .md_kernarg_segment_align 8 62 | .md_group_segment_fixed_size 256 63 | .md_private_segment_fixed_size 0 64 | .md_wavefront_size 32 65 | .md_sgprsnum 96 66 | .md_vgprsnum 128 67 | .spilledsgprs 0 68 | .spilledvgprs 0 69 | 
.max_flat_work_group_size 32 70 | .arg dataset, "uchar*", 8, 0, globalbuf, u8, global, default const 71 | .arg scratchpad, "uchar*", 8, 8, globalbuf, u8, global, default 72 | .arg registers, "ulong*", 8, 16, globalbuf, u64, global, default 73 | .arg rounding_modes, "uint*", 8, 24, globalbuf, u32, global, default 74 | .arg programs, "uint*", 8, 32, globalbuf, u32, global, default 75 | .arg batch_size, "uint", 4, 40, value, u32 76 | .arg rx_parameters, "uint", 4, 44, value, u32 77 | .arg , "", 8, 48, gox, i64 78 | .arg , "", 8, 56, goy, i64 79 | .arg , "", 8, 64, goz, i64 80 | .arg , "", 8, 72, none, i8 81 | .arg , "", 8, 80, none, i8 82 | .arg , "", 8, 88, none, i8 83 | .arg , "", 8, 96, multigridsyncarg, i8 84 | .text 85 | randomx_run: 86 | # clear all caches 87 | s_dcache_wb 88 | s_waitcnt vmcnt(0) & lgkmcnt(0) 89 | s_waitcnt_vscnt null, 0x0 90 | s_icache_inv 91 | s_branch begin 92 | 93 | # pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data 94 | # s6 contains group id 95 | # v0 contains local id 96 | begin: 97 | # s[0:1] - pointer to registers 98 | # s[2:3] - pointer to rounding modes 99 | s_load_dwordx4 s[0:3], s[4:5], 0x10 100 | 101 | # s[8:9] - group_id*group_size 102 | s_mov_b32 s9, 0 103 | s_lshl_b32 s8, s6, 5 104 | 105 | # v0 - local id (sub) 106 | # v39 - R[sub] 107 | v_lshlrev_b32 v39, 3, v0 108 | 109 | s_mov_b32 s12, s7 110 | 111 | # vcc_lo = "if (sub < 8)" 112 | v_cmp_gt_u32 vcc_lo, 8, v0 113 | 114 | s_waitcnt lgkmcnt(0) 115 | 116 | # load rounding mode 117 | s_lshl_b32 s16, s6, 2 118 | s_add_u32 s64, s2, s16 119 | s_addc_u32 s65, s3, 0 120 | v_mov_b32 v1, 0 121 | global_load_dword v1, v1, s[64:65] 122 | s_waitcnt vmcnt(0) 123 | v_readlane_b32 s66, v1, 0 124 | s_setreg_b32 hwreg(mode, 2, 2), s66 125 | s_mov_b32 s67, 0 126 | 127 | # ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub]; 128 | s_lshl_b64 s[2:3], s[8:9], 3 129 | s_mov_b32 s32, s12 130 | s_add_u32 s0, s0, s2 131 | s_addc_u32 s1, s1, s3 132 | v_add_co_u32 
v1, s0, s0, v39 133 | v_add_co_ci_u32 v2, s0, s1, 0, s0 134 | global_load_dwordx2 v[4:5], v[1:2], off 135 | s_waitcnt vmcnt(0) 136 | ds_write_b64 v39, v[4:5] 137 | s_waitcnt vmcnt(0) & lgkmcnt(0) 138 | s_waitcnt_vscnt null, 0x0 139 | 140 | # "if (sub >= 8) return" 141 | s_and_saveexec_b32 s0, vcc_lo 142 | s_cbranch_execz program_end 143 | 144 | # s[8:9] - pointer to dataset 145 | # s[10:11] - pointer to scratchpads 146 | # s[0:1] - pointer to programs 147 | s_load_dwordx4 s[8:11], s[4:5], 0x0 148 | s_load_dwordx2 s[0:1], s[4:5], 0x20 149 | 150 | # rx_parameters 151 | s_load_dword s20, s[4:5], 0x2c 152 | 153 | v_mov_b32 v5, 0 154 | v_mov_b32 v10, 0 155 | s_waitcnt_vscnt null, 0x0 156 | ds_read_b64 v[8:9], v39 157 | v_cmp_gt_u32 vcc_lo, 4, v0 158 | v_lshlrev_b32 v0, 3, v0 159 | ds_read2_b64 v[25:28], v5 offset0:16 offset1:17 160 | ds_read_b32 v11, v5 offset:152 161 | ds_read_b64 v[35:36], v5 offset:168 162 | ds_read2_b64 v[20:23], v5 offset0:18 offset1:20 163 | v_cndmask_b32 v4, 0xffffff, -1, vcc_lo 164 | v_add_nc_u32 v5, v39, v0 165 | s_waitcnt lgkmcnt(0) 166 | v_mov_b32 v13, s11 167 | v_mov_b32 v7, s1 168 | v_mov_b32 v6, s0 169 | 170 | # Scratchpad L1 size 171 | s_bfe_u32 s21, s20, 0x050000 172 | s_lshl_b32 s21, 1, s21 173 | 174 | # Scratchpad L2 size 175 | s_bfe_u32 s22, s20, 0x050005 176 | s_lshl_b32 s22, 1, s22 177 | 178 | # Scratchpad L3 size 179 | s_bfe_u32 s0, s20, 0x05000A 180 | s_lshl_b32 s23, 1, s0 181 | 182 | # program iterations 183 | s_bfe_u32 s24, s20, 0x04000F 184 | s_lshl_b32 s24, 1, s24 185 | 186 | v_mov_b32 v12, s10 187 | v_mad_u64_u32 v[6:7], s2, 10048, s6, v[6:7] 188 | 189 | # s[4:5] - pointer to current program 190 | v_readlane_b32 s4, v6, 0 191 | v_readlane_b32 s5, v7, 0 192 | 193 | s_lshl_b32 s2, 1, s0 194 | v_add_co_u32 v14, s0, s8, v11 195 | v_cndmask_b32 v34, v36, 0, vcc_lo 196 | v_cndmask_b32 v24, v23, 0, vcc_lo 197 | v_cndmask_b32 v3, v22, 0, vcc_lo 198 | s_add_i32 s3, s2, 64 199 | v_add_co_ci_u32 v29, s0, s9, v10, s0 200 | v_cndmask_b32 
v35, v35, 0, vcc_lo 201 | v_add_co_u32 v22, vcc_lo, v14, v0 202 | 203 | # v[12:13] - pointer to current scratchpad 204 | v_mad_u64_u32 v[12:13], s2, s3, s6, v[12:13] 205 | v_mov_b32 v10, v26 206 | v_mov_b32 v11, v25 207 | v_lshlrev_b32 v36, 3, v27 208 | v_lshlrev_b32 v37, 3, v28 209 | v_lshlrev_b32 v20, 3, v20 210 | v_lshlrev_b32 v21, 3, v21 211 | v_add_co_ci_u32 v23, vcc_lo, 0, v29, vcc_lo 212 | 213 | # rename registers 214 | # v6 - R[sub] 215 | v_mov_b32 v6, v39 216 | 217 | # loop counter 218 | s_sub_u32 s2, s24, 1 219 | 220 | # used in IXOR_R instruction 221 | s_mov_b32 s63, -1 222 | 223 | # used in CBRANCH instruction 224 | s_mov_b32 s70, (0xFF << 8) 225 | s_mov_b32 s71, (0xFF << 9) 226 | s_mov_b32 s72, (0xFF << 10) 227 | s_mov_b32 s73, (0xFF << 11) 228 | s_mov_b32 s74, (0xFF << 12) 229 | s_mov_b32 s75, (0xFF << 13) 230 | s_mov_b32 s76, (0xFF << 14) 231 | s_mov_b32 s77, (0xFF << 15) 232 | s_mov_b32 s78, (0xFF << 16) 233 | s_mov_b32 s79, (0xFF << 17) 234 | s_mov_b32 s80, (0xFF << 18) 235 | s_mov_b32 s81, (0xFF << 19) 236 | s_mov_b32 s82, (0xFF << 20) 237 | s_mov_b32 s83, (0xFF << 21) 238 | s_mov_b32 s84, (0xFF << 22) 239 | s_mov_b32 s85, (0xFF << 23) 240 | 241 | # ScratchpadL3Mask64 242 | s_sub_u32 s86, s23, 64 243 | 244 | # Address masks for scratchpad L1/L2/L3 (size - 8) 245 | v_sub_nc_u32 v38, s21, 8 246 | v_sub_nc_u32 v39, s22, 8 247 | v_sub_nc_u32 v50, s23, 8 248 | 249 | # mask for FSCAL_R 250 | v_mov_b32 v51, 0x80F00000 251 | 252 | # load scratchpad base address 253 | v_readlane_b32 s0, v12, 0 254 | v_readlane_b32 s1, v13, 0 255 | 256 | # v41, v44 = 0 257 | v_mov_b32 v41, 0 258 | v_mov_b32 v44, 0 259 | 260 | # v41 = 0 on lane 0, set it to 8 on lane 1 261 | # v44 = 0 on lane 0, set it to 4 on lane 1 262 | s_mov_b64 exec, 2 263 | v_mov_b32 v41, 8 264 | v_mov_b32 v44, 4 265 | 266 | # load group A registers 267 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1 268 | s_mov_b64 exec, 3 269 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26 270 | ds_read2_b64 
v[56:59], v41 offset0:28 offset1:30 271 | 272 | # xmantissaMask 273 | v_mov_b32 v77, (1 << 24) - 1 274 | 275 | # xexponentMask 276 | ds_read_b64 v[78:79], v41 offset:160 277 | 278 | # Restore execution mask 279 | s_mov_b64 exec, 255 280 | 281 | # sign mask (used in FSQRT_R) 282 | v_mov_b32 v82, 0x80000000 283 | 284 | # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64) 285 | s_mov_b32 s68, 256 286 | s_mov_b32 s69, 0 287 | 288 | # High 32 bits of "1.0" constant (used in FDIV_M) 289 | v_mov_b32 v83, (1023 << 20) 290 | 291 | # Used to multiply FP64 values by 0.5 292 | v_mov_b32 v84, (1 << 20) 293 | 294 | s_getpc_b64 s[14:15] 295 | cur_addr: 296 | 297 | # get addresses of FSQRT_R subroutines 298 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr 299 | s_addc_u32 s41, s15, 0 300 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr 301 | s_addc_u32 s43, s15, 0 302 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr 303 | s_addc_u32 s45, s15, 0 304 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr 305 | s_addc_u32 s47, s15, 0 306 | 307 | # get addresses of FDIV_M subroutines 308 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr 309 | s_addc_u32 s49, s15, 0 310 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr 311 | s_addc_u32 s51, s15, 0 312 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr 313 | s_addc_u32 s53, s15, 0 314 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr 315 | s_addc_u32 s55, s15, 0 316 | 317 | # get address for ISMULH_R subroutine 318 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr 319 | s_addc_u32 s57, s15, 0 320 | 321 | # get address for IMULH_R subroutine 322 | s_add_u32 s58, s14, imulh_r_sub - cur_addr 323 | s_addc_u32 s59, s15, 0 324 | 325 | /* 326 | used: v0-v6, v8-v37 327 | not used: v7 328 | */ 329 | main_loop: 330 | s_waitcnt_vscnt null, 0x0 331 | 332 | # v[27:28] = R[readReg0] 333 | # v[29:30] = R[readReg1] 334 | ds_read_b64 v[27:28], v37 335 | ds_read_b64 v[29:30], v36 336 | s_waitcnt lgkmcnt(0) 337 | 338 | # R[readReg0] ^ R[readReg1] (high 32 bits) 339 | v_xor_b32 
v28, v28, v30 340 | 341 | # spAddr1 342 | v_xor_b32 v25, v28, v25 343 | v_and_b32 v25, s86, v25 344 | v_add_nc_u32 v25, v25, v0 345 | 346 | v_add_co_u32 v16, vcc_lo, s0, v25 347 | 348 | # R[readReg0] ^ R[readReg1] (low 32 bits) 349 | v_xor_b32 v25, v27, v29 350 | 351 | v_mov_b32 v29, v11 352 | v_add_co_ci_u32 v17, vcc_lo, 0, s1, vcc_lo 353 | v_xor_b32 v25, v25, v26 354 | 355 | # load from spAddr1 356 | global_load_dwordx2 v[27:28], v[16:17], off 357 | 358 | # spAddr0 359 | v_and_b32 v25, s86, v25 360 | v_add_nc_u32 v25, v25, v0 361 | 362 | v_add_co_u32 v31, vcc_lo, s0, v25 363 | v_add_co_ci_u32 v32, vcc_lo, 0, s1, vcc_lo 364 | v_add_co_u32 v29, vcc_lo, v22, v29 365 | 366 | # load from spAddr0 367 | global_load_dwordx2 v[25:26], v[31:32], off 368 | 369 | v_add_co_ci_u32 v30, vcc_lo, 0, v23, vcc_lo 370 | v_mov_b32 v33, v11 371 | s_and_b32 vcc_lo, exec_lo, vcc_lo 372 | s_waitcnt vmcnt(1) 373 | v_cvt_f64_i32 v[14:15], v28 374 | v_cvt_f64_i32 v[12:13], v27 375 | v_or_b32 v14, v14, v35 376 | s_waitcnt vmcnt(0) 377 | 378 | # R[sub] ^= *p0; 379 | v_xor_b32 v8, v25, v8 380 | v_xor_b32 v9, v26, v9 381 | 382 | v_and_b32 v26, v4, v15 383 | 384 | v_and_b32 v19, v4, v13 385 | v_or_b32 v15, v26, v34 386 | v_or_b32 v18, v12, v3 387 | v_mov_b32 v26, 0 388 | v_or_b32 v19, v19, v24 389 | v_mov_b32 v25, v26 390 | ds_write2_b64 v5, v[18:19], v[14:15] offset0:8 offset1:9 391 | 392 | # load from dataset 393 | global_load_dwordx2 v[18:19], v[29:30], off 394 | 395 | # load group F,E registers 396 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1 397 | s_mov_b64 exec, 3 398 | s_waitcnt lgkmcnt(0) 399 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10 400 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14 401 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18 402 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22 403 | 404 | # load VM integer registers 405 | v_readlane_b32 s16, v8, 0 406 | v_readlane_b32 s17, v9, 0 407 | v_readlane_b32 s18, v8, 1 408 | v_readlane_b32 s19, v9, 1 
409 | v_readlane_b32 s20, v8, 2 410 | v_readlane_b32 s21, v9, 2 411 | v_readlane_b32 s22, v8, 3 412 | v_readlane_b32 s23, v9, 3 413 | v_readlane_b32 s24, v8, 4 414 | v_readlane_b32 s25, v9, 4 415 | v_readlane_b32 s26, v8, 5 416 | v_readlane_b32 s27, v9, 5 417 | v_readlane_b32 s28, v8, 6 418 | v_readlane_b32 s29, v9, 6 419 | v_readlane_b32 s30, v8, 7 420 | v_readlane_b32 s31, v9, 7 421 | 422 | s_waitcnt lgkmcnt(0) 423 | 424 | # Use only first 2 lanes for the program 425 | s_mov_b64 exec, 3 426 | 427 | # call JIT code 428 | s_swappc_b64 s[12:13], s[4:5] 429 | 430 | # Write out group F,E registers 431 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1 432 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10 433 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14 434 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18 435 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22 436 | 437 | # store VM integer registers 438 | v_writelane_b32 v8, s16, 0 439 | v_writelane_b32 v9, s17, 0 440 | v_writelane_b32 v8, s18, 1 441 | v_writelane_b32 v9, s19, 1 442 | v_writelane_b32 v8, s20, 2 443 | v_writelane_b32 v9, s21, 2 444 | v_writelane_b32 v8, s22, 3 445 | v_writelane_b32 v9, s23, 3 446 | v_writelane_b32 v8, s24, 4 447 | v_writelane_b32 v9, s25, 4 448 | v_writelane_b32 v8, s26, 5 449 | v_writelane_b32 v9, s27, 5 450 | v_writelane_b32 v8, s28, 6 451 | v_writelane_b32 v9, s29, 6 452 | v_writelane_b32 v8, s30, 7 453 | v_writelane_b32 v9, s31, 7 454 | 455 | # Turn back on 8 execution lanes 456 | s_mov_b64 exec, 255 457 | 458 | # Write out VM integer registers 459 | ds_write_b64 v6, v[8:9] 460 | s_waitcnt lgkmcnt(0) 461 | 462 | # R[readReg2], R[readReg3] 463 | ds_read_b32 v11, v21 464 | ds_read_b32 v27, v20 465 | s_waitcnt lgkmcnt(0) 466 | 467 | # mx ^= R[readReg2] ^ R[readReg3]; 468 | v_xor_b32 v11, v11, v27 469 | v_xor_b32 v10, v10, v11 470 | 471 | # v[27:28] = R[sub] 472 | # v[29:30] = F[sub] 473 | ds_read2_b64 v[27:30], v6 
offset1:8 474 | 475 | # mx &= CacheLineAlignMask; 476 | v_and_b32 v11, 0x7fffffc0, v10 477 | v_mov_b32 v10, v33 478 | s_waitcnt lgkmcnt(0) 479 | 480 | # const ulong next_r = R[sub] ^ data; 481 | s_waitcnt lgkmcnt(0) 482 | v_xor_b32 v8, v27, v18 483 | v_xor_b32 v9, v28, v19 484 | 485 | # *p1 = next_r; 486 | global_store_dwordx2 v[16:17], v[8:9], off 487 | 488 | # v[27:28] = E[sub] 489 | ds_read_b64 v[27:28], v6 offset:128 490 | 491 | # R[sub] = next_r; 492 | ds_write_b64 v6, v[8:9] 493 | s_waitcnt lgkmcnt(1) 494 | 495 | # *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]); 496 | v_xor_b32 v29, v27, v29 497 | v_xor_b32 v30, v28, v30 498 | global_store_dwordx2 v[31:32], v[29:30], off 499 | 500 | s_sub_u32 s2, s2, 1 501 | s_cbranch_scc0 main_loop 502 | main_loop_end: 503 | 504 | global_store_dwordx2 v[1:2], v[8:9], off 505 | global_store_dwordx2 v[1:2], v[29:30], off inst_offset:64 506 | global_store_dwordx2 v[1:2], v[27:28], off inst_offset:128 507 | 508 | # store rounding mode 509 | v_mov_b32 v0, 0 510 | v_mov_b32 v1, s66 511 | global_store_dword v0, v1, s[64:65] 512 | 513 | program_end: 514 | s_endpgm 515 | # FSQRT_R subroutines, one per register pair (v[68:69], v[70:71], v[72:73], v[74:75]); each returns with s_setpc_b64 s[60:61] 516 | fsqrt_r_sub0: 517 | s_setreg_b32 hwreg(mode, 2, 2), s67 518 | v_rsq_f64 v[28:29], v[68:69] 519 | 520 | # Improve initial approximation (can be skipped) 521 | #v_mul_f64 v[42:43], v[28:29], v[68:69] 522 | #v_mul_f64 v[48:49], v[28:29], -0.5 523 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 524 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 525 | 526 | v_mul_f64 v[42:43], v[28:29], v[68:69] 527 | v_mov_b32 v48, v28 528 | v_sub_nc_u32 v49, v29, v84 529 | v_mov_b32 v46, v28 530 | v_xor_b32 v47, v49, v82 531 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 532 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 533 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 534 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69] 535 | s_setreg_b32 hwreg(mode, 2, 2), s66 536 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 537 | v_cmpx_class_f64 v[68:69], s[68:69] 538 | 
v_mov_b32 v68, v42 539 | v_mov_b32 v69, v43 540 | s_mov_b64 exec, 3 541 | s_setpc_b64 s[60:61] 542 | 543 | fsqrt_r_sub1: 544 | s_setreg_b32 hwreg(mode, 2, 2), s67 545 | v_rsq_f64 v[28:29], v[70:71] 546 | 547 | # Improve initial approximation (can be skipped) 548 | #v_mul_f64 v[42:43], v[28:29], v[70:71] 549 | #v_mul_f64 v[48:49], v[28:29], -0.5 550 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 551 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 552 | 553 | v_mul_f64 v[42:43], v[28:29], v[70:71] 554 | v_mov_b32 v48, v28 555 | v_sub_nc_u32 v49, v29, v84 556 | v_mov_b32 v46, v28 557 | v_xor_b32 v47, v49, v82 558 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 559 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 560 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 561 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71] 562 | s_setreg_b32 hwreg(mode, 2, 2), s66 563 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 564 | v_cmpx_class_f64 v[70:71], s[68:69] 565 | v_mov_b32 v70, v42 566 | v_mov_b32 v71, v43 567 | s_mov_b64 exec, 3 568 | s_setpc_b64 s[60:61] 569 | 570 | fsqrt_r_sub2: 571 | s_setreg_b32 hwreg(mode, 2, 2), s67 572 | v_rsq_f64 v[28:29], v[72:73] 573 | 574 | # Improve initial approximation (can be skipped) 575 | #v_mul_f64 v[42:43], v[28:29], v[72:73] 576 | #v_mul_f64 v[48:49], v[28:29], -0.5 577 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 578 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 579 | 580 | v_mul_f64 v[42:43], v[28:29], v[72:73] 581 | v_mov_b32 v48, v28 582 | v_sub_nc_u32 v49, v29, v84 583 | v_mov_b32 v46, v28 584 | v_xor_b32 v47, v49, v82 585 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 586 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 587 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 588 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73] 589 | s_setreg_b32 hwreg(mode, 2, 2), s66 590 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 591 | v_cmpx_class_f64 v[72:73], s[68:69] 592 | v_mov_b32 v72, v42 593 | v_mov_b32 v73, 
v43 594 | s_mov_b64 exec, 3 595 | s_setpc_b64 s[60:61] 596 | 597 | fsqrt_r_sub3: 598 | s_setreg_b32 hwreg(mode, 2, 2), s67 599 | v_rsq_f64 v[28:29], v[74:75] 600 | 601 | # Improve initial approximation (can be skipped) 602 | #v_mul_f64 v[42:43], v[28:29], v[74:75] 603 | #v_mul_f64 v[48:49], v[28:29], -0.5 604 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 605 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 606 | 607 | v_mul_f64 v[42:43], v[28:29], v[74:75] 608 | v_mov_b32 v48, v28 609 | v_sub_nc_u32 v49, v29, v84 610 | v_mov_b32 v46, v28 611 | v_xor_b32 v47, v49, v82 612 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 613 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 614 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 615 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75] 616 | s_setreg_b32 hwreg(mode, 2, 2), s66 617 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 618 | v_cmpx_class_f64 v[74:75], s[68:69] 619 | v_mov_b32 v74, v42 620 | v_mov_b32 v75, v43 621 | s_mov_b64 exec, 3 622 | s_setpc_b64 s[60:61] 623 | # FDIV_M subroutines: divide v[68:69] / v[70:71] / v[72:73] / v[74:75] by the value in v[28:29] after applying the masks in v77 and v[78:79]; each returns with s_setpc_b64 s[60:61] 624 | fdiv_m_sub0: 625 | v_or_b32 v28, v28, v78 626 | v_and_or_b32 v29, v29, v77, v79 627 | s_setreg_b32 hwreg(mode, 2, 2), s67 628 | v_rcp_f64 v[48:49], v[28:29] 629 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 630 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 631 | v_mul_f64 v[80:81], v[68:69], v[48:49] 632 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69] 633 | s_setreg_b32 hwreg(mode, 2, 2), s66 634 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 635 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69] 636 | v_cmpx_eq_f64 v[68:69], v[28:29] 637 | v_mov_b32 v80, 0 638 | v_mov_b32 v81, v83 639 | s_mov_b64 exec, 3 640 | v_mov_b32 v68, v80 641 | v_mov_b32 v69, v81 642 | s_setpc_b64 s[60:61] 643 | 644 | fdiv_m_sub1: 645 | v_or_b32 v28, v28, v78 646 | v_and_or_b32 v29, v29, v77, v79 647 | s_setreg_b32 hwreg(mode, 2, 2), s67 648 | v_rcp_f64 v[48:49], v[28:29] 649 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 650 | v_fma_f64 
v[48:49], v[48:49], v[80:81], v[48:49] 651 | v_mul_f64 v[80:81], v[70:71], v[48:49] 652 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71] 653 | s_setreg_b32 hwreg(mode, 2, 2), s66 654 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 655 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71] 656 | v_cmpx_eq_f64 v[70:71], v[28:29] 657 | v_mov_b32 v80, 0 658 | v_mov_b32 v81, v83 659 | s_mov_b64 exec, 3 660 | v_mov_b32 v70, v80 661 | v_mov_b32 v71, v81 662 | s_setpc_b64 s[60:61] 663 | 664 | fdiv_m_sub2: 665 | v_or_b32 v28, v28, v78 666 | v_and_or_b32 v29, v29, v77, v79 667 | s_setreg_b32 hwreg(mode, 2, 2), s67 668 | v_rcp_f64 v[48:49], v[28:29] 669 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 670 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 671 | v_mul_f64 v[80:81], v[72:73], v[48:49] 672 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73] 673 | s_setreg_b32 hwreg(mode, 2, 2), s66 674 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 675 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73] 676 | v_cmpx_eq_f64 v[72:73], v[28:29] 677 | v_mov_b32 v80, 0 678 | v_mov_b32 v81, v83 679 | s_mov_b64 exec, 3 680 | v_mov_b32 v72, v80 681 | v_mov_b32 v73, v81 682 | s_setpc_b64 s[60:61] 683 | 684 | fdiv_m_sub3: 685 | v_or_b32 v28, v28, v78 686 | v_and_or_b32 v29, v29, v77, v79 687 | s_setreg_b32 hwreg(mode, 2, 2), s67 688 | v_rcp_f64 v[48:49], v[28:29] 689 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 690 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 691 | v_mul_f64 v[80:81], v[74:75], v[48:49] 692 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75] 693 | s_setreg_b32 hwreg(mode, 2, 2), s66 694 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 695 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75] 696 | v_cmpx_eq_f64 v[74:75], v[28:29] 697 | v_mov_b32 v80, 0 698 | v_mov_b32 v81, v83 699 | s_mov_b64 exec, 3 700 | v_mov_b32 v74, v80 701 | v_mov_b32 v75, v81 702 | s_setpc_b64 s[60:61] 703 | # ISMULH_R subroutine: unsigned high 64 bits of s[14:15] * s[38:39], then sign corrections; result in s[14:15], returns with s_setpc_b64 s[60:61] 704 | ismulh_r_sub: 705 | s_mov_b64 exec, 1 706 | v_mov_b32 
v45, s14 707 | v_mul_hi_u32 v40, s38, v45 708 | v_mov_b32 v47, s15 709 | v_mad_u64_u32 v[42:43], s32, s38, v47, v[40:41] 710 | v_mov_b32 v40, v42 711 | v_mad_u64_u32 v[45:46], s32, s39, v45, v[40:41] 712 | v_mad_u64_u32 v[42:43], s32, s39, v47, v[43:44] 713 | v_add_co_u32 v42, vcc_lo, v42, v46 714 | v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo 715 | v_readlane_b32 s32, v42, 0 716 | v_readlane_b32 s33, v43, 0 717 | s_cmp_lt_i32 s15, 0 718 | s_cselect_b64 s[34:35], s[38:39], 0 719 | s_sub_u32 s32, s32, s34 720 | s_subb_u32 s33, s33, s35 721 | s_cmp_lt_i32 s39, 0 722 | s_cselect_b64 s[34:35], s[14:15], 0 723 | s_sub_u32 s14, s32, s34 724 | s_subb_u32 s15, s33, s35 725 | s_mov_b64 exec, 3 726 | s_setpc_b64 s[60:61] 727 | # IMULH_R subroutine: unsigned high 64 bits of s[14:15] * s[38:39]; result in s[14:15], returns with s_setpc_b64 s[60:61] 728 | imulh_r_sub: 729 | s_mov_b64 exec, 1 730 | v_mov_b32 v45, s38 731 | v_mul_hi_u32 v40, s14, v45 732 | v_mov_b32 v47, s39 733 | v_mad_u64_u32 v[42:43], s32, s14, v47, v[40:41] 734 | v_mov_b32 v40, v42 735 | v_mad_u64_u32 v[45:46], s32, s15, v45, v[40:41] 736 | v_mad_u64_u32 v[42:43], s32, s15, v47, v[43:44] 737 | v_add_co_u32 v42, vcc_lo, v42, v46 738 | v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo 739 | v_readlane_b32 s14, v42, 0 740 | v_readlane_b32 s15, v43, 0 741 | s_mov_b64 exec, 3 742 | s_setpc_b64 s[60:61] 743 | -------------------------------------------------------------------------------- /RandomX_OpenCL/GCNASM/randomx_run_gfx803.asm: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | .amdcl2 21 | .gpu GFX803 22 | .64bit 23 | .arch_minor 0 24 | .arch_stepping 0 25 | .driver_version 203603 26 | .kernel randomx_run 27 | .config 28 | .dims x 29 | .cws 64, 1, 1 30 | .sgprsnum 96 31 | # 6 waves per SIMD: 37-40 VGPRs 32 | # 5 waves per SIMD: 41-48 VGPRs 33 | # 4 waves per SIMD: 49-64 VGPRs 34 | # 3 waves per SIMD: 65-84 VGPRs 35 | # 2 waves per SIMD: 85-128 VGPRs 36 | # 1 wave per SIMD: 129-256 VGPRs 37 | .vgprsnum 128 38 | .localsize 256 39 | .floatmode 0xc0 40 | .pgmrsrc1 0x00ac035f 41 | .pgmrsrc2 0x0000008c 42 | .dx10clamp 43 | .ieeemode 44 | .useargs 45 | .priority 0 46 | .arg _.global_offset_0, "size_t", long 47 | .arg _.global_offset_1, "size_t", long 48 | .arg _.global_offset_2, "size_t", long 49 | .arg _.printf_buffer, "size_t", void*, global, , rdonly 50 | .arg _.vqueue_pointer, "size_t", long 51 | .arg _.aqlwrap_pointer, "size_t", long 52 | .arg dataset, "uchar*", uchar*, global, const, rdonly 53 | .arg scratchpad, "uchar*", uchar*, global, 54 | .arg registers, "ulong*", ulong*, global, 55 | .arg rounding_modes, "uint*", uint*, global, 56 | .arg programs, "uint*", uint*, global, 57 | .arg batch_size, "uint", uint 58 | .arg rx_parameters, "uint", uint 59 | .text 60 | s_mov_b32 m0, 0x10000 61 | s_dcache_wb 62 | s_waitcnt vmcnt(0) & lgkmcnt(0) 63 | s_icache_inv 64 | s_branch begin 65 | 66 | # pgmrsrc2 = 0x00000090, bits 1:5 = 8, so first 8 SGPRs (s0-s7) contain user data 67 | # s8 contains group id 68 | # v0 contains local id 69 | begin: 70 | s_mov_b32 s8, s6 71 | v_lshlrev_b32 v1, 6, s8 72 | v_add_u32 v1, vcc, v1, v0 73 | s_load_dwordx2 s[0:1], 
s[4:5], 0x0 74 | s_load_dwordx2 s[2:3], s[4:5], 0x40 75 | s_load_dwordx2 s[64:65], s[4:5], 0x48 76 | s_waitcnt lgkmcnt(0) 77 | 78 | # load rounding mode 79 | s_lshl_b32 s16, s8, 2 80 | s_add_u32 s64, s64, s16 81 | s_addc_u32 s65, s65, 0 82 | v_mov_b32 v8, s64 83 | v_mov_b32 v9, s65 84 | flat_load_dword v8, v[8:9] 85 | s_waitcnt vmcnt(0) 86 | v_readlane_b32 s66, v8, 0 87 | s_setreg_b32 hwreg(mode, 2, 2), s66 88 | s_mov_b32 s67, 0 89 | 90 | # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64) 91 | s_mov_b32 s68, 256 92 | s_mov_b32 s69, 0 93 | 94 | v_add_u32 v1, vcc, s0, v1 95 | v_lshrrev_b32 v2, 6, v1 96 | v_lshlrev_b32 v3, 5, v2 97 | v_and_b32 v1, 63, v1 98 | v_mov_b32 v4, 0 99 | v_lshlrev_b64 v[3:4], 3, v[3:4] 100 | v_lshlrev_b32 v5, 4, v1 101 | v_add_u32 v3, vcc, s2, v3 102 | v_mov_b32 v6, s3 103 | v_addc_u32 v4, vcc, v6, v4, vcc 104 | v_lshlrev_b32 v41, 2, v1 105 | v_add_u32 v6, vcc, v3, v41 106 | v_addc_u32 v7, vcc, v4, 0, vcc 107 | flat_load_dword v6, v[6:7] 108 | v_mov_b32 v0, 0 109 | s_waitcnt vmcnt(0) 110 | ds_write_b32 v41, v6 111 | s_waitcnt lgkmcnt(0) 112 | s_mov_b64 s[0:1], exec 113 | v_cmpx_le_u32 s[2:3], v1, 7 114 | s_cbranch_execz program_end 115 | 116 | # rx_parameters 117 | s_load_dword s20, s[4:5], 0x5c 118 | s_waitcnt lgkmcnt(0) 119 | 120 | # Scratchpad L1 size 121 | s_bfe_u32 s21, s20, 0x050000 122 | s_lshl_b32 s21, 1, s21 123 | 124 | # Scratchpad L2 size 125 | s_bfe_u32 s22, s20, 0x050005 126 | s_lshl_b32 s22, 1, s22 127 | 128 | # Scratchpad L3 size 129 | s_bfe_u32 s23, s20, 0x05000A 130 | s_lshl_b32 s23, 1, s23 131 | 132 | # program iterations 133 | s_bfe_u32 s24, s20, 0x04000F 134 | s_lshl_b32 s24, 1, s24 135 | 136 | # Base address for scratchpads 137 | s_add_u32 s2, s23, 64 138 | v_mul_hi_u32 v20, v2, s2 139 | v_mul_lo_u32 v2, v2, s2 140 | 141 | # v41, v44 = 0 142 | v_mov_b32 v41, 0 143 | v_mov_b32 v44, 0 144 | 145 | ds_read_b32 v6, v0 offset:152 146 | v_cmp_lt_u32 s[2:3], v1, 4 147 | ds_read2_b64 v[34:37], v0 offset0:18 
offset1:16 148 | ds_read_b64 v[11:12], v0 offset:136 149 | s_movk_i32 s9, 0x0 150 | s_mov_b64 s[6:7], exec 151 | s_andn2_b64 exec, s[6:7], s[2:3] 152 | ds_read_b64 v[13:14], v0 offset:160 153 | s_andn2_b64 exec, s[6:7], exec 154 | v_mov_b32 v13, 0 155 | v_mov_b32 v14, 0 156 | s_mov_b64 exec, s[6:7] 157 | 158 | # compiled program size 159 | s_mov_b64 s[6:7], s[8:9] 160 | s_mulk_i32 s6, 10048 161 | 162 | v_add_u32 v5, vcc, v0, v5 163 | v_add_u32 v5, vcc, v5, 64 164 | s_mov_b64 s[8:9], exec 165 | s_andn2_b64 exec, s[8:9], s[2:3] 166 | ds_read_b64 v[15:16], v0 offset:168 167 | s_andn2_b64 exec, s[8:9], exec 168 | v_mov_b32 v15, 0 169 | v_mov_b32 v16, 0 170 | s_mov_b64 exec, s[8:9] 171 | s_load_dwordx4 s[8:11], s[4:5], 0x30 172 | 173 | # batch_size 174 | s_load_dword s16, s[4:5], 0x58 175 | 176 | s_load_dwordx2 s[4:5], s[4:5], 0x50 177 | v_lshlrev_b32 v1, 3, v1 178 | v_add_u32 v17, vcc, v0, v1 179 | s_waitcnt lgkmcnt(0) 180 | v_add_u32 v2, vcc, s10, v2 181 | v_mov_b32 v18, s11 182 | v_addc_u32 v18, vcc, v18, v20, vcc 183 | v_mov_b32 v19, 0xffffff 184 | v_add_u32 v6, vcc, s8, v6 185 | v_mov_b32 v20, s9 186 | v_addc_u32 v20, vcc, v20, 0, vcc 187 | ds_read_b64 v[21:22], v17 188 | s_add_u32 s4, s4, s6 189 | s_addc_u32 s5, s5, s7 190 | v_cndmask_b32 v19, v19, -1, s[2:3] 191 | v_lshlrev_b32 v8, 3, v35 192 | v_lshlrev_b32 v7, 3, v34 193 | v_lshlrev_b32 v12, 3, v12 194 | v_lshlrev_b32 v10, 3, v11 195 | v_add_u32 v8, vcc, v8, v0 196 | v_add_u32 v7, vcc, v7, v0 197 | v_add_u32 v12, vcc, v12, v0 198 | v_add_u32 v0, vcc, v10, v0 199 | v_mov_b32 v10, v36 200 | v_mov_b32 v23, v37 201 | 202 | # loop counter 203 | s_sub_u32 s2, s24, 1 204 | 205 | # batch_size 206 | s_mov_b32 s3, s16 207 | 208 | # Scratchpad masks for scratchpads 209 | v_sub_u32 v38, vcc, s21, 8 210 | v_sub_u32 v39, vcc, s22, 8 211 | v_sub_u32 v50, vcc, s23, 8 212 | 213 | # mask for FSCAL_R 214 | v_mov_b32 v51, 0x80F00000 215 | 216 | # swap v3 and v18 217 | v_mov_b32 v52, v3 218 | v_mov_b32 v3, v18 219 | v_mov_b32 v18, 
v52 220 | 221 | # load scratchpad base address 222 | v_readlane_b32 s0, v2, 0 223 | v_readlane_b32 s1, v3, 0 224 | 225 | # save current execution mask 226 | s_mov_b64 s[36:37], exec 227 | 228 | # v41 = 0 on lane 0, set it to 8 on lane 1 229 | # v44 = 0 on lane 0, set it to 4 on lane 1 230 | s_mov_b64 exec, 2 231 | v_mov_b32 v41, 8 232 | v_mov_b32 v44, 4 233 | 234 | # load group A registers 235 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1 236 | s_mov_b64 exec, 3 237 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26 238 | ds_read2_b64 v[56:59], v41 offset0:28 offset1:30 239 | 240 | # xmantissaMask 241 | v_mov_b32 v77, (1 << 24) - 1 242 | 243 | # xexponentMask 244 | ds_read_b64 v[78:79], v41 offset:160 245 | 246 | # Restore execution mask 247 | s_mov_b64 exec, s[36:37] 248 | 249 | # sign mask (used in FSQRT_R) 250 | v_mov_b32 v82, 0x80000000 251 | 252 | # High 32 bits of "1.0" constant (used in FDIV_M) 253 | v_mov_b32 v83, (1023 << 20) 254 | 255 | # Used to multiply FP64 values by 0.5 256 | v_mov_b32 v84, (1 << 20) 257 | 258 | s_getpc_b64 s[14:15] 259 | cur_addr: 260 | 261 | # get addresses of FSQRT_R subroutines 262 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr 263 | s_addc_u32 s41, s15, 0 264 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr 265 | s_addc_u32 s43, s15, 0 266 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr 267 | s_addc_u32 s45, s15, 0 268 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr 269 | s_addc_u32 s47, s15, 0 270 | 271 | # get addresses of FDIV_M subroutines 272 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr 273 | s_addc_u32 s49, s15, 0 274 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr 275 | s_addc_u32 s51, s15, 0 276 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr 277 | s_addc_u32 s53, s15, 0 278 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr 279 | s_addc_u32 s55, s15, 0 280 | 281 | # get address for ISMULH_R subroutine 282 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr 283 | s_addc_u32 s57, s15, 0 284 | 285 | # get address for IMULH_R 
subroutine 286 | s_add_u32 s58, s14, imulh_r_sub - cur_addr 287 | s_addc_u32 s59, s15, 0 288 | 289 | # used in IXOR_R instruction 290 | s_mov_b32 s63, -1 291 | 292 | # used in CBRANCH instruction 293 | s_mov_b32 s70, (0xFF << 8) 294 | s_mov_b32 s71, (0xFF << 9) 295 | s_mov_b32 s72, (0xFF << 10) 296 | s_mov_b32 s73, (0xFF << 11) 297 | s_mov_b32 s74, (0xFF << 12) 298 | s_mov_b32 s75, (0xFF << 13) 299 | s_mov_b32 s76, (0xFF << 14) 300 | s_mov_b32 s77, (0xFF << 15) 301 | s_mov_b32 s78, (0xFF << 16) 302 | s_mov_b32 s79, (0xFF << 17) 303 | s_mov_b32 s80, (0xFF << 18) 304 | s_mov_b32 s81, (0xFF << 19) 305 | s_mov_b32 s82, (0xFF << 20) 306 | s_mov_b32 s83, (0xFF << 21) 307 | s_mov_b32 s84, (0xFF << 22) 308 | s_mov_b32 s85, (0xFF << 23) 309 | 310 | # ScratchpadL3Mask64 311 | s_sub_u32 s86, s23, 64 312 | 313 | main_loop: 314 | # const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]); 315 | ds_read_b64 v[24:25], v0 316 | ds_read_b64 v[26:27], v12 317 | s_waitcnt lgkmcnt(0) 318 | v_xor_b32 v25, v27, v25 319 | v_xor_b32 v24, v26, v24 320 | 321 | # spAddr1 ^= spMix.y; 322 | # spAddr0 ^= spMix.x; 323 | v_xor_b32 v10, v25, v10 324 | v_xor_b32 v23, v24, v23 325 | 326 | # spAddr1 &= ScratchpadL3Mask64; 327 | # spAddr0 &= ScratchpadL3Mask64; 328 | v_and_b32 v10, s86, v10 329 | v_and_b32 v23, s86, v23 330 | 331 | # Offset for scratchpads 332 | # offset1 = spAddr1 + sub * 8 333 | # offset0 = spAddr0 + sub * 8 334 | v_add_u32 v10, vcc, v10, v1 335 | v_add_u32 v23, vcc, v23, v1 336 | 337 | # __global ulong* p1 = (__global ulong*)(scratchpad + offset1); 338 | # __global ulong* p0 = (__global ulong*)(scratchpad + offset0); 339 | v_add_u32 v26, vcc, v2, v10 340 | v_addc_u32 v27, vcc, v3, 0, vcc 341 | v_add_u32 v23, vcc, v2, v23 342 | v_addc_u32 v24, vcc, v3, 0, vcc 343 | 344 | # load from spAddr1 345 | flat_load_dwordx2 v[28:29], v[26:27] 346 | 347 | # load from spAddr0 348 | flat_load_dwordx2 v[30:31], v[23:24] 349 | s_waitcnt vmcnt(1) 350 | 351 | v_cvt_f64_i32 v[32:33], v28 352 | 
v_cvt_f64_i32 v[28:29], v29 353 | s_waitcnt vmcnt(0) 354 | 355 | # R[sub] ^= *p0; 356 | v_xor_b32 v34, v21, v30 357 | v_xor_b32 v35, v22, v31 358 | 359 | v_add_u32 v22, vcc, v6, v36 360 | v_addc_u32 v25, vcc, v20, 0, vcc 361 | v_add_u32 v21, vcc, v22, v1 362 | v_addc_u32 v22, vcc, v25, 0, vcc 363 | flat_load_dwordx2 v[21:22], v[21:22] 364 | v_or_b32 v30, v32, v13 365 | v_and_b32 v31, v33, v19 366 | v_or_b32 v31, v31, v14 367 | v_or_b32 v28, v28, v15 368 | v_and_b32 v29, v29, v19 369 | v_or_b32 v29, v29, v16 370 | ds_write2_b64 v5, v[30:31], v[28:29] offset1:1 371 | s_waitcnt lgkmcnt(0) 372 | 373 | # Program 0 374 | 375 | # load group F,E registers 376 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1 377 | s_mov_b64 exec, 3 378 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10 379 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14 380 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18 381 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22 382 | 383 | # load VM integer registers 384 | v_readlane_b32 s16, v34, 0 385 | v_readlane_b32 s17, v35, 0 386 | v_readlane_b32 s18, v34, 1 387 | v_readlane_b32 s19, v35, 1 388 | v_readlane_b32 s20, v34, 2 389 | v_readlane_b32 s21, v35, 2 390 | v_readlane_b32 s22, v34, 3 391 | v_readlane_b32 s23, v35, 3 392 | v_readlane_b32 s24, v34, 4 393 | v_readlane_b32 s25, v35, 4 394 | v_readlane_b32 s26, v34, 5 395 | v_readlane_b32 s27, v35, 5 396 | v_readlane_b32 s28, v34, 6 397 | v_readlane_b32 s29, v35, 6 398 | v_readlane_b32 s30, v34, 7 399 | v_readlane_b32 s31, v35, 7 400 | 401 | s_waitcnt lgkmcnt(0) 402 | 403 | # call JIT code 404 | s_swappc_b64 s[12:13], s[4:5] 405 | 406 | # Write out group F,E registers 407 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1 408 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10 409 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14 410 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18 411 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 
offset1:22 412 | 413 | # store VM integer registers 414 | v_writelane_b32 v28, s16, 0 415 | v_writelane_b32 v29, s17, 0 416 | v_writelane_b32 v28, s18, 1 417 | v_writelane_b32 v29, s19, 1 418 | v_writelane_b32 v28, s20, 2 419 | v_writelane_b32 v29, s21, 2 420 | v_writelane_b32 v28, s22, 3 421 | v_writelane_b32 v29, s23, 3 422 | v_writelane_b32 v28, s24, 4 423 | v_writelane_b32 v29, s25, 4 424 | v_writelane_b32 v28, s26, 5 425 | v_writelane_b32 v29, s27, 5 426 | v_writelane_b32 v28, s28, 6 427 | v_writelane_b32 v29, s29, 6 428 | v_writelane_b32 v28, s30, 7 429 | v_writelane_b32 v29, s31, 7 430 | 431 | # Restore execution mask 432 | s_mov_b64 exec, s[36:37] 433 | 434 | # Write out VM integer registers 435 | ds_write_b64 v17, v[28:29] 436 | 437 | s_waitcnt lgkmcnt(0) 438 | v_xor_b32 v21, v28, v21 439 | v_xor_b32 v22, v29, v22 440 | ds_read_b32 v28, v7 441 | ds_read_b32 v29, v8 442 | ds_write_b64 v17, v[21:22] 443 | s_waitcnt lgkmcnt(1) 444 | ds_read2_b64 v[30:33], v17 offset0:8 offset1:16 445 | v_xor_b32 v10, v28, v37 446 | s_waitcnt lgkmcnt(0) 447 | v_xor_b32 v30, v32, v30 448 | v_xor_b32 v31, v33, v31 449 | v_xor_b32 v10, v10, v29 450 | flat_store_dwordx2 v[26:27], v[21:22] 451 | v_and_b32 v10, 0x7fffffc0, v10 452 | flat_store_dwordx2 v[23:24], v[30:31] 453 | s_cmp_eq_u32 s2, 0 454 | s_cbranch_scc1 main_loop_end 455 | s_sub_i32 s2, s2, 1 456 | v_mov_b32 v37, v36 457 | v_mov_b32 v23, 0 458 | v_mov_b32 v36, v10 459 | v_mov_b32 v10, 0 460 | s_branch main_loop 461 | main_loop_end: 462 | 463 | v_add_u32 v0, vcc, v18, v1 464 | v_addc_u32 v1, vcc, v4, 0, vcc 465 | flat_store_dwordx2 v[0:1], v[21:22] 466 | v_add_u32 v0, vcc, v0, 64 467 | v_addc_u32 v1, vcc, v1, 0, vcc 468 | flat_store_dwordx2 v[0:1], v[30:31] 469 | v_add_u32 v0, vcc, v0, 64 470 | v_addc_u32 v1, vcc, v1, 0, vcc 471 | flat_store_dwordx2 v[0:1], v[32:33] 472 | 473 | # store rounding mode 474 | v_mov_b32 v0, s64 475 | v_mov_b32 v1, s65 476 | v_mov_b32 v2, s66 477 | flat_store_dword v[0:1], v2 478 | 479 | 
program_end: 480 | s_endpgm 481 | 482 | fsqrt_r_sub0: 483 | s_setreg_b32 hwreg(mode, 2, 2), s67 484 | v_rsq_f64 v[28:29], v[68:69] 485 | 486 | # Improve initial approximation (can be skipped) 487 | #v_mul_f64 v[42:43], v[28:29], v[68:69] 488 | #v_mul_f64 v[48:49], v[28:29], -0.5 489 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 490 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 491 | 492 | v_mul_f64 v[42:43], v[28:29], v[68:69] 493 | v_mov_b32 v48, v28 494 | v_sub_u32 v49, vcc, v29, v84 495 | v_mov_b32 v46, v28 496 | v_xor_b32 v47, v49, v82 497 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 498 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 499 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 500 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69] 501 | s_setreg_b32 hwreg(mode, 2, 2), s66 502 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 503 | v_cmpx_class_f64 s[14:15], v[68:69], s[68:69] 504 | v_mov_b32 v68, v42 505 | v_mov_b32 v69, v43 506 | s_mov_b64 exec, 3 507 | s_setpc_b64 s[60:61] 508 | 509 | fsqrt_r_sub1: 510 | s_setreg_b32 hwreg(mode, 2, 2), s67 511 | v_rsq_f64 v[28:29], v[70:71] 512 | 513 | # Improve initial approximation (can be skipped) 514 | #v_mul_f64 v[42:43], v[28:29], v[70:71] 515 | #v_mul_f64 v[48:49], v[28:29], -0.5 516 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 517 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 518 | 519 | v_mul_f64 v[42:43], v[28:29], v[70:71] 520 | v_mov_b32 v48, v28 521 | v_sub_u32 v49, vcc, v29, v84 522 | v_mov_b32 v46, v28 523 | v_xor_b32 v47, v49, v82 524 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 525 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 526 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 527 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71] 528 | s_setreg_b32 hwreg(mode, 2, 2), s66 529 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 530 | v_cmpx_class_f64 s[14:15], v[70:71], s[68:69] 531 | v_mov_b32 v70, v42 532 | v_mov_b32 v71, v43 533 | s_mov_b64 exec, 3 534 | 
s_setpc_b64 s[60:61] 535 | 536 | fsqrt_r_sub2: 537 | s_setreg_b32 hwreg(mode, 2, 2), s67 538 | v_rsq_f64 v[28:29], v[72:73] 539 | 540 | # Improve initial approximation (can be skipped) 541 | #v_mul_f64 v[42:43], v[28:29], v[72:73] 542 | #v_mul_f64 v[48:49], v[28:29], -0.5 543 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 544 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 545 | 546 | v_mul_f64 v[42:43], v[28:29], v[72:73] 547 | v_mov_b32 v48, v28 548 | v_sub_u32 v49, vcc, v29, v84 549 | v_mov_b32 v46, v28 550 | v_xor_b32 v47, v49, v82 551 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 552 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 553 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 554 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73] 555 | s_setreg_b32 hwreg(mode, 2, 2), s66 556 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 557 | v_cmpx_class_f64 s[14:15], v[72:73], s[68:69] 558 | v_mov_b32 v72, v42 559 | v_mov_b32 v73, v43 560 | s_mov_b64 exec, 3 561 | s_setpc_b64 s[60:61] 562 | 563 | fsqrt_r_sub3: 564 | s_setreg_b32 hwreg(mode, 2, 2), s67 565 | v_rsq_f64 v[28:29], v[74:75] 566 | 567 | # Improve initial approximation (can be skipped) 568 | #v_mul_f64 v[42:43], v[28:29], v[74:75] 569 | #v_mul_f64 v[48:49], v[28:29], -0.5 570 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5 571 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29] 572 | 573 | v_mul_f64 v[42:43], v[28:29], v[74:75] 574 | v_mov_b32 v48, v28 575 | v_sub_u32 v49, vcc, v29, v84 576 | v_mov_b32 v46, v28 577 | v_xor_b32 v47, v49, v82 578 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5 579 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43] 580 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49] 581 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75] 582 | s_setreg_b32 hwreg(mode, 2, 2), s66 583 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43] 584 | v_cmpx_class_f64 s[14:15], v[74:75], s[68:69] 585 | v_mov_b32 v74, v42 586 | v_mov_b32 v75, v43 587 | s_mov_b64 exec, 3 588 | s_setpc_b64 
s[60:61] 589 | 590 | fdiv_m_sub0: 591 | v_or_b32 v28, v28, v78 592 | v_and_b32 v29, v29, v77 593 | v_or_b32 v29, v29, v79 594 | s_setreg_b32 hwreg(mode, 2, 2), s67 595 | v_rcp_f64 v[48:49], v[28:29] 596 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 597 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 598 | v_mul_f64 v[80:81], v[68:69], v[48:49] 599 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69] 600 | s_setreg_b32 hwreg(mode, 2, 2), s66 601 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 602 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69] 603 | v_cmpx_eq_f64 s[14:15], v[68:69], v[28:29] 604 | v_mov_b32 v80, 0 605 | v_mov_b32 v81, v83 606 | s_mov_b64 exec, 3 607 | v_mov_b32 v68, v80 608 | v_mov_b32 v69, v81 609 | s_setpc_b64 s[60:61] 610 | 611 | fdiv_m_sub1: 612 | v_or_b32 v28, v28, v78 613 | v_and_b32 v29, v29, v77 614 | v_or_b32 v29, v29, v79 615 | s_setreg_b32 hwreg(mode, 2, 2), s67 616 | v_rcp_f64 v[48:49], v[28:29] 617 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 618 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 619 | v_mul_f64 v[80:81], v[70:71], v[48:49] 620 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71] 621 | s_setreg_b32 hwreg(mode, 2, 2), s66 622 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 623 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71] 624 | v_cmpx_eq_f64 s[14:15], v[70:71], v[28:29] 625 | v_mov_b32 v80, 0 626 | v_mov_b32 v81, v83 627 | s_mov_b64 exec, 3 628 | v_mov_b32 v70, v80 629 | v_mov_b32 v71, v81 630 | s_setpc_b64 s[60:61] 631 | 632 | fdiv_m_sub2: 633 | v_or_b32 v28, v28, v78 634 | v_and_b32 v29, v29, v77 635 | v_or_b32 v29, v29, v79 636 | s_setreg_b32 hwreg(mode, 2, 2), s67 637 | v_rcp_f64 v[48:49], v[28:29] 638 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 639 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 640 | v_mul_f64 v[80:81], v[72:73], v[48:49] 641 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73] 642 | s_setreg_b32 hwreg(mode, 2, 2), s66 643 | v_fma_f64 v[42:43], v[42:43], v[48:49], 
v[80:81] 644 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73] 645 | v_cmpx_eq_f64 s[14:15], v[72:73], v[28:29] 646 | v_mov_b32 v80, 0 647 | v_mov_b32 v81, v83 648 | s_mov_b64 exec, 3 649 | v_mov_b32 v72, v80 650 | v_mov_b32 v73, v81 651 | s_setpc_b64 s[60:61] 652 | 653 | fdiv_m_sub3: 654 | v_or_b32 v28, v28, v78 655 | v_and_b32 v29, v29, v77 656 | v_or_b32 v29, v29, v79 657 | s_setreg_b32 hwreg(mode, 2, 2), s67 658 | v_rcp_f64 v[48:49], v[28:29] 659 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0 660 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49] 661 | v_mul_f64 v[80:81], v[74:75], v[48:49] 662 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75] 663 | s_setreg_b32 hwreg(mode, 2, 2), s66 664 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81] 665 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75] 666 | v_cmpx_eq_f64 s[14:15], v[74:75], v[28:29] 667 | v_mov_b32 v80, 0 668 | v_mov_b32 v81, v83 669 | s_mov_b64 exec, 3 670 | v_mov_b32 v74, v80 671 | v_mov_b32 v75, v81 672 | s_setpc_b64 s[60:61] 673 | 674 | ismulh_r_sub: 675 | s_mov_b64 exec, 1 676 | v_mov_b32 v45, s14 677 | v_mul_hi_u32 v40, s38, v45 678 | v_mov_b32 v47, s15 679 | v_mad_u64_u32 v[42:43], s[32:33], s38, v47, v[40:41] 680 | v_mov_b32 v40, v42 681 | v_mad_u64_u32 v[45:46], s[32:33], s39, v45, v[40:41] 682 | v_mad_u64_u32 v[42:43], s[32:33], s39, v47, v[43:44] 683 | v_add_u32 v42, vcc, v42, v46 684 | v_addc_u32 v43, vcc, 0, v43, vcc 685 | v_readlane_b32 s32, v42, 0 686 | v_readlane_b32 s33, v43, 0 687 | s_cmp_lt_i32 s15, 0 688 | s_cselect_b64 s[34:35], s[38:39], 0 689 | s_sub_u32 s32, s32, s34 690 | s_subb_u32 s33, s33, s35 691 | s_cmp_lt_i32 s39, 0 692 | s_cselect_b64 s[34:35], s[14:15], 0 693 | s_sub_u32 s14, s32, s34 694 | s_subb_u32 s15, s33, s35 695 | s_mov_b64 exec, 3 696 | s_setpc_b64 s[60:61] 697 | 698 | imulh_r_sub: 699 | s_mov_b64 exec, 1 700 | v_mov_b32 v45, s38 701 | v_mul_hi_u32 v40, s14, v45 702 | v_mov_b32 v47, s39 703 | v_mad_u64_u32 v[42:43], s[32:33], s14, v47, v[40:41] 704 
| v_mov_b32 v40, v42 705 | v_mad_u64_u32 v[45:46], s[32:33], s15, v45, v[40:41] 706 | v_mad_u64_u32 v[42:43], s[32:33], s15, v47, v[43:44] 707 | v_add_u32 v42, vcc, v42, v46 708 | v_addc_u32 v43, vcc, 0, v43, vcc 709 | v_readlane_b32 s14, v42, 0 710 | v_readlane_b32 s15, v43, 0 711 | s_mov_b64 exec, 3 712 | s_setpc_b64 s[60:61] 713 | -------------------------------------------------------------------------------- /RandomX_OpenCL/GCNASM/randomx_run_gfx900.asm: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see https://www.gnu.org/licenses/. 
18 | */ 19 | 20 | .amdcl2 21 | .gpu GFX900 22 | .64bit 23 | .arch_minor 0 24 | .arch_stepping 0 25 | .driver_version 223600 26 | .kernel randomx_run 27 | .config 28 | .dims x 29 | .cws 64, 1, 1 30 | .sgprsnum 96 31 | # 6 waves per SIMD: 37-40 VGPRs 32 | # 5 waves per SIMD: 41-48 VGPRs 33 | # 4 waves per SIMD: 49-64 VGPRs 34 | # 3 waves per SIMD: 65-84 VGPRs 35 | # 2 waves per SIMD: 85-128 VGPRs 36 | # 1 wave per SIMD: 129-256 VGPRs 37 | .vgprsnum 128 38 | .localsize 256 39 | .floatmode 0xc0 40 | .pgmrsrc1 0x00ac035f 41 | .pgmrsrc2 0x00000090 42 | .dx10clamp 43 | .ieeemode 44 | .useargs 45 | .priority 0 46 | .arg _.global_offset_0, "size_t", long 47 | .arg _.global_offset_1, "size_t", long 48 | .arg _.global_offset_2, "size_t", long 49 | .arg _.printf_buffer, "size_t", void*, global, , rdonly 50 | .arg _.vqueue_pointer, "size_t", long 51 | .arg _.aqlwrap_pointer, "size_t", long 52 | .arg dataset, "uchar*", uchar*, global, const, rdonly 53 | .arg scratchpad, "uchar*", uchar*, global, 54 | .arg registers, "ulong*", ulong*, global, 55 | .arg rounding_modes, "uint*", uint*, global, 56 | .arg programs, "uint*", uint*, global, 57 | .arg batch_size, "uint", uint 58 | .arg rx_parameters, "uint", uint 59 | .text 60 | s_mov_b32 m0, 0x10000 61 | s_dcache_wb 62 | s_waitcnt vmcnt(0) & lgkmcnt(0) 63 | s_icache_inv 64 | s_branch begin 65 | 66 | # pgmrsrc2 = 0x00000090, bits 1:5 = 8, so first 8 SGPRs (s0-s7) contain user data 67 | # s8 contains group id 68 | # v0 contains local id 69 | begin: 70 | v_lshl_add_u32 v1, s8, 6, v0 71 | s_load_dwordx2 s[0:1], s[4:5], 0x0 72 | s_load_dwordx2 s[2:3], s[4:5], 0x40 73 | s_load_dwordx2 s[64:65], s[4:5], 0x48 74 | s_waitcnt lgkmcnt(0) 75 | 76 | # load rounding mode 77 | s_lshl_b32 s16, s8, 2 78 | s_add_u32 s64, s64, s16 79 | s_addc_u32 s65, s65, 0 80 | v_mov_b32 v8, 0 81 | global_load_dword v8, v8, s[64:65] 82 | s_waitcnt vmcnt(0) 83 | v_readlane_b32 s66, v8, 0 84 | s_setreg_b32 hwreg(mode, 2, 2), s66 85 | s_mov_b32 s67, 0 86 | 87 | # used in 
FSQRT_R to check for "positive normal value" (v_cmpx_class_f64) 88 | s_mov_b32 s68, 256 89 | s_mov_b32 s69, 0 90 | 91 | v_add_u32 v1, s0, v1 92 | v_lshrrev_b32 v2, 6, v1 93 | v_lshlrev_b32 v3, 5, v2 94 | v_and_b32 v1, 63, v1 95 | v_mov_b32 v4, 0 96 | v_lshlrev_b64 v[3:4], 3, v[3:4] 97 | v_lshlrev_b32 v5, 4, v1 98 | v_add_co_u32 v3, vcc, s2, v3 99 | v_mov_b32 v6, s3 100 | v_addc_co_u32 v4, vcc, v6, v4, vcc 101 | v_lshlrev_b32 v41, 2, v1 102 | v_add_co_u32 v6, vcc, v3, v41 103 | v_addc_co_u32 v7, vcc, v4, 0, vcc 104 | global_load_dword v6, v[6:7], off 105 | v_mov_b32 v0, 0 106 | s_waitcnt vmcnt(0) 107 | ds_write_b32 v41, v6 108 | s_waitcnt lgkmcnt(0) 109 | s_mov_b64 s[0:1], exec 110 | v_cmpx_le_u32 s[2:3], v1, 7 111 | s_cbranch_execz program_end 112 | 113 | # rx_parameters 114 | s_load_dword s20, s[4:5], 0x5c 115 | s_waitcnt lgkmcnt(0) 116 | 117 | # Scratchpad L1 size 118 | s_bfe_u32 s21, s20, 0x050000 119 | s_lshl_b32 s21, 1, s21 120 | 121 | # Scratchpad L2 size 122 | s_bfe_u32 s22, s20, 0x050005 123 | s_lshl_b32 s22, 1, s22 124 | 125 | # Scratchpad L3 size 126 | s_bfe_u32 s23, s20, 0x05000A 127 | s_lshl_b32 s23, 1, s23 128 | 129 | # program iterations 130 | s_bfe_u32 s24, s20, 0x04000F 131 | s_lshl_b32 s24, 1, s24 132 | 133 | # Base address for scratchpads 134 | s_add_u32 s2, s23, 64 135 | v_mul_hi_u32 v20, v2, s2 136 | v_mul_lo_u32 v2, v2, s2 137 | 138 | # v41, v44 = 0 139 | v_mov_b32 v41, 0 140 | v_mov_b32 v44, 0 141 | 142 | ds_read_b32 v6, v0 offset:152 143 | v_cmp_lt_u32 s[2:3], v1, 4 144 | ds_read2_b64 v[34:37], v0 offset0:18 offset1:16 145 | ds_read_b64 v[11:12], v0 offset:136 146 | s_movk_i32 s9, 0x0 147 | s_mov_b64 s[6:7], exec 148 | s_andn2_b64 exec, s[6:7], s[2:3] 149 | ds_read_b64 v[13:14], v0 offset:160 150 | s_andn2_b64 exec, s[6:7], exec 151 | v_mov_b32 v13, 0 152 | v_mov_b32 v14, 0 153 | s_mov_b64 exec, s[6:7] 154 | 155 | # compiled program size 156 | s_mov_b64 s[6:7], s[8:9] 157 | s_mulk_i32 s6, 10048 158 | 159 | v_add3_u32 v5, v0, v5, 64 160 | 
s_mov_b64 s[8:9], exec 161 | s_andn2_b64 exec, s[8:9], s[2:3] 162 | ds_read_b64 v[15:16], v0 offset:168 163 | s_andn2_b64 exec, s[8:9], exec 164 | v_mov_b32 v15, 0 165 | v_mov_b32 v16, 0 166 | s_mov_b64 exec, s[8:9] 167 | s_load_dwordx4 s[8:11], s[4:5], 0x30 168 | 169 | # batch_size 170 | s_load_dword s16, s[4:5], 0x58 171 | 172 | s_load_dwordx2 s[4:5], s[4:5], 0x50 173 | v_lshlrev_b32 v1, 3, v1 174 | v_add_u32 v17, v0, v1 175 | s_waitcnt lgkmcnt(0) 176 | v_add_co_u32 v2, vcc, s10, v2 177 | v_mov_b32 v18, s11 178 | v_addc_co_u32 v18, vcc, v18, v20, vcc 179 | v_mov_b32 v19, 0xffffff 180 | v_add_co_u32 v6, vcc, s8, v6 181 | v_mov_b32 v20, s9 182 | v_addc_co_u32 v20, vcc, v20, 0, vcc 183 | ds_read_b64 v[21:22], v17 184 | s_add_u32 s4, s4, s6 185 | s_addc_u32 s5, s5, s7 186 | v_cndmask_b32 v19, v19, -1, s[2:3] 187 | v_lshl_add_u32 v8, v35, 3, v0 188 | v_lshl_add_u32 v7, v34, 3, v0 189 | v_lshl_add_u32 v12, v12, 3, v0 190 | v_lshl_add_u32 v0, v11, 3, v0 191 | v_mov_b32 v10, v36 192 | v_mov_b32 v23, v37 193 | 194 | # loop counter 195 | s_sub_u32 s2, s24, 1 196 | 197 | # batch_size 198 | s_mov_b32 s3, s16 199 | 200 | # Scratchpad masks for scratchpads 201 | v_sub_u32 v38, s21, 8 202 | v_sub_u32 v39, s22, 8 203 | v_sub_u32 v50, s23, 8 204 | 205 | # mask for FSCAL_R 206 | v_mov_b32 v51, 0x80F00000 207 | 208 | # load scratchpad base address 209 | v_readlane_b32 s0, v2, 0 210 | v_readlane_b32 s1, v18, 0 211 | 212 | # save current execution mask 213 | s_mov_b64 s[36:37], exec 214 | 215 | # v41 = 0 on lane 0, set it to 8 on lane 1 216 | # v44 = 0 on lane 0, set it to 4 on lane 1 217 | s_mov_b64 exec, 2 218 | v_mov_b32 v41, 8 219 | v_mov_b32 v44, 4 220 | 221 | # load group A registers 222 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1 223 | s_mov_b64 exec, 3 224 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26 225 | ds_read2_b64 v[56:59], v41 offset0:28 offset1:30 226 | 227 | # xmantissaMask 228 | v_mov_b32 v77, (1 << 24) - 1 229 | 230 | # xexponentMask 231 | 
ds_read_b64 v[78:79], v41 offset:160 232 | 233 | # Restore execution mask 234 | s_mov_b64 exec, s[36:37] 235 | 236 | # sign mask (used in FSQRT_R) 237 | v_mov_b32 v82, 0x80000000 238 | 239 | # High 32 bits of "1.0" constant (used in FDIV_M) 240 | v_mov_b32 v83, (1023 << 20) 241 | 242 | # Used to multiply FP64 values by 0.5 243 | v_mov_b32 v84, (1 << 20) 244 | 245 | s_getpc_b64 s[14:15] 246 | cur_addr: 247 | 248 | # get addresses of FSQRT_R subroutines 249 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr 250 | s_addc_u32 s41, s15, 0 251 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr 252 | s_addc_u32 s43, s15, 0 253 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr 254 | s_addc_u32 s45, s15, 0 255 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr 256 | s_addc_u32 s47, s15, 0 257 | 258 | # get addresses of FDIV_M subroutines 259 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr 260 | s_addc_u32 s49, s15, 0 261 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr 262 | s_addc_u32 s51, s15, 0 263 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr 264 | s_addc_u32 s53, s15, 0 265 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr 266 | s_addc_u32 s55, s15, 0 267 | 268 | # get address for ISMULH_R subroutine 269 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr 270 | s_addc_u32 s57, s15, 0 271 | 272 | # get address for IMULH_R subroutine 273 | s_add_u32 s58, s14, imulh_r_sub - cur_addr 274 | s_addc_u32 s59, s15, 0 275 | 276 | # used in IXOR_R instruction 277 | s_mov_b32 s63, -1 278 | 279 | # used in CBRANCH instruction 280 | s_mov_b32 s70, (0xFF << 8) 281 | s_mov_b32 s71, (0xFF << 9) 282 | s_mov_b32 s72, (0xFF << 10) 283 | s_mov_b32 s73, (0xFF << 11) 284 | s_mov_b32 s74, (0xFF << 12) 285 | s_mov_b32 s75, (0xFF << 13) 286 | s_mov_b32 s76, (0xFF << 14) 287 | s_mov_b32 s77, (0xFF << 15) 288 | s_mov_b32 s78, (0xFF << 16) 289 | s_mov_b32 s79, (0xFF << 17) 290 | s_mov_b32 s80, (0xFF << 18) 291 | s_mov_b32 s81, (0xFF << 19) 292 | s_mov_b32 s82, (0xFF << 20) 293 | s_mov_b32 s83, (0xFF << 21) 294 | s_mov_b32 s84, (0xFF 
<< 22) 295 | s_mov_b32 s85, (0xFF << 23) 296 | 297 | # ScratchpadL3Mask64 298 | s_sub_u32 s86, s23, 64 299 | 300 | main_loop: 301 | # const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]); 302 | ds_read_b64 v[24:25], v0 303 | ds_read_b64 v[26:27], v12 304 | s_waitcnt lgkmcnt(0) 305 | v_xor_b32 v25, v27, v25 306 | v_xor_b32 v24, v26, v24 307 | 308 | # spAddr1 ^= spMix.y; 309 | # spAddr0 ^= spMix.x; 310 | v_xor_b32 v10, v25, v10 311 | v_xor_b32 v23, v24, v23 312 | 313 | # spAddr1 &= ScratchpadL3Mask64; 314 | # spAddr0 &= ScratchpadL3Mask64; 315 | v_and_b32 v10, s86, v10 316 | v_and_b32 v23, s86, v23 317 | 318 | # Offset for scratchpads 319 | # offset1 = spAddr1 + sub * 8 320 | # offset0 = spAddr0 + sub * 8 321 | v_add_u32 v10, v10, v1 322 | v_add_u32 v23, v23, v1 323 | 324 | # __global ulong* p1 = (__global ulong*)(scratchpad + offset1); 325 | # __global ulong* p0 = (__global ulong*)(scratchpad + offset0); 326 | v_add_co_u32 v26, vcc, v2, v10 327 | v_addc_co_u32 v27, vcc, v18, 0, vcc 328 | v_add_co_u32 v23, vcc, v2, v23 329 | v_addc_co_u32 v24, vcc, v18, 0, vcc 330 | 331 | # load from spAddr1 332 | global_load_dwordx2 v[28:29], v[26:27], off 333 | 334 | # load from spAddr0 335 | global_load_dwordx2 v[30:31], v[23:24], off 336 | s_waitcnt vmcnt(1) 337 | 338 | v_cvt_f64_i32 v[32:33], v28 339 | v_cvt_f64_i32 v[28:29], v29 340 | s_waitcnt vmcnt(0) 341 | 342 | # R[sub] ^= *p0; 343 | v_xor_b32 v34, v21, v30 344 | v_xor_b32 v35, v22, v31 345 | 346 | v_add_co_u32 v22, vcc, v6, v36 347 | v_addc_co_u32 v25, vcc, v20, 0, vcc 348 | v_add_co_u32 v21, vcc, v22, v1 349 | v_addc_co_u32 v22, vcc, v25, 0, vcc 350 | global_load_dwordx2 v[21:22], v[21:22], off 351 | v_or_b32 v30, v32, v13 352 | v_and_or_b32 v31, v33, v19, v14 353 | v_or_b32 v28, v28, v15 354 | v_and_or_b32 v29, v29, v19, v16 355 | ds_write2_b64 v5, v[30:31], v[28:29] offset1:1 356 | s_waitcnt lgkmcnt(0) 357 | 358 | # Program 0 359 | 360 | # load group F,E registers 361 | # Read low 8 bytes into lane 0 and high 8 bytes 
into lane 1 362 | s_mov_b64 exec, 3 363 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10 364 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14 365 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18 366 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22 367 | 368 | # load VM integer registers 369 | v_readlane_b32 s16, v34, 0 370 | v_readlane_b32 s17, v35, 0 371 | v_readlane_b32 s18, v34, 1 372 | v_readlane_b32 s19, v35, 1 373 | v_readlane_b32 s20, v34, 2 374 | v_readlane_b32 s21, v35, 2 375 | v_readlane_b32 s22, v34, 3 376 | v_readlane_b32 s23, v35, 3 377 | v_readlane_b32 s24, v34, 4 378 | v_readlane_b32 s25, v35, 4 379 | v_readlane_b32 s26, v34, 5 380 | v_readlane_b32 s27, v35, 5 381 | v_readlane_b32 s28, v34, 6 382 | v_readlane_b32 s29, v35, 6 383 | v_readlane_b32 s30, v34, 7 384 | v_readlane_b32 s31, v35, 7 385 | 386 | s_waitcnt lgkmcnt(0) 387 | 388 | # call JIT code 389 | s_swappc_b64 s[12:13], s[4:5] 390 | 391 | # Write out group F,E registers 392 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1 393 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10 394 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14 395 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18 396 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22 397 | 398 | # store VM integer registers 399 | v_writelane_b32 v28, s16, 0 400 | v_writelane_b32 v29, s17, 0 401 | v_writelane_b32 v28, s18, 1 402 | v_writelane_b32 v29, s19, 1 403 | v_writelane_b32 v28, s20, 2 404 | v_writelane_b32 v29, s21, 2 405 | v_writelane_b32 v28, s22, 3 406 | v_writelane_b32 v29, s23, 3 407 | v_writelane_b32 v28, s24, 4 408 | v_writelane_b32 v29, s25, 4 409 | v_writelane_b32 v28, s26, 5 410 | v_writelane_b32 v29, s27, 5 411 | v_writelane_b32 v28, s28, 6 412 | v_writelane_b32 v29, s29, 6 413 | v_writelane_b32 v28, s30, 7 414 | v_writelane_b32 v29, s31, 7 415 | 416 | # Restore execution mask 417 | s_mov_b64 exec, s[36:37] 418 | 419 | # Write out VM integer registers 420 | 
	ds_write_b64 v17, v[28:29]

	s_waitcnt lgkmcnt(0)
	v_xor_b32 v21, v28, v21
	v_xor_b32 v22, v29, v22
	ds_read_b32 v28, v7
	ds_read_b32 v29, v8
	ds_write_b64 v17, v[21:22]
	s_waitcnt lgkmcnt(1)
	ds_read2_b64 v[30:33], v17 offset0:8 offset1:16
	v_xor_b32 v10, v28, v37
	s_waitcnt lgkmcnt(0)
	v_xor_b32 v30, v32, v30
	v_xor_b32 v31, v33, v31
	v_xor_b32 v10, v10, v29
	global_store_dwordx2 v[26:27], v[21:22], off
	v_and_b32 v10, 0x7fffffc0, v10
	global_store_dwordx2 v[23:24], v[30:31], off
	# s2 counts the remaining iterations: leave the loop when it is 0,
	# otherwise decrement it and go around again.
	s_cmp_eq_u32 s2, 0
	s_cbranch_scc1 main_loop_end
	s_sub_i32 s2, s2, 1
	v_mov_b32 v37, v36
	v_mov_b32 v23, 0
	v_mov_b32 v36, v10
	v_mov_b32 v10, 0
	s_branch main_loop
main_loop_end:

	# Final stores: three 8-byte values at offsets 0, 64 and 128 from the
	# 64-bit address formed as (v4:v3) + v1.
	# NOTE(review): v1/v3/v4 are set up before this chunk - presumably the
	# per-lane offset and the base of the register file; confirm.
	v_add_co_u32 v0, vcc, v3, v1
	v_addc_co_u32 v1, vcc, v4, 0, vcc
	global_store_dwordx2 v[0:1], v[21:22], off
	global_store_dwordx2 v[0:1], v[30:31], off inst_offset:64
	global_store_dwordx2 v[0:1], v[32:33], off inst_offset:128

	# store rounding mode
	# (s66 is written as one dword to the address held in s[64:65])
	v_mov_b32 v0, 0
	v_mov_b32 v1, s66
	global_store_dword v0, v1, s[64:65]

program_end:
	s_endpgm

# ----------------------------------------------------------------------------
# fsqrt_r_subN: replace the double held in a VGPR pair with its square root.
#   fsqrt_r_sub0 -> v[68:69]    fsqrt_r_sub1 -> v[70:71]
#   fsqrt_r_sub2 -> v[72:73]    fsqrt_r_sub3 -> v[74:75]
#
# The four routines are identical apart from the register pair. Each one:
#   1. writes s67 into MODE bits [3:2] (hwreg(mode, 2, 2)), the rounding mode
#      used by f64 instructions, for the intermediate steps;
#   2. refines the v_rsq_f64 estimate y into g ~ sqrt(x) and a helper h
#      using FMAs;
#   3. writes s66 into the same MODE bits and performs the last FMA, so only
#      the final result is rounded with the s66 mode (s66 is the value saved
#      as "rounding mode" at program end);
#   4. copies the result into the register pair only in lanes selected by
#      v_cmpx_class_f64 (class mask in s[68:69]), then restores exec to 3.
#
# Returns by jumping to the address in s[60:61].
# Overwrites v[28:29], v[42:43], v[46:49], s[14:15] and exec.
#
# NOTE(review): s67, s[68:69], v82 and v84 are loaded outside this chunk.
# From how they are used they look like: s67 = round-to-nearest, s[68:69] =
# classes for which the computed root is valid, v82 = sign-bit mask for the
# high dword, v84 = exponent decrement that halves the estimate. Confirm
# against the kernel prologue.
# ----------------------------------------------------------------------------
fsqrt_r_sub0:
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rsq_f64 v[28:29], v[68:69]

	# Improve initial approximation (can be skipped)
	#v_mul_f64 v[42:43], v[28:29], v[68:69]
	#v_mul_f64 v[48:49], v[28:29], -0.5
	#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

	# g = x * y
	v_mul_f64 v[42:43], v[28:29], v[68:69]
	# h = y with v84 subtracted from its high dword
	v_mov_b32 v48, v28
	v_sub_u32 v49, v29, v84
	# v[46:47] = h with its high dword XORed with v82
	v_mov_b32 v46, v28
	v_xor_b32 v47, v49, v82
	# r = v[46:47] * g + 0.5
	v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
	# g = g * r + g ; h = h * r + h
	v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
	# d = x - g * g
	v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]
	# switch to the s66 rounding mode for the final step: result = d * h + g
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
	# restrict exec to lanes whose input matches the class mask, so the
	# original value is kept in every other lane
	v_cmpx_class_f64 s[14:15], v[68:69], s[68:69]
	v_mov_b32 v68, v42
	v_mov_b32 v69, v43
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

# Same sequence as fsqrt_r_sub0, operating on v[70:71].
fsqrt_r_sub1:
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rsq_f64 v[28:29], v[70:71]

	# Improve initial approximation (can be skipped)
	#v_mul_f64 v[42:43], v[28:29], v[70:71]
	#v_mul_f64 v[48:49], v[28:29], -0.5
	#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64 v[42:43], v[28:29], v[70:71]
	v_mov_b32 v48, v28
	v_sub_u32 v49, v29, v84
	v_mov_b32 v46, v28
	v_xor_b32 v47, v49, v82
	v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
	v_cmpx_class_f64 s[14:15], v[70:71], s[68:69]
	v_mov_b32 v70, v42
	v_mov_b32 v71, v43
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

# Same sequence as fsqrt_r_sub0, operating on v[72:73].
fsqrt_r_sub2:
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rsq_f64 v[28:29], v[72:73]

	# Improve initial approximation (can be skipped)
	#v_mul_f64 v[42:43], v[28:29], v[72:73]
	#v_mul_f64 v[48:49], v[28:29], -0.5
	#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64 v[42:43], v[28:29], v[72:73]
	v_mov_b32 v48, v28
	v_sub_u32 v49, v29, v84
	v_mov_b32 v46, v28
	v_xor_b32 v47, v49, v82
	v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
	v_cmpx_class_f64 s[14:15], v[72:73], s[68:69]
	v_mov_b32 v72, v42
	v_mov_b32 v73, v43
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

# Same sequence as fsqrt_r_sub0, operating on v[74:75].
fsqrt_r_sub3:
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rsq_f64 v[28:29], v[74:75]

	# Improve initial approximation (can be skipped)
	#v_mul_f64 v[42:43], v[28:29], v[74:75]
	#v_mul_f64 v[48:49], v[28:29], -0.5
	#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64 v[42:43], v[28:29], v[74:75]
	v_mov_b32 v48, v28
	v_sub_u32 v49, v29, v84
	v_mov_b32 v46, v28
	v_xor_b32 v47, v49, v82
	v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
	v_cmpx_class_f64 s[14:15], v[74:75], s[68:69]
	v_mov_b32 v74, v42
	v_mov_b32 v75, v43
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

# ----------------------------------------------------------------------------
# fdiv_m_subN: divide the double in a VGPR pair by the operand in v[28:29].
#   fdiv_m_sub0 -> v[68:69]    fdiv_m_sub1 -> v[70:71]
#   fdiv_m_sub2 -> v[72:73]    fdiv_m_sub3 -> v[74:75]
#
# The four routines are identical apart from the dividend register pair.
# Each one:
#   1. masks the divisor: low dword |= v78, high dword = (hi & v77) | v79;
#   2. with the s67 rounding mode, takes v_rcp_f64 of the divisor, applies
#      one FMA correction step, forms a quotient estimate q and its
#      residual;
#   3. with the s66 rounding mode, performs the final FMA and passes the
#      result through v_div_fixup_f64;
#   4. in lanes where dividend == divisor, overwrites the result with the
#      constant {low = 0, high = v83};
#   5. restores exec to 3 and copies the result into the dividend pair.
#
# Returns by jumping to the address in s[60:61].
# Overwrites v[28:29], v[42:43], v[48:49], v[80:81], s[14:15] and exec.
#
# NOTE(review): v77/v78/v79/v83 are loaded outside this chunk. They look like
# the operand masks applied to memory operands and the high dword of 1.0
# respectively - confirm against the kernel prologue.
# ----------------------------------------------------------------------------
fdiv_m_sub0:
	v_or_b32 v28, v28, v78
	v_and_or_b32 v29, v29, v77, v79
	s_setreg_b32 hwreg(mode, 2, 2), s67
	# y = rcp(b), then y = y + y * (1 - b * y)
	v_rcp_f64 v[48:49], v[28:29]
	v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
	# q = a * y ; r = a - b * q
	v_mul_f64 v[80:81], v[68:69], v[48:49]
	v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]
	# final step in the s66 rounding mode: q + r * y
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
	# lanes with a == b get the constant {0, v83} instead
	v_cmpx_eq_f64 s[14:15], v[68:69], v[28:29]
	v_mov_b32 v80, 0
	v_mov_b32 v81, v83
	s_mov_b64 exec, 3
	v_mov_b32 v68, v80
	v_mov_b32 v69, v81
	s_setpc_b64 s[60:61]

# Same sequence as fdiv_m_sub0, dividend in v[70:71].
fdiv_m_sub1:
	v_or_b32 v28, v28, v78
	v_and_or_b32 v29, v29, v77, v79
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rcp_f64 v[48:49], v[28:29]
	v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64 v[80:81], v[70:71], v[48:49]
	v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
	v_cmpx_eq_f64 s[14:15], v[70:71], v[28:29]
	v_mov_b32 v80, 0
	v_mov_b32 v81, v83
	s_mov_b64 exec, 3
	v_mov_b32 v70, v80
	v_mov_b32 v71, v81
	s_setpc_b64 s[60:61]

# Same sequence as fdiv_m_sub0, dividend in v[72:73].
fdiv_m_sub2:
	v_or_b32 v28, v28, v78
	v_and_or_b32 v29, v29, v77, v79
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rcp_f64 v[48:49], v[28:29]
	v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64 v[80:81], v[72:73], v[48:49]
	v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
	v_cmpx_eq_f64 s[14:15], v[72:73], v[28:29]
	v_mov_b32 v80, 0
	v_mov_b32 v81, v83
	s_mov_b64 exec, 3
	v_mov_b32 v72, v80
	v_mov_b32 v73, v81
	s_setpc_b64 s[60:61]

# Same sequence as fdiv_m_sub0, dividend in v[74:75].
fdiv_m_sub3:
	v_or_b32 v28, v28, v78
	v_and_or_b32 v29, v29, v77, v79
	s_setreg_b32 hwreg(mode, 2, 2), s67
	v_rcp_f64 v[48:49], v[28:29]
	v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64 v[80:81], v[74:75], v[48:49]
	v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]
	s_setreg_b32 hwreg(mode, 2, 2), s66
	v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
	v_cmpx_eq_f64 s[14:15], v[74:75], v[28:29]
	v_mov_b32 v80, 0
	v_mov_b32 v81, v83
	s_mov_b64 exec, 3
	v_mov_b32 v74, v80
	v_mov_b32 v75, v81
	s_setpc_b64 s[60:61]

# ----------------------------------------------------------------------------
# ismulh_r_sub: signed 64 x 64 -> upper 64 bits of the product.
#   in : s[14:15], s[38:39]
#   out: s[14:15]
#
# Runs with exec = 1 (lane 0 only). The unsigned upper half is assembled from
# 32-bit partial products with v_mul_hi_u32 / v_mad_u64_u32 and read back
# with v_readlane_b32. It is then corrected to the signed result by
# subtracting s[38:39] when s[14:15] is negative and subtracting the original
# s[14:15] when s[38:39] is negative.
#
# Returns by jumping to the address in s[60:61]; exec is reset to 3.
# Overwrites v40, v42, v43, v45, v46, v47, s[32:35] and vcc.
# NOTE(review): v41 and v44 are read as the upper dwords of 64-bit addends but
# are not written here - presumably held at zero by the caller; confirm.
# ----------------------------------------------------------------------------
ismulh_r_sub:
	s_mov_b64 exec, 1
	v_mov_b32 v45, s14
	v_mul_hi_u32 v40, s38, v45
	v_mov_b32 v47, s15
	v_mad_u64_u32 v[42:43], s[32:33], s38, v47, v[40:41]
	v_mov_b32 v40, v42
	v_mad_u64_u32 v[45:46], s[32:33], s39, v45, v[40:41]
	v_mad_u64_u32 v[42:43], s[32:33], s39, v47, v[43:44]
	v_add_co_u32 v42, vcc, v42, v46
	v_addc_co_u32 v43, vcc, 0, v43, vcc
	v_readlane_b32 s32, v42, 0
	v_readlane_b32 s33, v43, 0
	# signed correction, part 1: if s[14:15] < 0 subtract s[38:39]
	s_cmp_lt_i32 s15, 0
	s_cselect_b64 s[34:35], s[38:39], 0
	s_sub_u32 s32, s32, s34
	s_subb_u32 s33, s33, s35
	# signed correction, part 2: if s[38:39] < 0 subtract s[14:15]
	s_cmp_lt_i32 s39, 0
	s_cselect_b64 s[34:35], s[14:15], 0
	s_sub_u32 s14, s32, s34
	s_subb_u32 s15, s33, s35
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

# ----------------------------------------------------------------------------
# imulh_r_sub: unsigned 64 x 64 -> upper 64 bits of the product.
#   in : s[14:15], s[38:39]
#   out: s[14:15]
#
# Same partial-product scheme as ismulh_r_sub without the signed correction.
# Runs with exec = 1, resets exec to 3 and returns through s[60:61].
# Overwrites v40, v42, v43, v45, v46, v47, s[32:33] and vcc.
# ----------------------------------------------------------------------------
imulh_r_sub:
	s_mov_b64 exec, 1
	v_mov_b32 v45, s38
	v_mul_hi_u32 v40, s14, v45
	v_mov_b32 v47, s39
	v_mad_u64_u32 v[42:43], s[32:33], s14, v47, v[40:41]
	v_mov_b32 v40, v42
	v_mad_u64_u32 v[45:46], s[32:33], s15, v45, v[40:41]
	v_mad_u64_u32 v[42:43], s[32:33], s15, v47, v[43:44]
	v_add_co_u32 v42, vcc, v42, v46
	v_addc_co_u32 v43, vcc, 0, v43, vcc
	v_readlane_b32 s14, v42, 0
	v_readlane_b32 s15, v43, 0
	s_mov_b64 exec, 3
	s_setpc_b64 s[60:61]

--------------------------------------------------------------------------------
/RandomX_OpenCL/RandomX_OpenCL.cpp:
--------------------------------------------------------------------------------
/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "tests.h" 25 | #include "miner.h" 26 | 27 | int main(int argc, char** argv) 28 | { 29 | if (argc < 2) 30 | { 31 | printf("Usage: %s --mine [--validate] [--platform_id N] [--device_id N] [--intensity N] [--portable] [--workers N] [--bfactor N] [--dataset_host]\n\n", argv[0]); 32 | printf("platform_id 0 if you have only 1 OpenCL platform\n"); 33 | printf("device_id 0 if you have only 1 GPU\n"); 34 | printf("intensity number of scratchpads to allocate, if it's not set then as many as possible will be allocated.\n\n"); 35 | printf("portable use generic OpenCL code that works on all GPUs.\n\n"); 36 | printf("workers number of parallel workers per hash to run in portable mode. Can be 2,4,8,16, default is 8.\n\n"); 37 | printf("bfactor splits main loop into multiple sub-steps. Use it to improve screen responsiveness. Can be 0-10, default is 5.\n\n"); 38 | printf("dataset_host allocate dataset on host. 
This is required for 2 GB GPUs.\n\n"); 39 | printf("Examples:\n%s --mine --validate --intensity 1984\n", argv[0]); 40 | return 0; 41 | } 42 | 43 | uint32_t platform_id = 0; 44 | uint32_t device_id = 0; 45 | size_t intensity = 0; 46 | uint32_t start_nonce = 0; 47 | uint32_t workers_per_hash = 8; 48 | uint32_t bfactor = 5; 49 | bool portable = false; 50 | bool dataset_host_allocated = false; 51 | bool validate = false; 52 | 53 | for (int i = 1; i < argc; ++i) 54 | { 55 | if ((strcmp(argv[i], "--platform_id") == 0) && (i + 1 < argc)) 56 | platform_id = atoi(argv[i + 1]); 57 | else if ((strcmp(argv[i], "--device_id") == 0) && (i + 1 < argc)) 58 | device_id = atoi(argv[i + 1]); 59 | else if ((strcmp(argv[i], "--intensity") == 0) && (i + 1 < argc)) 60 | intensity = atoi(argv[i + 1]); 61 | else if ((strcmp(argv[i], "--nonce") == 0) && (i + 1 < argc)) 62 | start_nonce = atoi(argv[i + 1]); 63 | else if ((strcmp(argv[i], "--workers") == 0) && (i + 1 < argc)) 64 | workers_per_hash = atoi(argv[i + 1]); 65 | else if ((strcmp(argv[i], "--bfactor") == 0) && (i + 1 < argc)) 66 | bfactor = atoi(argv[i + 1]); 67 | else if (strcmp(argv[i], "--portable") == 0) 68 | portable = true; 69 | else if (strcmp(argv[i], "--dataset_host") == 0) 70 | dataset_host_allocated = true; 71 | else if (strcmp(argv[i], "--validate") == 0) 72 | validate = true; 73 | } 74 | 75 | if (strcmp(argv[1], "--mine") == 0) 76 | return test_mining(platform_id, device_id, intensity, start_nonce, workers_per_hash, bfactor, portable, dataset_host_allocated, validate) ? 0 : 1; 77 | else if (strcmp(argv[1], "--test") == 0) 78 | return tests(platform_id, device_id, intensity) ? 
0 : 1; 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /RandomX_OpenCL/RandomX_OpenCL.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15.0 15 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4} 16 | Win32Proj 17 | RandomXOpenCL 18 | 10.0 19 | 20 | 21 | 22 | Application 23 | true 24 | v142 25 | MultiByte 26 | 27 | 28 | Application 29 | false 30 | v142 31 | true 32 | MultiByte 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | true 48 | 49 | 50 | false 51 | NativeRecommendedRules.ruleset 52 | 53 | 54 | 55 | Level4 56 | Disabled 57 | true 58 | true 59 | $(CUDA_PATH)\include 60 | true 61 | _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) 62 | 63 | 64 | Console 65 | true 66 | OpenCL.lib;%(AdditionalDependencies) 67 | $(CUDA_PATH)\lib\x64 68 | 69 | 70 | 71 | 72 | Level4 73 | MaxSpeed 74 | true 75 | true 76 | true 77 | true 78 | $(CUDA_PATH)\include 79 | true 80 | _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) 81 | 82 | 83 | Console 84 | true 85 | true 86 | true 87 | OpenCL.lib;%(AdditionalDependencies) 88 | $(CUDA_PATH)\lib\x64 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | {3346a4ad-c438-4324-8b77-47a16452954b} 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | Document 120 | clrxasm %(Identity) -o %(Filename).bin 121 | %(Filename).bin 122 | clrxasm %(Identity) -o %(Filename).bin 123 | %(Filename).bin 124 | 125 | 126 | Document 127 | clrxasm %(Identity) -o %(Filename).bin 128 | clrxasm %(Identity) -o %(Filename).bin 129 | %(Filename).bin 130 | %(Filename).bin 131 | 132 | 133 | Document 134 | clrxasm %(Identity) -o %(Filename).bin 135 | %(Filename).bin 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- 
/RandomX_OpenCL/RandomX_OpenCL.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {7789d323-c959-469b-addc-91336f3201fc} 14 | 15 | 16 | {181c0a82-12c6-4d82-8e7f-51fe2a1cee17} 17 | 18 | 19 | 20 | 21 | Source Files 22 | 23 | 24 | Source Files 25 | 26 | 27 | Source Files 28 | 29 | 30 | Source Files 31 | 32 | 33 | 34 | 35 | Header Files 36 | 37 | 38 | Header Files 39 | 40 | 41 | Header Files 42 | 43 | 44 | Header Files 45 | 46 | 47 | Header Files 48 | 49 | 50 | Header Files 51 | 52 | 53 | 54 | 55 | Source Files\CL 56 | 57 | 58 | Source Files\CL 59 | 60 | 61 | Source Files\CL 62 | 63 | 64 | Source Files\CL 65 | 66 | 67 | Source Files\CL 68 | 69 | 70 | Source Files\CL 71 | 72 | 73 | Source Files\CL 74 | 75 | 76 | 77 | 78 | Source Files\GCNASM 79 | 80 | 81 | Source Files\GCNASM 82 | 83 | 84 | Source Files\GCNASM 85 | 86 | 87 | -------------------------------------------------------------------------------- /RandomX_OpenCL/RandomX_OpenCL.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | --mine --validate --portable --workers 8 --bfactor 5 --intensity 3584 5 | WindowsLocalDebugger 6 | 7 | 8 | --mine --validate --intensity 1984 9 | WindowsLocalDebugger 10 | 11 | -------------------------------------------------------------------------------- /RandomX_OpenCL/definitions.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include "CL/randomx_constants.h" 24 | #include "CL/randomx_constants_jit.h" 25 | 26 | static const std::string AES_CL = "CL/aes.cl"; 27 | static const std::string CL_FILLAES1RX4_SCRATCHPAD = "fillAes1Rx4_scratchpad"; 28 | static const std::string CL_FILLAES4RX4_ENTROPY = "fillAes4Rx4_entropy"; 29 | static const std::string CL_HASHAES1RX4 = "hashAes1Rx4"; 30 | 31 | static const std::string BLAKE2B_CL = "CL/blake2b.cl"; 32 | static const std::string CL_BLAKE2B_INITIAL_HASH = "blake2b_initial_hash"; 33 | static const std::string CL_BLAKE2B_HASH_REGISTERS_32 = "blake2b_hash_registers_32"; 34 | static const std::string CL_BLAKE2B_HASH_REGISTERS_64 = "blake2b_hash_registers_64"; 35 | static const std::string CL_BLAKE2B_512_SINGLE_BLOCK_BENCH = "blake2b_512_single_block_bench"; 36 | static const std::string CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH = "blake2b_512_double_block_bench"; 37 | 38 | static const std::string RANDOMX_INIT_CL = "CL/randomx_init.cl"; 39 | static const std::string CL_RANDOMX_INIT = "randomx_init"; 40 | 41 | static const std::string RANDOMX_RUN_CL = "CL/randomx_run.cl"; 42 | static const std::string CL_RANDOMX_RUN = "randomx_run"; 43 | 44 | static const std::string RANDOMX_VM_CL = "CL/randomx_vm.cl"; 45 | static const std::string CL_INIT_VM = "init_vm"; 46 | static const 
std::string CL_EXECUTE_VM = "execute_vm"; 47 | 48 | static uint8_t blockTemplate[] = { 49 | 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 50 | 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, 51 | 0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca, 52 | 0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09 53 | }; 54 | -------------------------------------------------------------------------------- /RandomX_OpenCL/makefile: -------------------------------------------------------------------------------- 1 | release: *.h *.cpp GCNASM/*.asm 2 | clrxasm GCNASM/randomx_run_gfx803.asm -o randomx_run_gfx803.bin 3 | clrxasm GCNASM/randomx_run_gfx900.asm -o randomx_run_gfx900.bin 4 | g++ *.cpp -O3 -lOpenCL -lpthread ../RandomX/build/librandomx.a -o opencl_test 5 | -------------------------------------------------------------------------------- /RandomX_OpenCL/miner.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 
18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "miner.h" 26 | #include "opencl_helpers.h" 27 | #include "definitions.h" 28 | 29 | #include "../RandomX/src/randomx.h" 30 | #include "../RandomX/src/configuration.h" 31 | #include "../RandomX/src/common.hpp" 32 | 33 | using namespace std::chrono; 34 | 35 | bool test_mining(uint32_t platform_id, uint32_t device_id, size_t intensity, uint32_t start_nonce, uint32_t workers_per_hash, uint32_t bfactor, bool portable, bool dataset_host_allocated, bool validate) 36 | { 37 | std::cout << "Initializing GPU #" << device_id << " on OpenCL platform #" << platform_id << std::endl << std::endl; 38 | 39 | OpenCLContext ctx; 40 | if (!ctx.Init(platform_id, device_id)) 41 | { 42 | return false; 43 | } 44 | 45 | if (!ctx.Compile("base_kernels.bin", 46 | { 47 | AES_CL, 48 | BLAKE2B_CL 49 | }, 50 | { 51 | CL_FILLAES1RX4_SCRATCHPAD, 52 | CL_FILLAES4RX4_ENTROPY, 53 | CL_HASHAES1RX4, 54 | CL_BLAKE2B_INITIAL_HASH, 55 | CL_BLAKE2B_HASH_REGISTERS_32, 56 | CL_BLAKE2B_HASH_REGISTERS_64, 57 | CL_BLAKE2B_512_SINGLE_BLOCK_BENCH, 58 | CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH 59 | }, 60 | "", COMPILE_CACHE_BINARY)) 61 | { 62 | return false; 63 | } 64 | 65 | int gcn_version = 12; 66 | 67 | if (portable) 68 | { 69 | switch (workers_per_hash) 70 | { 71 | case 2: 72 | case 4: 73 | case 8: 74 | case 16: 75 | break; 76 | 77 | default: 78 | workers_per_hash = 8; 79 | break; 80 | } 81 | 82 | if (bfactor > 10) 83 | bfactor = 10; 84 | 85 | std::stringstream options; 86 | options << "-D WORKERS_PER_HASH=" << workers_per_hash << " -Werror"; 87 | if (!ctx.Compile("randomx_vm.bin", { RANDOMX_VM_CL }, { CL_INIT_VM, CL_EXECUTE_VM }, options.str(), COMPILE_CACHE_BINARY)) 88 | { 89 | return false; 90 | } 91 | } 92 | else 93 | { 94 | const char* gcn_binary = "randomx_run_gfx803.bin"; 95 | 96 | std::vector t; 97 | std::transform(ctx.device_name.begin(), ctx.device_name.end(), std::back_inserter(t), [](char c) { return 
static_cast(std::toupper(c)); }); 98 | if ((strcmp(t.data(), "GFX900") == 0) || (strcmp(t.data(), "GFX906") == 0)) 99 | { 100 | gcn_binary = "randomx_run_gfx900.bin"; 101 | gcn_version = 14; 102 | } 103 | else if ((strcmp(t.data(), "GFX1010") == 0) || (strcmp(t.data(), "GFX1011") == 0) || (strcmp(t.data(), "GFX1012") == 0)) 104 | { 105 | gcn_binary = "randomx_run_gfx1010.bin"; 106 | gcn_version = 15; 107 | } 108 | 109 | std::stringstream options; 110 | options << "-D GCN_VERSION=" << gcn_version; 111 | if (!ctx.Compile("randomx_init.bin", { RANDOMX_INIT_CL }, { CL_RANDOMX_INIT }, options.str(), ALWAYS_COMPILE)) 112 | { 113 | return false; 114 | } 115 | 116 | options.str(""); 117 | options << "-D RANDOMX_PROGRAM_ITERATIONS=" << RANDOMX_PROGRAM_ITERATIONS; 118 | if (!ctx.Compile(gcn_binary, { RANDOMX_RUN_CL }, { CL_RANDOMX_RUN }, options.str(), ALWAYS_USE_BINARY, ctx.elf_binary_flags)) 119 | { 120 | return false; 121 | } 122 | } 123 | 124 | if (!intensity) 125 | intensity = std::min(ctx.device_max_alloc_size, ctx.device_global_mem_size) / RANDOMX_SCRATCHPAD_L3; 126 | 127 | intensity -= (intensity & 63); 128 | 129 | const size_t dataset_size = randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE; 130 | cl_int err; 131 | cl_mem dataset_gpu = nullptr; 132 | if (!dataset_host_allocated) 133 | { 134 | dataset_gpu = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, dataset_size, nullptr, &err); 135 | CL_CHECK_RESULT(clCreateBuffer); 136 | std::cout << "Allocated " << (dataset_size / 1048576.0) << " MB dataset on GPU" << std::endl; 137 | } 138 | std::cout << "Initializing dataset..."; 139 | 140 | randomx_dataset *myDataset; 141 | bool large_pages_available = true; 142 | { 143 | auto t1 = high_resolution_clock::now(); 144 | 145 | myDataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES); 146 | if (!myDataset) 147 | { 148 | std::cout << "\nCouldn't allocate dataset using large pages" << std::endl; 149 | myDataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT); 150 | 
large_pages_available = false; 151 | } 152 | 153 | char* dataset_memory = reinterpret_cast(randomx_get_dataset_memory(myDataset)); 154 | bool read_ok = false; 155 | 156 | FILE* fp = fopen("dataset.bin", "rb"); 157 | if (fp) 158 | { 159 | read_ok = (fread(dataset_memory, 1, randomx::DatasetSize, fp) == randomx::DatasetSize); 160 | fclose(fp); 161 | } 162 | 163 | if (!read_ok) 164 | { 165 | randomx_cache *myCache = randomx_alloc_cache((randomx_flags)(RANDOMX_FLAG_JIT | (large_pages_available ? RANDOMX_FLAG_LARGE_PAGES : 0))); 166 | if (!myCache) 167 | { 168 | std::cout << "\nCouldn't allocate cache using large pages" << std::endl; 169 | myCache = randomx_alloc_cache(RANDOMX_FLAG_JIT); 170 | large_pages_available = false; 171 | } 172 | 173 | const char mySeed[] = "RandomX example seed"; 174 | randomx_init_cache(myCache, mySeed, sizeof(mySeed)); 175 | 176 | std::vector threads; 177 | for (uint32_t i = 0, n = std::thread::hardware_concurrency(); i < n; ++i) 178 | threads.emplace_back([myDataset, myCache, i, n]() { randomx_init_dataset(myDataset, myCache, (i * randomx_dataset_item_count()) / n, ((i + 1) * randomx_dataset_item_count()) / n - (i * randomx_dataset_item_count()) / n); }); 179 | 180 | for (auto& t : threads) 181 | t.join(); 182 | 183 | randomx_release_cache(myCache); 184 | 185 | fp = fopen("dataset.bin", "wb"); 186 | if (fp) 187 | { 188 | fwrite(dataset_memory, 1, randomx::DatasetSize, fp); 189 | fclose(fp); 190 | } 191 | } 192 | 193 | if (!dataset_host_allocated) 194 | { 195 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, dataset_gpu, CL_TRUE, 0, dataset_size, randomx_get_dataset_memory(myDataset), 0, nullptr, nullptr); 196 | } 197 | 198 | std::cout << "done in " << (duration_cast(high_resolution_clock::now() - t1).count() / 1e9) << " seconds" << std::endl; 199 | } 200 | 201 | if (dataset_host_allocated) 202 | { 203 | dataset_gpu = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, randomx_get_dataset_memory(myDataset), 
&err); 204 | CL_CHECK_RESULT(clCreateBuffer); 205 | std::cout << "Using host-allocated " << (dataset_size / 1048576.0) << " MB dataset" << std::endl; 206 | } 207 | 208 | ALLOCATE_DEVICE_MEMORY(scratchpads_gpu, ctx, intensity * (RANDOMX_SCRATCHPAD_L3 + 64)); 209 | std::cout << "Allocated " << intensity << " scratchpads\n" << std::endl; 210 | 211 | ALLOCATE_DEVICE_MEMORY(hashes_gpu, ctx, intensity * INITIAL_HASH_SIZE); 212 | ALLOCATE_DEVICE_MEMORY(entropy_gpu, ctx, intensity * ENTROPY_SIZE); 213 | ALLOCATE_DEVICE_MEMORY(vm_states_gpu, ctx, portable ? (intensity * VM_STATE_SIZE) : (intensity * REGISTERS_SIZE)); 214 | ALLOCATE_DEVICE_MEMORY(rounding_gpu, ctx, intensity * sizeof(uint32_t)); 215 | ALLOCATE_DEVICE_MEMORY(blocktemplate_gpu, ctx, intensity * sizeof(blockTemplate)); 216 | ALLOCATE_DEVICE_MEMORY(intermediate_programs_gpu, ctx, portable ? 0 : (intensity * INTERMEDIATE_PROGRAM_SIZE)); 217 | ALLOCATE_DEVICE_MEMORY(compiled_programs_gpu, ctx, portable ? 0 : (intensity * COMPILED_PROGRAM_SIZE)); 218 | 219 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, blocktemplate_gpu, CL_TRUE, 0, sizeof(blockTemplate), blockTemplate, 0, nullptr, nullptr); 220 | 221 | auto prev_time = high_resolution_clock::now(); 222 | 223 | std::vector hashes, hashes_check; 224 | hashes.resize(intensity * 32); 225 | hashes_check.resize(intensity * 32); 226 | 227 | std::vector threads; 228 | std::atomic nonce_counter; 229 | bool cpu_limited = false; 230 | 231 | uint32_t failed_nonces = 0; 232 | 233 | cl_kernel kernel_blake2b_initial_hash = ctx.kernels[CL_BLAKE2B_INITIAL_HASH]; 234 | if (!clSetKernelArgs(kernel_blake2b_initial_hash, hashes_gpu, blocktemplate_gpu, 0U)) 235 | { 236 | return false; 237 | } 238 | 239 | cl_kernel kernel_fillaes1rx4_scratchpad = ctx.kernels[CL_FILLAES1RX4_SCRATCHPAD]; 240 | if (!clSetKernelArgs(kernel_fillaes1rx4_scratchpad, hashes_gpu, scratchpads_gpu, static_cast(intensity))) 241 | { 242 | return false; 243 | } 244 | 245 | cl_kernel kernel_fillaes1rx4_entropy = 
ctx.kernels[CL_FILLAES4RX4_ENTROPY]; 246 | if (!clSetKernelArgs(kernel_fillaes1rx4_entropy, hashes_gpu, entropy_gpu, static_cast(intensity))) 247 | { 248 | return false; 249 | } 250 | 251 | cl_kernel kernel_randomx_init, kernel_randomx_run; 252 | if (portable) 253 | { 254 | kernel_randomx_init = ctx.kernels[CL_INIT_VM]; 255 | if (!clSetKernelArgs(kernel_randomx_init, entropy_gpu, vm_states_gpu)) 256 | { 257 | return false; 258 | } 259 | 260 | kernel_randomx_run = ctx.kernels[CL_EXECUTE_VM]; 261 | if (!clSetKernelArgs(kernel_randomx_run, vm_states_gpu, rounding_gpu, scratchpads_gpu, dataset_gpu, static_cast(intensity), static_cast(RANDOMX_PROGRAM_ITERATIONS >> bfactor), 1U, 1U)) 262 | { 263 | return false; 264 | } 265 | } 266 | else 267 | { 268 | kernel_randomx_init = ctx.kernels[CL_RANDOMX_INIT]; 269 | if (!clSetKernelArgs(kernel_randomx_init, entropy_gpu, vm_states_gpu, intermediate_programs_gpu, compiled_programs_gpu, static_cast(intensity))) 270 | { 271 | return false; 272 | } 273 | 274 | kernel_randomx_run = ctx.kernels[CL_RANDOMX_RUN]; 275 | 276 | constexpr uint32_t rx_parameters = 277 | (PowerOf2(RANDOMX_SCRATCHPAD_L1) << 0) | 278 | (PowerOf2(RANDOMX_SCRATCHPAD_L2) << 5) | 279 | (PowerOf2(RANDOMX_SCRATCHPAD_L3) << 10) | 280 | (PowerOf2(RANDOMX_PROGRAM_ITERATIONS) << 15); 281 | 282 | if (!clSetKernelArgs(kernel_randomx_run, dataset_gpu, scratchpads_gpu, vm_states_gpu, rounding_gpu, compiled_programs_gpu, static_cast(intensity), rx_parameters)) 283 | { 284 | return false; 285 | } 286 | } 287 | 288 | cl_kernel kernel_hashaes1rx4 = ctx.kernels[CL_HASHAES1RX4]; 289 | if (!clSetKernelArgs(kernel_hashaes1rx4, scratchpads_gpu, vm_states_gpu, 192U, static_cast(portable ? 
VM_STATE_SIZE : REGISTERS_SIZE), static_cast(intensity))) 290 | { 291 | return false; 292 | } 293 | 294 | cl_kernel kernel_blake2b_hash_registers_32 = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_32]; 295 | if (!clSetKernelArgs(kernel_blake2b_hash_registers_32, hashes_gpu, vm_states_gpu, static_cast(portable ? VM_STATE_SIZE : REGISTERS_SIZE))) 296 | { 297 | return false; 298 | } 299 | 300 | cl_kernel kernel_blake2b_hash_registers_64 = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_64]; 301 | if (!clSetKernelArgs(kernel_blake2b_hash_registers_64, hashes_gpu, vm_states_gpu, static_cast(portable ? VM_STATE_SIZE : REGISTERS_SIZE))) 302 | { 303 | return false; 304 | } 305 | 306 | const size_t global_work_size = intensity; 307 | const size_t global_work_size4 = intensity * 4; 308 | const size_t global_work_size8 = intensity * 8; 309 | const size_t global_work_size16 = intensity * 16; 310 | const size_t global_work_size32 = intensity * 32; 311 | const size_t global_work_size64 = intensity * 64; 312 | const size_t local_work_size = 64; 313 | const size_t local_work_size32 = 32; 314 | const size_t local_work_size16 = 16; 315 | const uint32_t zero = 0; 316 | 317 | for (size_t nonce = start_nonce, k = 0; nonce < 0xFFFFFFFFUL; nonce += intensity, ++k) 318 | { 319 | auto validation_thread = [&nonce_counter, myDataset, &hashes_check, intensity, nonce, &large_pages_available]() { 320 | const randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES); 321 | randomx_vm *myMachine = randomx_create_vm((randomx_flags)(flags | (large_pages_available ? 
RANDOMX_FLAG_LARGE_PAGES : 0)), nullptr, myDataset); 322 | 323 | if (!myMachine && large_pages_available) 324 | { 325 | large_pages_available = false; 326 | myMachine = randomx_create_vm(flags, nullptr, myDataset); 327 | } 328 | 329 | uint8_t buf[sizeof(blockTemplate)]; 330 | memcpy(buf, blockTemplate, sizeof(buf)); 331 | 332 | for (;;) 333 | { 334 | const uint32_t i = nonce_counter.fetch_add(1); 335 | if (i >= intensity) 336 | break; 337 | 338 | *(uint32_t*)(buf + 39) = static_cast(nonce + i); 339 | 340 | randomx_calculate_hash(myMachine, buf, sizeof(buf), (hashes_check.data() + i * 32)); 341 | } 342 | randomx_destroy_vm(myMachine); 343 | }; 344 | 345 | if (validate) 346 | { 347 | nonce_counter = 0; 348 | 349 | const uint32_t n = std::max(std::thread::hardware_concurrency() / 2, 1U); 350 | 351 | threads.clear(); 352 | for (uint32_t i = 0; i < n; ++i) 353 | threads.emplace_back(validation_thread); 354 | } 355 | 356 | auto cur_time = high_resolution_clock::now(); 357 | if (k > 0) 358 | { 359 | const double dt = duration_cast(cur_time - prev_time).count() / 1e9; 360 | 361 | if (validate) 362 | { 363 | const size_t n = nonce - start_nonce; 364 | printf("%zu (%.3f%%) hashes validated successfully, %u (%.3f%%) hashes failed, %.0f h/s%s\n", 365 | n - failed_nonces, 366 | static_cast(n - failed_nonces) / n * 100.0, 367 | failed_nonces, 368 | static_cast(failed_nonces) / n * 100.0, 369 | intensity / dt, 370 | cpu_limited ? 
", limited by CPU" : " " 371 | ); 372 | } 373 | else 374 | { 375 | printf("%.0f h/s\t\r", intensity / dt); 376 | } 377 | } 378 | prev_time = cur_time; 379 | 380 | CL_CHECKED_CALL(clSetKernelArg, kernel_blake2b_initial_hash, 2, sizeof(uint32_t), &nonce); 381 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_initial_hash, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 382 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_fillaes1rx4_scratchpad, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr); 383 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, rounding_gpu, &zero, sizeof(zero), 0, intensity * sizeof(uint32_t), 0, nullptr, nullptr); 384 | 385 | for (size_t i = 0; i < RANDOMX_PROGRAM_COUNT; ++i) 386 | { 387 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_fillaes1rx4_entropy, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr); 388 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_init, 1, nullptr, portable ? &global_work_size8 : &global_work_size32, portable ? 
&local_work_size32 : &local_work_size, 0, nullptr, nullptr); 389 | if (portable) 390 | { 391 | //if (i == 0) 392 | //{ 393 | // CL_CHECKED_CALL(clFinish, ctx.queue); 394 | // std::vector buf(intensity * VM_STATE_SIZE); 395 | // CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, vm_states_gpu, CL_TRUE, 0, buf.size(), buf.data(), 0, nullptr, nullptr); 396 | // FILE* fp; 397 | // fopen_s(&fp, "vm_states.bin", "wb"); 398 | // fwrite(buf.data(), 1, buf.size(), fp); 399 | // fclose(fp); 400 | // return false; 401 | //} 402 | uint32_t first = 1; 403 | uint32_t last = 0; 404 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 6, sizeof(uint32_t), &first); 405 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 7, sizeof(uint32_t), &last); 406 | for (int j = 0, n = 1 << bfactor; j < n; ++j) 407 | { 408 | if (j == n - 1) 409 | { 410 | last = 1; 411 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 7, sizeof(uint32_t), &last); 412 | } 413 | 414 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, (workers_per_hash == 16) ? &global_work_size16 : &global_work_size8, (workers_per_hash == 16) ? 
&local_work_size32 : &local_work_size16, 0, nullptr, nullptr); 415 | 416 | if (j == 0) 417 | { 418 | first = 0; 419 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 6, sizeof(uint32_t), &first); 420 | } 421 | } 422 | } 423 | else 424 | { 425 | //if (i == 0) 426 | //{ 427 | // CL_CHECKED_CALL(clFinish, ctx.queue); 428 | // std::vector buf(intensity * COMPILED_PROGRAM_SIZE); 429 | // CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, compiled_programs_gpu, CL_TRUE, 0, buf.size(), buf.data(), 0, nullptr, nullptr); 430 | // FILE* fp; 431 | // fopen_s(&fp, "compiled_program.bin", "wb"); 432 | // fwrite(buf.data(), 1, buf.size(), fp); 433 | // fclose(fp); 434 | // return false; 435 | //} 436 | CL_CHECKED_CALL(clFinish, ctx.queue); 437 | if (gcn_version == 15) 438 | { 439 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, &global_work_size32, &local_work_size32, 0, nullptr, nullptr); 440 | } 441 | else 442 | { 443 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, &global_work_size64, &local_work_size, 0, nullptr, nullptr); 444 | } 445 | } 446 | 447 | if (i == RANDOMX_PROGRAM_COUNT - 1) 448 | { 449 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_hashaes1rx4, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr); 450 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_hash_registers_32, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 451 | } 452 | else 453 | { 454 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_hash_registers_64, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 455 | } 456 | } 457 | 458 | CL_CHECKED_CALL(clFinish, ctx.queue); 459 | 460 | if (validate) 461 | { 462 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hashes_gpu, CL_TRUE, 0, intensity * 32, hashes.data(), 0, nullptr, nullptr); 463 | 464 | cpu_limited = nonce_counter.load() < intensity; 465 | 466 | for (auto& thread : 
threads) 467 | thread.join(); 468 | 469 | if (memcmp(hashes.data(), hashes_check.data(), intensity * 32) != 0) 470 | { 471 | for (uint32_t i = 0; i < intensity * 32; i += 32) 472 | { 473 | if (memcmp(hashes.data() + i, hashes_check.data() + i, 32)) 474 | { 475 | std::cerr << "CPU validation error, failing nonce = " << (nonce + i / 32) << std::endl; 476 | ++failed_nonces; 477 | } 478 | } 479 | } 480 | } 481 | } 482 | 483 | return true; 484 | } 485 | -------------------------------------------------------------------------------- /RandomX_OpenCL/miner.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #pragma once 21 | 22 | bool test_mining(uint32_t platform_id, uint32_t device_id, size_t intensity, uint32_t start_nonce, uint32_t workers_per_hash, uint32_t bfactor, bool portable, bool dataset_host_allocated, bool validate); 23 | -------------------------------------------------------------------------------- /RandomX_OpenCL/opencl_helpers.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include "opencl_helpers.h" 24 | 25 | OpenCLContext::~OpenCLContext() 26 | { 27 | for (auto& k : kernels) 28 | clReleaseKernel(k.second); 29 | 30 | clReleaseCommandQueue(queue); 31 | clReleaseContext(context); 32 | } 33 | 34 | bool OpenCLContext::Init(uint32_t platform_id, uint32_t device_id) 35 | { 36 | cl_int err; 37 | 38 | cl_platform_id platforms[4]; 39 | cl_uint num_platforms; 40 | CL_CHECKED_CALL(clGetPlatformIDs, 4, platforms, &num_platforms); 41 | 42 | if (platform_id >= num_platforms) 43 | { 44 | std::cerr << "Invalid platform ID (" << platform_id << "), " << num_platforms << " OpenCL platforms available" << std::endl; 45 | return false; 46 | } 47 | 48 | cl_device_id devices[32]; 49 | cl_uint num_devices; 50 | CL_CHECKED_CALL(clGetDeviceIDs, platforms[platform_id], CL_DEVICE_TYPE_GPU, 32, devices, &num_devices); 51 | 52 | if (device_id >= num_devices) 53 | { 54 | std::cerr << "Invalid device ID (" << device_id << "), " << num_devices << " OpenCL GPU devices available" << std::endl; 55 | return false; 56 | } 57 | 58 | device = devices[device_id]; 59 | 60 | size_t size; 61 | 62 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_NAME, 0, nullptr, &size); 63 | device_name.resize(size); 64 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_NAME, size, 
device_name.data(), nullptr); 65 | 66 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_global_mem_size), &device_global_mem_size, nullptr); 67 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, nullptr); 68 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(device_freq), &device_freq, nullptr); 69 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(device_compute_units), &device_compute_units, nullptr); 70 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(device_max_alloc_size), &device_max_alloc_size, nullptr); 71 | 72 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VENDOR, 0, nullptr, &size); 73 | device_vendor.resize(size); 74 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VENDOR, size, device_vendor.data(), nullptr); 75 | 76 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, 0, nullptr, &size); 77 | device_version.resize(size); 78 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, size, device_version.data(), nullptr); 79 | 80 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DRIVER_VERSION, 0, nullptr, &size); 81 | device_driver_version.resize(size); 82 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DRIVER_VERSION, size, device_driver_version.data(), nullptr); 83 | 84 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, nullptr, &size); 85 | device_extensions.resize(size); 86 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, size, device_extensions.data(), nullptr); 87 | 88 | std::cout << "Device name: " << device_name.data() << std::endl; 89 | std::cout << "Device vendor: " << device_vendor.data() << std::endl; 90 | std::cout << "Global memory: " << (device_global_mem_size >> 20) << " MB" << std::endl; 91 | std::cout << "Local memory: " << (device_local_mem_size >> 10) << " KB" << std::endl; 92 | std::cout << 
"Clock speed: " << device_freq << " MHz" << std::endl; 93 | std::cout << "Compute units: " << device_compute_units << std::endl; 94 | std::cout << "OpenCL version: " << device_version.data() << std::endl; 95 | std::cout << "Driver version: " << device_driver_version.data() << std::endl; 96 | std::cout << "Extensions: " << device_extensions.data() << std::endl << std::endl; 97 | 98 | context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); 99 | CL_CHECK_RESULT(clCreateContext); 100 | 101 | queue = clCreateCommandQueue(context, device, 0, &err); 102 | CL_CHECK_RESULT(clCreateCommandQueue); 103 | 104 | return true; 105 | } 106 | 107 | bool OpenCLContext::Compile(const char* binary_name, const std::initializer_list& source_files, const std::initializer_list& kernel_names, const std::string& options, CachingParameters caching, uint32_t force_elf_binary_flags) 108 | { 109 | std::vector source; 110 | source.reserve(source_files.size()); 111 | for (const std::string& source_file : source_files) 112 | { 113 | std::ifstream f(source_file); 114 | if (!f.is_open()) 115 | { 116 | std::cerr << "Couldn't open " << source_file << std::endl; 117 | return false; 118 | } 119 | source.emplace_back((std::istreambuf_iterator(f)), std::istreambuf_iterator()); 120 | } 121 | 122 | std::vector data; 123 | data.reserve(source_files.size()); 124 | for (const std::string& s : source) 125 | data.emplace_back(s.data()); 126 | 127 | const char** p = data.data(); 128 | cl_int err; 129 | 130 | cl_program program = nullptr; 131 | bool created_with_binary = false; 132 | if (caching != ALWAYS_COMPILE) 133 | { 134 | std::ifstream f(binary_name, std::ios::binary); 135 | if (f.is_open()) 136 | { 137 | std::vector buf; 138 | buf.insert(buf.begin(), std::istreambuf_iterator(f), std::istreambuf_iterator()); 139 | 140 | const size_t data_length = buf.size(); 141 | if (force_elf_binary_flags) 142 | *(uint32_t*)(buf.data() + 0x30) = force_elf_binary_flags; 143 | 144 | const unsigned char* 
binary_data = reinterpret_cast(buf.data()); 145 | 146 | program = clCreateProgramWithBinary(context, 1, &device, &data_length, &binary_data, nullptr, &err); 147 | CL_CHECK_RESULT(clCreateProgramWithBinary); 148 | 149 | created_with_binary = true; 150 | } 151 | else if (caching == ALWAYS_USE_BINARY) 152 | { 153 | std::cerr << "Couldn't open " << binary_name << std::endl; 154 | return false; 155 | } 156 | } 157 | 158 | if (!program) 159 | { 160 | if (caching == ALWAYS_USE_BINARY) 161 | { 162 | std::cerr << "Couldn't create program from binary " << binary_name << std::endl; 163 | return false; 164 | } 165 | program = clCreateProgramWithSource(context, static_cast(source_files.size()), p, nullptr, &err); 166 | CL_CHECK_RESULT(clCreateProgramWithSource); 167 | } 168 | 169 | std::cout << "Compiling " << binary_name << "..."; 170 | std::string s = "-Werror -I CL"; 171 | if (!options.empty()) 172 | { 173 | s += ' '; 174 | s += options; 175 | } 176 | err = clBuildProgram(program, 1, &device, s.c_str(), nullptr, nullptr); 177 | if (err != CL_SUCCESS) 178 | { 179 | std::cerr << "clBuildProgram failed: error " << err << std::endl; 180 | 181 | size_t size; 182 | CL_CHECKED_CALL(clGetProgramBuildInfo, program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &size); 183 | 184 | std::vector build_log; 185 | build_log.resize(size); 186 | CL_CHECKED_CALL(clGetProgramBuildInfo, program, device, CL_PROGRAM_BUILD_LOG, size, build_log.data(), nullptr); 187 | 188 | std::cerr << build_log.data() << std::endl; 189 | 190 | return false; 191 | } 192 | std::cout << "done" << std::endl; 193 | 194 | size_t bin_size; 195 | CL_CHECKED_CALL(clGetProgramInfo, program, CL_PROGRAM_BINARY_SIZES, sizeof(bin_size), &bin_size, nullptr); 196 | 197 | std::vector binary_data(bin_size); 198 | char* tmp[1] = { binary_data.data() }; 199 | CL_CHECKED_CALL(clGetProgramInfo, program, CL_PROGRAM_BINARIES, sizeof(tmp), tmp, NULL); 200 | 201 | elf_binary_flags = (bin_size >= 0x34) ? 
*(uint32_t*)(binary_data.data() + 0x30) : 0; 202 | 203 | if (!created_with_binary) 204 | { 205 | std::ofstream f(binary_name, std::ios::binary); 206 | f.write(tmp[0], bin_size); 207 | f.close(); 208 | } 209 | 210 | for (const std::string& name : kernel_names) 211 | { 212 | cl_kernel kernel = clCreateKernel(program, name.c_str(), &err); 213 | CL_CHECK_RESULT(clCreateKernel); 214 | 215 | kernels.emplace(name, kernel); 216 | } 217 | 218 | CL_CHECKED_CALL(clReleaseProgram, program); 219 | return true; 220 | } 221 | -------------------------------------------------------------------------------- /RandomX_OpenCL/opencl_helpers.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | constexpr uint32_t PowerOf2(size_t value) { return (value <= 1) ? 
0U : PowerOf2(value / 2) + 1U; } 32 | 33 | #define STR(X) #X 34 | #define STR2(X) STR(X) 35 | 36 | #define CL_CHECK_RESULT(func) \ 37 | if (err != CL_SUCCESS) \ 38 | { \ 39 | std::cerr << STR(func) " failed (" __FILE__ ", line " STR2(__LINE__) "): error " << err << std::endl; \ 40 | return false; \ 41 | } 42 | 43 | #define CL_CHECKED_CALL(func, ...) \ 44 | err = func(__VA_ARGS__); \ 45 | CL_CHECK_RESULT(func); 46 | 47 | enum CachingParameters 48 | { 49 | ALWAYS_COMPILE = 0, 50 | COMPILE_CACHE_BINARY = 1, 51 | ALWAYS_USE_BINARY = 2, 52 | }; 53 | 54 | struct OpenCLContext 55 | { 56 | OpenCLContext() 57 | : context(0) 58 | , queue(0) 59 | , elf_binary_flags(0) 60 | {} 61 | 62 | ~OpenCLContext(); 63 | 64 | bool Init(uint32_t platform_id, uint32_t device_id); 65 | bool Compile(const char* binary_name, const std::initializer_list& source_files, const std::initializer_list& kernel_names, const std::string& options = std::string(), CachingParameters caching = ALWAYS_COMPILE, uint32_t force_elf_binary_flags = 0); 66 | 67 | cl_device_id device; 68 | cl_context context; 69 | cl_command_queue queue; 70 | uint32_t elf_binary_flags; 71 | std::map kernels; 72 | 73 | std::vector device_name; 74 | cl_ulong device_global_mem_size; 75 | cl_ulong device_local_mem_size; 76 | cl_uint device_freq; 77 | cl_uint device_compute_units; 78 | cl_ulong device_max_alloc_size; 79 | std::vector device_vendor; 80 | std::vector device_version; 81 | std::vector device_driver_version; 82 | std::vector device_extensions; 83 | }; 84 | 85 | struct DevicePtr 86 | { 87 | DevicePtr(const OpenCLContext& ctx, size_t size, const char* debug_str) : p(static_cast(0)) { Init(ctx, size, debug_str); } 88 | ~DevicePtr() { if (p) clReleaseMemObject(p); } 89 | 90 | bool Init(const OpenCLContext& ctx, size_t size, const char* debug_str) 91 | { 92 | if (!size) 93 | return true; 94 | 95 | cl_int err; 96 | p = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, nullptr, &err); 97 | if (err != CL_SUCCESS) 98 | { 99 | 
std::cerr << "clCreateBuffer failed (" << debug_str << "): error " << err << std::endl; 100 | return false; 101 | } 102 | return true; 103 | } 104 | 105 | operator cl_mem() const { return p; } 106 | 107 | private: 108 | cl_mem p; 109 | }; 110 | 111 | static_assert(sizeof(DevicePtr) == sizeof(cl_mem), "Invalid DevicePtr struct, check your compiler options"); 112 | 113 | #define ALLOCATE_DEVICE_MEMORY(p, ctx, size) DevicePtr p(ctx, (size), #p ", " __FILE__ ", line " STR2(__LINE__)); if (!p && (size)) return false; 114 | 115 | template bool _clSetKernelArg(cl_kernel) { return true; } 116 | 117 | template 118 | bool _clSetKernelArg(cl_kernel kernel, T&& value, Args&& ... args) 119 | { 120 | cl_int err; 121 | CL_CHECKED_CALL(clSetKernelArg, kernel, index, sizeof(T), &value); 122 | return _clSetKernelArg(kernel, std::forward(args)...); 123 | } 124 | 125 | template 126 | bool clSetKernelArgs(cl_kernel kernel, Args&& ... args) 127 | { 128 | return _clSetKernelArg<0>(kernel, std::forward(args)...); 129 | } 130 | 131 | struct SThread 132 | { 133 | SThread() : t(nullptr) {} 134 | SThread(const SThread&) = delete; 135 | SThread(SThread&& other) : t(other.t) { other.t = nullptr; } 136 | 137 | SThread& operator=(const SThread&) = delete; 138 | SThread& operator=(SThread&&) = delete; 139 | 140 | template 141 | SThread(T&& func) : t(new std::thread(std::move(func))) {} 142 | 143 | ~SThread() { join(); } 144 | 145 | void join() 146 | { 147 | if (t) 148 | { 149 | t->join(); 150 | delete t; 151 | t = nullptr; 152 | } 153 | } 154 | 155 | std::thread* t; 156 | }; 157 | -------------------------------------------------------------------------------- /RandomX_OpenCL/tests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "opencl_helpers.h" 25 | #include "tests.h" 26 | #include "definitions.h" 27 | 28 | #ifdef _MSC_VER 29 | #pragma warning(push) 30 | #pragma warning(disable : 4804) 31 | #endif 32 | 33 | #include "../RandomX/src/blake2/blake2.h" 34 | #include "../RandomX/src/aes_hash.hpp" 35 | 36 | #ifdef _MSC_VER 37 | #pragma warning(pop) 38 | #endif 39 | 40 | constexpr size_t BLAKE2B_STEP = 1 << 28; 41 | 42 | using namespace std::chrono; 43 | 44 | bool tests(uint32_t platform_id, uint32_t device_id, size_t intensity) 45 | { 46 | std::cout << "Initializing GPU #" << device_id << " on OpenCL platform #" << platform_id << std::endl << std::endl; 47 | 48 | OpenCLContext ctx; 49 | if (!ctx.Init(platform_id, device_id)) 50 | { 51 | return false; 52 | } 53 | 54 | if (!ctx.Compile("base_kernels.bin", 55 | { 56 | AES_CL, 57 | BLAKE2B_CL 58 | }, 59 | { 60 | CL_FILLAES1RX4_SCRATCHPAD, 61 | CL_FILLAES4RX4_ENTROPY, 62 | CL_HASHAES1RX4, 63 | CL_BLAKE2B_INITIAL_HASH, 64 | CL_BLAKE2B_HASH_REGISTERS_32, 65 | CL_BLAKE2B_HASH_REGISTERS_64, 66 | CL_BLAKE2B_512_SINGLE_BLOCK_BENCH, 67 | CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH 68 | }, "", COMPILE_CACHE_BINARY)) 69 | { 70 | return false; 71 | } 72 | 73 | if (!ctx.Compile("randomx_vm.bin", { RANDOMX_VM_CL }, { CL_INIT_VM, CL_EXECUTE_VM }, "-D 
WORKERS_PER_HASH=8 -cl-std=CL1.2 -Werror", ALWAYS_COMPILE)) 74 | { 75 | return false; 76 | } 77 | 78 | if (!intensity) 79 | intensity = std::min(ctx.device_max_alloc_size, ctx.device_global_mem_size) / RANDOMX_SCRATCHPAD_L3; 80 | 81 | intensity -= (intensity & 63); 82 | 83 | ALLOCATE_DEVICE_MEMORY(scratchpads_gpu, ctx, intensity * (RANDOMX_SCRATCHPAD_L3 + 64)); 84 | std::cout << "Allocated " << intensity << " scratchpads" << std::endl << std::endl; 85 | 86 | ALLOCATE_DEVICE_MEMORY(entropy_gpu, ctx, intensity * ENTROPY_SIZE); 87 | ALLOCATE_DEVICE_MEMORY(vm_states_gpu, ctx, intensity * VM_STATE_SIZE); 88 | 89 | cl_kernel kernel = ctx.kernels[CL_INIT_VM]; 90 | if (!clSetKernelArgs(kernel, entropy_gpu, vm_states_gpu)) 91 | { 92 | return false; 93 | } 94 | 95 | cl_int err; 96 | size_t global_work_size = intensity * 8; 97 | size_t local_work_size = 32; 98 | 99 | std::vector entropy((intensity + 1) * ENTROPY_SIZE); 100 | std::vector vm_states(intensity * VM_STATE_SIZE); 101 | 102 | { 103 | uint64_t r = 123; 104 | uint64_t* p = (uint64_t*) entropy.data(); 105 | for (size_t i = 0; i < intensity * ENTROPY_SIZE / sizeof(uint64_t); ++i) 106 | { 107 | r = r * 6364136223846793005ULL + 1442695040888963407ULL; 108 | p[i] = r; 109 | } 110 | } 111 | 112 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, entropy_gpu, CL_FALSE, 0, intensity * ENTROPY_SIZE, entropy.data(), 0, nullptr, nullptr); 113 | 114 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 115 | CL_CHECKED_CALL(clFinish, ctx.queue); 116 | 117 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, vm_states_gpu, CL_TRUE, 0, intensity * VM_STATE_SIZE, vm_states.data(), 0, nullptr, nullptr); 118 | 119 | { 120 | std::ofstream f("vm_states.bin", std::ios::binary); 121 | f.write((const char*) vm_states.data(), vm_states.size()); 122 | } 123 | 124 | ALLOCATE_DEVICE_MEMORY(registers_gpu, ctx, intensity * REGISTERS_SIZE); 125 | 126 | uint32_t zero = 0; 
127 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, registers_gpu, &zero, sizeof(zero), 0, intensity * REGISTERS_SIZE, 0, nullptr, nullptr); 128 | 129 | ALLOCATE_DEVICE_MEMORY(hash_gpu, ctx, intensity * INITIAL_HASH_SIZE); 130 | ALLOCATE_DEVICE_MEMORY(blockTemplate_gpu, ctx, sizeof(blockTemplate)); 131 | 132 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, blockTemplate_gpu, CL_FALSE, 0, sizeof(blockTemplate), blockTemplate, 0, nullptr, nullptr); 133 | 134 | ALLOCATE_DEVICE_MEMORY(nonce_gpu, ctx, sizeof(uint64_t)); 135 | 136 | kernel = ctx.kernels[CL_BLAKE2B_INITIAL_HASH]; 137 | if (!clSetKernelArgs(kernel, hash_gpu, blockTemplate_gpu, 0)) 138 | { 139 | return false; 140 | } 141 | 142 | global_work_size = intensity; 143 | local_work_size = 64; 144 | 145 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 146 | CL_CHECKED_CALL(clFinish, ctx.queue); 147 | 148 | std::vector hashes; 149 | hashes.resize(intensity * INITIAL_HASH_SIZE); 150 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hash_gpu, CL_TRUE, 0, intensity * INITIAL_HASH_SIZE, hashes.data(), 0, nullptr, nullptr); 151 | 152 | std::vector hashes2; 153 | hashes2.resize(intensity * INITIAL_HASH_SIZE); 154 | for (uint32_t i = 0; i < intensity; ++i) 155 | { 156 | *(uint32_t*)(blockTemplate + 39) = i; 157 | blake2b(hashes2.data() + static_cast(i) * INITIAL_HASH_SIZE, INITIAL_HASH_SIZE, blockTemplate, sizeof(blockTemplate), nullptr, 0); 158 | } 159 | *(uint32_t*)(blockTemplate + 39) = 0; 160 | 161 | if (hashes != hashes2) 162 | { 163 | std::cerr << "blake2b_initial_hash test failed!" 
<< std::endl; 164 | return false; 165 | } 166 | 167 | std::cout << "blake2b_initial_hash test passed" << std::endl; 168 | 169 | kernel = ctx.kernels[CL_FILLAES1RX4_SCRATCHPAD]; 170 | if (!clSetKernelArgs(kernel, hash_gpu, scratchpads_gpu, static_cast(intensity))) 171 | { 172 | return false; 173 | } 174 | 175 | global_work_size = intensity * 4; 176 | local_work_size = 64; 177 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 178 | CL_CHECKED_CALL(clFinish, ctx.queue); 179 | 180 | struct Dummy 181 | { 182 | Dummy() {} 183 | 184 | uint64_t k; 185 | }; 186 | std::vector scratchpads_buf((RANDOMX_SCRATCHPAD_L3 + 64) * (intensity + 1) / sizeof(Dummy)); 187 | uint8_t* scratchpads = reinterpret_cast(scratchpads_buf.data()); 188 | 189 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hash_gpu, CL_TRUE, 0, intensity * INITIAL_HASH_SIZE, hashes.data(), 0, nullptr, nullptr); 190 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, scratchpads_gpu, CL_TRUE, 0, intensity * (RANDOMX_SCRATCHPAD_L3 + 64), scratchpads, 0, nullptr, nullptr); 191 | 192 | for (size_t i = 0; i < intensity; ++i) 193 | { 194 | fillAes1Rx4(hashes2.data() + i * INITIAL_HASH_SIZE, RANDOMX_SCRATCHPAD_L3, scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * intensity); 195 | 196 | if (memcmp(hashes.data() + i * INITIAL_HASH_SIZE, hashes2.data() + i * INITIAL_HASH_SIZE, INITIAL_HASH_SIZE) != 0) 197 | { 198 | std::cerr << "fillAes1Rx4_scratchpad test (hash) failed!" << std::endl; 199 | return false; 200 | } 201 | 202 | const uint8_t* p1 = scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * i; 203 | const uint8_t* p2 = scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * intensity; 204 | if (memcmp(p1, p2, RANDOMX_SCRATCHPAD_L3) != 0) 205 | { 206 | std::cerr << "fillAes1Rx4_scratchpad test (scratchpad) failed!" 
<< std::endl; 207 | return false; 208 | } 209 | } 210 | 211 | std::cout << "fillAes1Rx4_scratchpad test passed" << std::endl; 212 | 213 | kernel = ctx.kernels[CL_FILLAES4RX4_ENTROPY]; 214 | if (!clSetKernelArgs(kernel, hash_gpu, entropy_gpu, static_cast(intensity))) 215 | { 216 | return false; 217 | } 218 | 219 | global_work_size = intensity * 4; 220 | local_work_size = 64; 221 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 222 | CL_CHECKED_CALL(clFinish, ctx.queue); 223 | 224 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hash_gpu, CL_TRUE, 0, intensity * INITIAL_HASH_SIZE, hashes.data(), 0, nullptr, nullptr); 225 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, entropy_gpu, CL_TRUE, 0, intensity * ENTROPY_SIZE, entropy.data(), 0, nullptr, nullptr); 226 | 227 | for (size_t i = 0; i < intensity; ++i) 228 | { 229 | fillAes4Rx4(hashes2.data() + i * INITIAL_HASH_SIZE, ENTROPY_SIZE, entropy.data() + ENTROPY_SIZE * intensity); 230 | 231 | if (memcmp(entropy.data() + i * ENTROPY_SIZE, entropy.data() + ENTROPY_SIZE * intensity, ENTROPY_SIZE) != 0) 232 | { 233 | std::cerr << "fillAes4Rx4_entropy test (entropy) failed!" 
<< std::endl; 234 | return false; 235 | } 236 | } 237 | 238 | std::cout << "fillAes4Rx4_entropy test passed" << std::endl; 239 | 240 | kernel = ctx.kernels[CL_HASHAES1RX4]; 241 | if (!clSetKernelArgs(kernel, scratchpads_gpu, registers_gpu, 192, REGISTERS_SIZE, static_cast(intensity))) 242 | { 243 | return false; 244 | } 245 | 246 | global_work_size = intensity * 4; 247 | local_work_size = 64; 248 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 249 | CL_CHECKED_CALL(clFinish, ctx.queue); 250 | 251 | std::vector registers(REGISTERS_SIZE * (intensity + 1)); 252 | 253 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, registers_gpu, CL_TRUE, 0, intensity * REGISTERS_SIZE, registers.data(), 0, nullptr, nullptr); 254 | 255 | for (size_t i = 0; i < intensity; ++i) 256 | { 257 | memset(registers.data() + REGISTERS_SIZE * intensity, 0, REGISTERS_SIZE); 258 | uint8_t* src = scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * i; 259 | uint8_t* dst = scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * intensity; 260 | memcpy(dst, src, RANDOMX_SCRATCHPAD_L3); 261 | 262 | hashAes1Rx4(scratchpads + (RANDOMX_SCRATCHPAD_L3 + 64) * intensity, RANDOMX_SCRATCHPAD_L3, registers.data() + intensity * REGISTERS_SIZE + 192); 263 | 264 | if (memcmp(registers.data() + i * REGISTERS_SIZE, registers.data() + intensity * REGISTERS_SIZE, REGISTERS_SIZE) != 0) 265 | { 266 | std::cerr << "hashAes1Rx4 test failed!" 
<< std::endl; 267 | return false; 268 | } 269 | } 270 | 271 | std::cout << "hashAes1Rx4 test passed" << std::endl; 272 | 273 | kernel = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_32]; 274 | if (!clSetKernelArgs(kernel, hash_gpu, registers_gpu, REGISTERS_SIZE)) 275 | { 276 | return false; 277 | } 278 | 279 | global_work_size = intensity; 280 | local_work_size = 64; 281 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 282 | CL_CHECKED_CALL(clFinish, ctx.queue); 283 | 284 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hash_gpu, CL_TRUE, 0, intensity * 32, hashes.data(), 0, nullptr, nullptr); 285 | 286 | for (size_t i = 0; i < intensity; ++i) 287 | { 288 | blake2b(hashes2.data() + i * 32, 32, registers.data() + i * REGISTERS_SIZE, REGISTERS_SIZE, nullptr, 0); 289 | } 290 | 291 | if (memcmp(hashes.data(), hashes2.data(), intensity * 32) != 0) 292 | { 293 | std::cerr << "blake2b_hash_registers (32 byte hash) test failed!" 
<< std::endl; 294 | return false; 295 | } 296 | 297 | std::cout << "blake2b_hash_registers (32 byte hash) test passed" << std::endl; 298 | 299 | kernel = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_64]; 300 | if (!clSetKernelArgs(kernel, hash_gpu, registers_gpu, REGISTERS_SIZE)) 301 | { 302 | return false; 303 | } 304 | 305 | global_work_size = intensity; 306 | local_work_size = 64; 307 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 308 | CL_CHECKED_CALL(clFinish, ctx.queue); 309 | 310 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hash_gpu, CL_TRUE, 0, intensity * 64, hashes.data(), 0, nullptr, nullptr); 311 | 312 | for (size_t i = 0; i < intensity; ++i) 313 | { 314 | blake2b(hashes2.data() + i * 64, 64, registers.data() + i * REGISTERS_SIZE, REGISTERS_SIZE, nullptr, 0); 315 | } 316 | 317 | if (memcmp(hashes.data(), hashes2.data(), intensity * 64) != 0) 318 | { 319 | std::cerr << "blake2b_hash_registers (64 byte hash) test failed!" 
<< std::endl; 320 | return false; 321 | } 322 | 323 | std::cout << "blake2b_hash_registers (64 byte hash) test passed" << std::endl; 324 | 325 | auto start_time = high_resolution_clock::now(); 326 | 327 | kernel = ctx.kernels[CL_FILLAES1RX4_SCRATCHPAD]; 328 | for (int i = 0; i < 100; ++i) 329 | { 330 | std::cout << "Benchmarking fillAes1Rx4 " << (i + 1) << "/100"; 331 | if (i > 0) 332 | { 333 | const double dt = duration_cast(high_resolution_clock::now() - start_time).count() / 1e9; 334 | std::cout << ", " << ((i * intensity * 10) / dt) << " scratchpads/s "; 335 | } 336 | std::cout << "\r"; 337 | 338 | global_work_size = intensity * 4; 339 | local_work_size = 64; 340 | for (int j = 0; j < 10; ++j) 341 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 342 | 343 | CL_CHECKED_CALL(clFinish, ctx.queue); 344 | } 345 | std::cout << std::endl; 346 | 347 | start_time = high_resolution_clock::now(); 348 | 349 | kernel = ctx.kernels[CL_HASHAES1RX4]; 350 | for (int i = 0; i < 100; ++i) 351 | { 352 | std::cout << "Benchmarking hashAes1Rx4 " << (i + 1) << "/100"; 353 | if (i > 0) 354 | { 355 | const double dt = duration_cast(high_resolution_clock::now() - start_time).count() / 1e9; 356 | std::cout << ", " << ((i * intensity * 10) / dt) << " scratchpads/s "; 357 | } 358 | std::cout << "\r"; 359 | 360 | global_work_size = intensity * 4; 361 | local_work_size = 64; 362 | for (int j = 0; j < 10; ++j) 363 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 364 | 365 | CL_CHECKED_CALL(clFinish, ctx.queue); 366 | } 367 | std::cout << std::endl; 368 | 369 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, blockTemplate_gpu, CL_FALSE, 0, sizeof(blockTemplate), blockTemplate, 0, nullptr, nullptr); 370 | 371 | kernel = ctx.kernels[CL_BLAKE2B_512_SINGLE_BLOCK_BENCH]; 372 | if (!clSetKernelArgs(kernel, nonce_gpu, blockTemplate_gpu, 
0ULL)) 373 | { 374 | return false; 375 | } 376 | 377 | start_time = high_resolution_clock::now(); 378 | 379 | for (uint64_t start_nonce = 0; start_nonce < BLAKE2B_STEP * 100; start_nonce += BLAKE2B_STEP) 380 | { 381 | std::cout << "Benchmarking blake2b_512_single_block " << ((start_nonce + BLAKE2B_STEP) / BLAKE2B_STEP) << "/100"; 382 | if (start_nonce > 0) 383 | { 384 | const double dt = duration_cast(high_resolution_clock::now() - start_time).count() / 1e9; 385 | std::cout << ", " << start_nonce / dt / 1e6 << " MH/s "; 386 | } 387 | std::cout << "\r"; 388 | 389 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, nonce_gpu, &zero, sizeof(zero), 0, sizeof(uint64_t), 0, nullptr, nullptr); 390 | 391 | CL_CHECKED_CALL(clSetKernelArg, kernel, 2, sizeof(start_nonce), &start_nonce); 392 | 393 | global_work_size = BLAKE2B_STEP; 394 | local_work_size = 64; 395 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 396 | CL_CHECKED_CALL(clFinish, ctx.queue); 397 | 398 | uint64_t nonce; 399 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, nonce_gpu, CL_TRUE, 0, sizeof(uint64_t), &nonce, 0, nullptr, nullptr); 400 | 401 | if (nonce) 402 | { 403 | *(uint64_t*)(blockTemplate) = nonce; 404 | uint64_t hash[INITIAL_HASH_SIZE / sizeof(uint64_t)]; 405 | blake2b(hash, INITIAL_HASH_SIZE, blockTemplate, sizeof(blockTemplate), nullptr, 0); 406 | std::cout << "nonce = " << nonce << ", hash[7] = " << std::hex << std::setw(16) << std::setfill('0') << hash[7] << " " << std::endl; 407 | std::cout << std::dec; 408 | } 409 | } 410 | std::cout << std::endl; 411 | 412 | kernel = ctx.kernels[CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH]; 413 | if (!clSetKernelArgs(kernel, nonce_gpu, registers_gpu, 0ULL)) 414 | { 415 | return false; 416 | } 417 | 418 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, registers_gpu, &zero, sizeof(zero), 0, REGISTERS_SIZE, 0, nullptr, nullptr); 419 | 420 | start_time = high_resolution_clock::now(); 421 | 
422 | for (uint64_t start_nonce = 0; start_nonce < BLAKE2B_STEP * 100; start_nonce += BLAKE2B_STEP) 423 | { 424 | std::cout << "Benchmarking blake2b_512_double_block " << ((start_nonce + BLAKE2B_STEP) / BLAKE2B_STEP) << "/100"; 425 | if (start_nonce > 0) 426 | { 427 | const double dt = duration_cast(high_resolution_clock::now() - start_time).count() / 1e9; 428 | std::cout << ", " << start_nonce / dt / 1e6 << " MH/s "; 429 | } 430 | std::cout << "\r"; 431 | 432 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, nonce_gpu, &zero, sizeof(zero), 0, sizeof(uint64_t), 0, nullptr, nullptr); 433 | 434 | CL_CHECKED_CALL(clSetKernelArg, kernel, 2, sizeof(start_nonce), &start_nonce); 435 | 436 | global_work_size = BLAKE2B_STEP; 437 | local_work_size = 64; 438 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr); 439 | CL_CHECKED_CALL(clFinish, ctx.queue); 440 | 441 | uint64_t nonce; 442 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, nonce_gpu, CL_TRUE, 0, sizeof(uint64_t), &nonce, 0, nullptr, nullptr); 443 | 444 | if (nonce) 445 | { 446 | memset(registers.data(), 0, REGISTERS_SIZE); 447 | *(uint64_t*)(registers.data()) = nonce; 448 | uint64_t hash[8]; 449 | blake2b(hash, 64, registers.data(), REGISTERS_SIZE, nullptr, 0); 450 | std::cout << "nonce = " << nonce << ", hash[7] = " << std::hex << std::setw(16) << std::setfill('0') << hash[7] << " " << std::endl; 451 | std::cout << std::dec; 452 | } 453 | } 454 | std::cout << std::endl; 455 | 456 | return true; 457 | } 458 | -------------------------------------------------------------------------------- /RandomX_OpenCL/tests.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 SChernykh 3 | 4 | This file is part of RandomX OpenCL. 
5 | 6 | RandomX OpenCL is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | RandomX OpenCL is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>. 18 | */ 19 | 20 | #pragma once 21 | 22 | bool tests(uint32_t platform_id, uint32_t device_id, size_t intensity); 23 | --------------------------------------------------------------------------------