├── .gitattributes
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── RandomX_OpenCL.sln
└── RandomX_OpenCL
├── CL
├── aes.cl
├── blake2b.cl
├── blake2b_double_block.cl
├── fillAes1Rx4.cl
├── randomx_constants.h
├── randomx_constants_jit.h
├── randomx_init.cl
├── randomx_run.cl
└── randomx_vm.cl
├── GCNASM
├── randomx_run_gfx1010.asm
├── randomx_run_gfx803.asm
└── randomx_run_gfx900.asm
├── RandomX_OpenCL.cpp
├── RandomX_OpenCL.vcxproj
├── RandomX_OpenCL.vcxproj.filters
├── RandomX_OpenCL.vcxproj.user
├── definitions.h
├── makefile
├── miner.cpp
├── miner.h
├── opencl_helpers.cpp
├── opencl_helpers.h
├── tests.cpp
└── tests.h
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vs
2 | x64
3 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "RandomX"]
2 | path = RandomX
3 | url = https://github.com/SChernykh/RandomX
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RandomX OpenCL implementation
2 |
3 | This repository contains a full RandomX OpenCL implementation (portable code for all GPUs and optimized code for AMD Vega GPUs). The latest version of RandomX (1.1.0 as of August 30th, 2019) is supported.
4 |
5 | Note: it's only a benchmark/testing tool, not an actual miner. RandomX hashrate is expected to improve somewhat in the future thanks to further optimizations.
6 |
7 | GPUs tested so far:
8 |
9 | Model|CryptonightR H/S|RandomX H/S|Relative speed|Comment
10 | -----|---------------|-----------|---------------|-------
11 | AMD Radeon VII (stock)|3125|1500|48%|JIT compiled mode, 150W
12 | AMD Vega 64 (1700/1100 MHz)|2200|1225|55.7%|JIT compiled mode, 285W
13 | AMD Vega 64 (1100/800 MHz)|1023|845|82.6%|JIT compiled mode, 115W
14 | AMD Vega 64 (1700/1100 MHz)|2200|163|7.4%|VM interpreted mode
15 | AMD Vega FE (stock)|2150|980|45.6%|JIT compiled mode (intensity 4096)
16 | AMD Radeon RX 560 4GB (1400/2200 MHz)|495|260|52.5%|JIT compiled mode (intensity 896)
17 | AMD Radeon RX 470/570 4GB|930-950|400-410|43%|JIT compiled mode, 50W
18 | AMD Radeon RX 480/580 4GB|960-1000|470|47%|JIT compiled mode, 60W
19 | GeForce GTX 1080 Ti (2037/11800 MHz)|927|601|64.8%|VM interpreted mode
20 |
21 | ## Building on Windows
22 |
23 | - Install Visual Studio 2017 Community and [CLRadeonExtender](https://github.com/CLRX/CLRX-mirror/releases)
24 | - Add CLRadeonExtender's bin directory to PATH environment variable
25 | - Open .sln file in Visual Studio and build it
26 |
27 | ## Building on Ubuntu
28 |
29 | - Install prerequisites: `sudo apt install git cmake build-essential`
30 | - If you want to try JIT compiled code for Vega or Polaris GPUs, install amdgpu-pro drivers with OpenCL enabled (run the install script like this `./amdgpu-pro-install --opencl=pal`)
31 | - Download [CLRadeonExtender](https://github.com/CLRX/CLRX-mirror/releases) and copy `clrxasm` to `/usr/local/bin`
32 | - Then run commands:
33 | ```
34 | git clone --recursive https://github.com/SChernykh/RandomX_OpenCL
35 | cd RandomX_OpenCL/RandomX
36 | mkdir build && cd build
37 | cmake -DARCH=native ..
38 | make
39 | cd ../../RandomX_OpenCL
40 | make
41 | ```
42 |
43 | ## Donations
44 |
45 | If you'd like to support further development/optimization of RandomX miners (both CPU and AMD/NVIDIA), you're welcome to send any amount of XMR to the following address:
46 |
47 | ```
48 | 44MnN1f3Eto8DZYUWuE5XZNUtE3vcRzt2j6PzqWpPau34e6Cf4fAxt6X2MBmrm6F9YMEiMNjN6W4Shn4pLcfNAja621jwyg
49 | ```
50 |
--------------------------------------------------------------------------------
/RandomX_OpenCL.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.645
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RandomX_OpenCL", "RandomX_OpenCL\RandomX_OpenCL.vcxproj", "{32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}"
7 | ProjectSection(ProjectDependencies) = postProject
8 | {3346A4AD-C438-4324-8B77-47A16452954B} = {3346A4AD-C438-4324-8B77-47A16452954B}
9 | EndProjectSection
10 | EndProject
11 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "randomx", "RandomX\vcxproj\randomx.vcxproj", "{3346A4AD-C438-4324-8B77-47A16452954B}"
12 | EndProject
13 | Global
14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
15 | Debug|x64 = Debug|x64
16 | Debug|x86 = Debug|x86
17 | Release|x64 = Release|x64
18 | Release|x86 = Release|x86
19 | EndGlobalSection
20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
21 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x64.ActiveCfg = Debug|x64
22 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x64.Build.0 = Debug|x64
23 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Debug|x86.ActiveCfg = Debug|x64
24 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x64.ActiveCfg = Release|x64
25 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x64.Build.0 = Release|x64
26 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}.Release|x86.ActiveCfg = Release|x64
27 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.ActiveCfg = Release|x64
28 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.Build.0 = Release|x64
29 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.ActiveCfg = Debug|Win32
30 | {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.Build.0 = Debug|Win32
31 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.ActiveCfg = Release|x64
32 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.Build.0 = Release|x64
33 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.ActiveCfg = Release|Win32
34 | {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.Build.0 = Release|Win32
35 | EndGlobalSection
36 | GlobalSection(SolutionProperties) = preSolution
37 | HideSolutionNode = FALSE
38 | EndGlobalSection
39 | GlobalSection(ExtensibilityGlobals) = postSolution
40 | SolutionGuid = {D7CE6C55-7FD7-4C3E-A52E-E3128C74A127}
41 | EndGlobalSection
42 | EndGlobal
43 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/aes.cl:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | __constant static const uint AES_TABLE[2048] =
21 | {
22 | 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U,
23 | 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
24 | 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U,
25 | 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
26 | 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU,
27 | 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
28 | 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U,
29 | 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU,
30 | 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU,
31 | 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U,
32 | 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U,
33 | 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU,
34 | 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU,
35 | 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU,
36 | 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU,
37 | 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU,
38 | 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U,
39 | 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU,
40 | 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU,
41 | 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U,
42 | 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U,
43 | 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U,
44 | 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U,
45 | 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U,
46 | 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU,
47 | 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U,
48 | 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU,
49 | 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU,
50 | 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U,
51 | 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U,
52 | 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U,
53 | 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU,
54 | 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U,
55 | 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU,
56 | 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU,
57 | 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U,
58 | 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U,
59 | 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU,
60 | 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U,
61 | 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU,
62 | 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U,
63 | 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U,
64 | 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U,
65 | 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U,
66 | 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU,
67 | 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U,
68 | 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU,
69 | 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U,
70 | 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU,
71 | 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U,
72 | 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU,
73 | 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU,
74 | 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU,
75 | 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU,
76 | 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U,
77 | 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U,
78 | 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U,
79 | 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U,
80 | 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U,
81 | 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U,
82 | 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU,
83 | 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U,
84 | 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU,
85 | 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU,
86 | 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU,
87 | 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U,
88 | 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU,
89 | 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU,
90 | 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U,
91 | 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU,
92 | 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU,
93 | 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU,
94 | 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU,
95 | 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU,
96 | 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U,
97 | 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU,
98 | 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU,
99 | 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U,
100 | 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU,
101 | 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU,
102 | 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU,
103 | 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU,
104 | 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU,
105 | 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U,
106 | 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU,
107 | 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU,
108 | 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU,
109 | 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU,
110 | 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U,
111 | 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U,
112 | 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U,
113 | 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U,
114 | 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU,
115 | 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U,
116 | 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U,
117 | 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU,
118 | 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU,
119 | 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U,
120 | 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U,
121 | 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U,
122 | 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU,
123 | 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U,
124 | 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU,
125 | 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U,
126 | 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU,
127 | 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U,
128 | 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U,
129 | 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU,
130 | 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U,
131 | 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U,
132 | 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U,
133 | 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U,
134 | 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U,
135 | 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U,
136 | 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U,
137 | 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U,
138 | 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU,
139 | 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U,
140 | 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U,
141 | 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U,
142 | 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U,
143 | 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U,
144 | 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U,
145 | 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU,
146 | 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U,
147 | 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U,
148 | 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U,
149 | 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU,
150 | 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU,
151 | 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U,
152 | 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU,
153 | 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U,
154 | 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU,
155 | 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U,
156 | 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU,
157 | 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U,
158 | 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U,
159 | 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU,
160 | 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U,
161 | 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U,
162 | 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U,
163 | 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU,
164 | 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U,
165 | 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U,
166 | 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU,
167 | 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U,
168 | 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U,
169 | 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U,
170 | 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU,
171 | 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU,
172 | 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U,
173 | 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU,
174 | 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU,
175 | 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U,
176 | 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU,
177 | 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U,
178 | 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU,
179 | 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U,
180 | 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U,
181 | 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U,
182 | 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU,
183 | 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U,
184 | 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU,
185 | 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U,
186 | 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU,
187 | 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U,
188 | 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U,
189 | 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU,
190 | 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU,
191 | 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU,
192 | 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U,
193 | 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U,
194 | 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU,
195 | 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U,
196 | 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU,
197 | 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U,
198 | 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU,
199 | 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U,
200 | 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU,
201 | 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU,
202 | 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U,
203 | 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU,
204 | 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U,
205 | 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU,
206 | 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U,
207 | 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U,
208 | 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U,
209 | 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU,
210 | 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU,
211 | 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U,
212 | 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU,
213 | 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U,
214 | 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU,
215 | 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U,
216 | 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU,
217 | 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U,
218 | 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU,
219 | 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U,
220 | 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU,
221 | 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U,
222 | 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U,
223 | 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU,
224 | 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U,
225 | 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U,
226 | 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U,
227 | 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU,
228 | 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U,
229 | 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U,
230 | 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU,
231 | 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U,
232 | 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U,
233 | 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U,
234 | 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU,
235 | 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU,
236 | 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U,
237 | 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU,
238 | 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU,
239 | 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U,
240 | 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU,
241 | 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U,
242 | 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU,
243 | 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U,
244 | 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U,
245 | 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U,
246 | 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU,
247 | 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U,
248 | 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU,
249 | 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U,
250 | 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU,
251 | 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U,
252 | 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U,
253 | 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU,
254 | 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU,
255 | 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU,
256 | 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U,
257 | 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U,
258 | 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU,
259 | 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U,
260 | 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU,
261 | 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U,
262 | 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU,
263 | 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U,
264 | 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU,
265 | 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU,
266 | 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U,
267 | 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU,
268 | 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U,
269 | 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU,
270 | 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U,
271 | 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U,
272 | 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U,
273 | 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU,
274 | 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
275 | 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
276 | 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
277 | 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U,
278 | 0x50a7f451U, 0x5365417eU, 0xc3a4171aU, 0x965e273aU,
279 | 0xcb6bab3bU, 0xf1459d1fU, 0xab58faacU, 0x9303e34bU,
280 | 0x55fa3020U, 0xf66d76adU, 0x9176cc88U, 0x254c02f5U,
281 | 0xfcd7e54fU, 0xd7cb2ac5U, 0x80443526U, 0x8fa362b5U,
282 | 0x495ab1deU, 0x671bba25U, 0x980eea45U, 0xe1c0fe5dU,
283 | 0x02752fc3U, 0x12f04c81U, 0xa397468dU, 0xc6f9d36bU,
284 | 0xe75f8f03U, 0x959c9215U, 0xeb7a6dbfU, 0xda595295U,
285 | 0x2d83bed4U, 0xd3217458U, 0x2969e049U, 0x44c8c98eU,
286 | 0x6a89c275U, 0x78798ef4U, 0x6b3e5899U, 0xdd71b927U,
287 | 0xb64fe1beU, 0x17ad88f0U, 0x66ac20c9U, 0xb43ace7dU,
288 | 0x184adf63U, 0x82311ae5U, 0x60335197U, 0x457f5362U,
289 | 0xe07764b1U, 0x84ae6bbbU, 0x1ca081feU, 0x942b08f9U,
290 | 0x58684870U, 0x19fd458fU, 0x876cde94U, 0xb7f87b52U,
291 | 0x23d373abU, 0xe2024b72U, 0x578f1fe3U, 0x2aab5566U,
292 | 0x0728ebb2U, 0x03c2b52fU, 0x9a7bc586U, 0xa50837d3U,
293 | 0xf2872830U, 0xb2a5bf23U, 0xba6a0302U, 0x5c8216edU,
294 | 0x2b1ccf8aU, 0x92b479a7U, 0xf0f207f3U, 0xa1e2694eU,
295 | 0xcdf4da65U, 0xd5be0506U, 0x1f6234d1U, 0x8afea6c4U,
296 | 0x9d532e34U, 0xa055f3a2U, 0x32e18a05U, 0x75ebf6a4U,
297 | 0x39ec830bU, 0xaaef6040U, 0x069f715eU, 0x51106ebdU,
298 | 0xf98a213eU, 0x3d06dd96U, 0xae053eddU, 0x46bde64dU,
299 | 0xb58d5491U, 0x055dc471U, 0x6fd40604U, 0xff155060U,
300 | 0x24fb9819U, 0x97e9bdd6U, 0xcc434089U, 0x779ed967U,
301 | 0xbd42e8b0U, 0x888b8907U, 0x385b19e7U, 0xdbeec879U,
302 | 0x470a7ca1U, 0xe90f427cU, 0xc91e84f8U, 0x00000000U,
303 | 0x83868009U, 0x48ed2b32U, 0xac70111eU, 0x4e725a6cU,
304 | 0xfbff0efdU, 0x5638850fU, 0x1ed5ae3dU, 0x27392d36U,
305 | 0x64d90f0aU, 0x21a65c68U, 0xd1545b9bU, 0x3a2e3624U,
306 | 0xb1670a0cU, 0x0fe75793U, 0xd296eeb4U, 0x9e919b1bU,
307 | 0x4fc5c080U, 0xa220dc61U, 0x694b775aU, 0x161a121cU,
308 | 0x0aba93e2U, 0xe52aa0c0U, 0x43e0223cU, 0x1d171b12U,
309 | 0x0b0d090eU, 0xadc78bf2U, 0xb9a8b62dU, 0xc8a91e14U,
310 | 0x8519f157U, 0x4c0775afU, 0xbbdd99eeU, 0xfd607fa3U,
311 | 0x9f2601f7U, 0xbcf5725cU, 0xc53b6644U, 0x347efb5bU,
312 | 0x7629438bU, 0xdcc623cbU, 0x68fcedb6U, 0x63f1e4b8U,
313 | 0xcadc31d7U, 0x10856342U, 0x40229713U, 0x2011c684U,
314 | 0x7d244a85U, 0xf83dbbd2U, 0x1132f9aeU, 0x6da129c7U,
315 | 0x4b2f9e1dU, 0xf330b2dcU, 0xec52860dU, 0xd0e3c177U,
316 | 0x6c16b32bU, 0x99b970a9U, 0xfa489411U, 0x2264e947U,
317 | 0xc48cfca8U, 0x1a3ff0a0U, 0xd82c7d56U, 0xef903322U,
318 | 0xc74e4987U, 0xc1d138d9U, 0xfea2ca8cU, 0x360bd498U,
319 | 0xcf81f5a6U, 0x28de7aa5U, 0x268eb7daU, 0xa4bfad3fU,
320 | 0xe49d3a2cU, 0x0d927850U, 0x9bcc5f6aU, 0x62467e54U,
321 | 0xc2138df6U, 0xe8b8d890U, 0x5ef7392eU, 0xf5afc382U,
322 | 0xbe805d9fU, 0x7c93d069U, 0xa92dd56fU, 0xb31225cfU,
323 | 0x3b99acc8U, 0xa77d1810U, 0x6e639ce8U, 0x7bbb3bdbU,
324 | 0x097826cdU, 0xf418596eU, 0x01b79aecU, 0xa89a4f83U,
325 | 0x656e95e6U, 0x7ee6ffaaU, 0x08cfbc21U, 0xe6e815efU,
326 | 0xd99be7baU, 0xce366f4aU, 0xd4099feaU, 0xd67cb029U,
327 | 0xafb2a431U, 0x31233f2aU, 0x3094a5c6U, 0xc066a235U,
328 | 0x37bc4e74U, 0xa6ca82fcU, 0xb0d090e0U, 0x15d8a733U,
329 | 0x4a9804f1U, 0xf7daec41U, 0x0e50cd7fU, 0x2ff69117U,
330 | 0x8dd64d76U, 0x4db0ef43U, 0x544daaccU, 0xdf0496e4U,
331 | 0xe3b5d19eU, 0x1b886a4cU, 0xb81f2cc1U, 0x7f516546U,
332 | 0x04ea5e9dU, 0x5d358c01U, 0x737487faU, 0x2e410bfbU,
333 | 0x5a1d67b3U, 0x52d2db92U, 0x335610e9U, 0x1347d66dU,
334 | 0x8c61d79aU, 0x7a0ca137U, 0x8e14f859U, 0x893c13ebU,
335 | 0xee27a9ceU, 0x35c961b7U, 0xede51ce1U, 0x3cb1477aU,
336 | 0x59dfd29cU, 0x3f73f255U, 0x79ce1418U, 0xbf37c773U,
337 | 0xeacdf753U, 0x5baafd5fU, 0x146f3ddfU, 0x86db4478U,
338 | 0x81f3afcaU, 0x3ec468b9U, 0x2c342438U, 0x5f40a3c2U,
339 | 0x72c31d16U, 0x0c25e2bcU, 0x8b493c28U, 0x41950dffU,
340 | 0x7101a839U, 0xdeb30c08U, 0x9ce4b4d8U, 0x90c15664U,
341 | 0x6184cb7bU, 0x70b632d5U, 0x745c6c48U, 0x4257b8d0U,
342 | 0xa7f45150U, 0x65417e53U, 0xa4171ac3U, 0x5e273a96U,
343 | 0x6bab3bcbU, 0x459d1ff1U, 0x58faacabU, 0x03e34b93U,
344 | 0xfa302055U, 0x6d76adf6U, 0x76cc8891U, 0x4c02f525U,
345 | 0xd7e54ffcU, 0xcb2ac5d7U, 0x44352680U, 0xa362b58fU,
346 | 0x5ab1de49U, 0x1bba2567U, 0x0eea4598U, 0xc0fe5de1U,
347 | 0x752fc302U, 0xf04c8112U, 0x97468da3U, 0xf9d36bc6U,
348 | 0x5f8f03e7U, 0x9c921595U, 0x7a6dbfebU, 0x595295daU,
349 | 0x83bed42dU, 0x217458d3U, 0x69e04929U, 0xc8c98e44U,
350 | 0x89c2756aU, 0x798ef478U, 0x3e58996bU, 0x71b927ddU,
351 | 0x4fe1beb6U, 0xad88f017U, 0xac20c966U, 0x3ace7db4U,
352 | 0x4adf6318U, 0x311ae582U, 0x33519760U, 0x7f536245U,
353 | 0x7764b1e0U, 0xae6bbb84U, 0xa081fe1cU, 0x2b08f994U,
354 | 0x68487058U, 0xfd458f19U, 0x6cde9487U, 0xf87b52b7U,
355 | 0xd373ab23U, 0x024b72e2U, 0x8f1fe357U, 0xab55662aU,
356 | 0x28ebb207U, 0xc2b52f03U, 0x7bc5869aU, 0x0837d3a5U,
357 | 0x872830f2U, 0xa5bf23b2U, 0x6a0302baU, 0x8216ed5cU,
358 | 0x1ccf8a2bU, 0xb479a792U, 0xf207f3f0U, 0xe2694ea1U,
359 | 0xf4da65cdU, 0xbe0506d5U, 0x6234d11fU, 0xfea6c48aU,
360 | 0x532e349dU, 0x55f3a2a0U, 0xe18a0532U, 0xebf6a475U,
361 | 0xec830b39U, 0xef6040aaU, 0x9f715e06U, 0x106ebd51U,
362 | 0x8a213ef9U, 0x06dd963dU, 0x053eddaeU, 0xbde64d46U,
363 | 0x8d5491b5U, 0x5dc47105U, 0xd406046fU, 0x155060ffU,
364 | 0xfb981924U, 0xe9bdd697U, 0x434089ccU, 0x9ed96777U,
365 | 0x42e8b0bdU, 0x8b890788U, 0x5b19e738U, 0xeec879dbU,
366 | 0x0a7ca147U, 0x0f427ce9U, 0x1e84f8c9U, 0x00000000U,
367 | 0x86800983U, 0xed2b3248U, 0x70111eacU, 0x725a6c4eU,
368 | 0xff0efdfbU, 0x38850f56U, 0xd5ae3d1eU, 0x392d3627U,
369 | 0xd90f0a64U, 0xa65c6821U, 0x545b9bd1U, 0x2e36243aU,
370 | 0x670a0cb1U, 0xe757930fU, 0x96eeb4d2U, 0x919b1b9eU,
371 | 0xc5c0804fU, 0x20dc61a2U, 0x4b775a69U, 0x1a121c16U,
372 | 0xba93e20aU, 0x2aa0c0e5U, 0xe0223c43U, 0x171b121dU,
373 | 0x0d090e0bU, 0xc78bf2adU, 0xa8b62db9U, 0xa91e14c8U,
374 | 0x19f15785U, 0x0775af4cU, 0xdd99eebbU, 0x607fa3fdU,
375 | 0x2601f79fU, 0xf5725cbcU, 0x3b6644c5U, 0x7efb5b34U,
376 | 0x29438b76U, 0xc623cbdcU, 0xfcedb668U, 0xf1e4b863U,
377 | 0xdc31d7caU, 0x85634210U, 0x22971340U, 0x11c68420U,
378 | 0x244a857dU, 0x3dbbd2f8U, 0x32f9ae11U, 0xa129c76dU,
379 | 0x2f9e1d4bU, 0x30b2dcf3U, 0x52860decU, 0xe3c177d0U,
380 | 0x16b32b6cU, 0xb970a999U, 0x489411faU, 0x64e94722U,
381 | 0x8cfca8c4U, 0x3ff0a01aU, 0x2c7d56d8U, 0x903322efU,
382 | 0x4e4987c7U, 0xd138d9c1U, 0xa2ca8cfeU, 0x0bd49836U,
383 | 0x81f5a6cfU, 0xde7aa528U, 0x8eb7da26U, 0xbfad3fa4U,
384 | 0x9d3a2ce4U, 0x9278500dU, 0xcc5f6a9bU, 0x467e5462U,
385 | 0x138df6c2U, 0xb8d890e8U, 0xf7392e5eU, 0xafc382f5U,
386 | 0x805d9fbeU, 0x93d0697cU, 0x2dd56fa9U, 0x1225cfb3U,
387 | 0x99acc83bU, 0x7d1810a7U, 0x639ce86eU, 0xbb3bdb7bU,
388 | 0x7826cd09U, 0x18596ef4U, 0xb79aec01U, 0x9a4f83a8U,
389 | 0x6e95e665U, 0xe6ffaa7eU, 0xcfbc2108U, 0xe815efe6U,
390 | 0x9be7bad9U, 0x366f4aceU, 0x099fead4U, 0x7cb029d6U,
391 | 0xb2a431afU, 0x233f2a31U, 0x94a5c630U, 0x66a235c0U,
392 | 0xbc4e7437U, 0xca82fca6U, 0xd090e0b0U, 0xd8a73315U,
393 | 0x9804f14aU, 0xdaec41f7U, 0x50cd7f0eU, 0xf691172fU,
394 | 0xd64d768dU, 0xb0ef434dU, 0x4daacc54U, 0x0496e4dfU,
395 | 0xb5d19ee3U, 0x886a4c1bU, 0x1f2cc1b8U, 0x5165467fU,
396 | 0xea5e9d04U, 0x358c015dU, 0x7487fa73U, 0x410bfb2eU,
397 | 0x1d67b35aU, 0xd2db9252U, 0x5610e933U, 0x47d66d13U,
398 | 0x61d79a8cU, 0x0ca1377aU, 0x14f8598eU, 0x3c13eb89U,
399 | 0x27a9ceeeU, 0xc961b735U, 0xe51ce1edU, 0xb1477a3cU,
400 | 0xdfd29c59U, 0x73f2553fU, 0xce141879U, 0x37c773bfU,
401 | 0xcdf753eaU, 0xaafd5f5bU, 0x6f3ddf14U, 0xdb447886U,
402 | 0xf3afca81U, 0xc468b93eU, 0x3424382cU, 0x40a3c25fU,
403 | 0xc31d1672U, 0x25e2bc0cU, 0x493c288bU, 0x950dff41U,
404 | 0x01a83971U, 0xb30c08deU, 0xe4b4d89cU, 0xc1566490U,
405 | 0x84cb7b61U, 0xb632d570U, 0x5c6c4874U, 0x57b8d042U,
406 | 0xf45150a7U, 0x417e5365U, 0x171ac3a4U, 0x273a965eU,
407 | 0xab3bcb6bU, 0x9d1ff145U, 0xfaacab58U, 0xe34b9303U,
408 | 0x302055faU, 0x76adf66dU, 0xcc889176U, 0x02f5254cU,
409 | 0xe54ffcd7U, 0x2ac5d7cbU, 0x35268044U, 0x62b58fa3U,
410 | 0xb1de495aU, 0xba25671bU, 0xea45980eU, 0xfe5de1c0U,
411 | 0x2fc30275U, 0x4c8112f0U, 0x468da397U, 0xd36bc6f9U,
412 | 0x8f03e75fU, 0x9215959cU, 0x6dbfeb7aU, 0x5295da59U,
413 | 0xbed42d83U, 0x7458d321U, 0xe0492969U, 0xc98e44c8U,
414 | 0xc2756a89U, 0x8ef47879U, 0x58996b3eU, 0xb927dd71U,
415 | 0xe1beb64fU, 0x88f017adU, 0x20c966acU, 0xce7db43aU,
416 | 0xdf63184aU, 0x1ae58231U, 0x51976033U, 0x5362457fU,
417 | 0x64b1e077U, 0x6bbb84aeU, 0x81fe1ca0U, 0x08f9942bU,
418 | 0x48705868U, 0x458f19fdU, 0xde94876cU, 0x7b52b7f8U,
419 | 0x73ab23d3U, 0x4b72e202U, 0x1fe3578fU, 0x55662aabU,
420 | 0xebb20728U, 0xb52f03c2U, 0xc5869a7bU, 0x37d3a508U,
421 | 0x2830f287U, 0xbf23b2a5U, 0x0302ba6aU, 0x16ed5c82U,
422 | 0xcf8a2b1cU, 0x79a792b4U, 0x07f3f0f2U, 0x694ea1e2U,
423 | 0xda65cdf4U, 0x0506d5beU, 0x34d11f62U, 0xa6c48afeU,
424 | 0x2e349d53U, 0xf3a2a055U, 0x8a0532e1U, 0xf6a475ebU,
425 | 0x830b39ecU, 0x6040aaefU, 0x715e069fU, 0x6ebd5110U,
426 | 0x213ef98aU, 0xdd963d06U, 0x3eddae05U, 0xe64d46bdU,
427 | 0x5491b58dU, 0xc471055dU, 0x06046fd4U, 0x5060ff15U,
428 | 0x981924fbU, 0xbdd697e9U, 0x4089cc43U, 0xd967779eU,
429 | 0xe8b0bd42U, 0x8907888bU, 0x19e7385bU, 0xc879dbeeU,
430 | 0x7ca1470aU, 0x427ce90fU, 0x84f8c91eU, 0x00000000U,
431 | 0x80098386U, 0x2b3248edU, 0x111eac70U, 0x5a6c4e72U,
432 | 0x0efdfbffU, 0x850f5638U, 0xae3d1ed5U, 0x2d362739U,
433 | 0x0f0a64d9U, 0x5c6821a6U, 0x5b9bd154U, 0x36243a2eU,
434 | 0x0a0cb167U, 0x57930fe7U, 0xeeb4d296U, 0x9b1b9e91U,
435 | 0xc0804fc5U, 0xdc61a220U, 0x775a694bU, 0x121c161aU,
436 | 0x93e20abaU, 0xa0c0e52aU, 0x223c43e0U, 0x1b121d17U,
437 | 0x090e0b0dU, 0x8bf2adc7U, 0xb62db9a8U, 0x1e14c8a9U,
438 | 0xf1578519U, 0x75af4c07U, 0x99eebbddU, 0x7fa3fd60U,
439 | 0x01f79f26U, 0x725cbcf5U, 0x6644c53bU, 0xfb5b347eU,
440 | 0x438b7629U, 0x23cbdcc6U, 0xedb668fcU, 0xe4b863f1U,
441 | 0x31d7cadcU, 0x63421085U, 0x97134022U, 0xc6842011U,
442 | 0x4a857d24U, 0xbbd2f83dU, 0xf9ae1132U, 0x29c76da1U,
443 | 0x9e1d4b2fU, 0xb2dcf330U, 0x860dec52U, 0xc177d0e3U,
444 | 0xb32b6c16U, 0x70a999b9U, 0x9411fa48U, 0xe9472264U,
445 | 0xfca8c48cU, 0xf0a01a3fU, 0x7d56d82cU, 0x3322ef90U,
446 | 0x4987c74eU, 0x38d9c1d1U, 0xca8cfea2U, 0xd498360bU,
447 | 0xf5a6cf81U, 0x7aa528deU, 0xb7da268eU, 0xad3fa4bfU,
448 | 0x3a2ce49dU, 0x78500d92U, 0x5f6a9bccU, 0x7e546246U,
449 | 0x8df6c213U, 0xd890e8b8U, 0x392e5ef7U, 0xc382f5afU,
450 | 0x5d9fbe80U, 0xd0697c93U, 0xd56fa92dU, 0x25cfb312U,
451 | 0xacc83b99U, 0x1810a77dU, 0x9ce86e63U, 0x3bdb7bbbU,
452 | 0x26cd0978U, 0x596ef418U, 0x9aec01b7U, 0x4f83a89aU,
453 | 0x95e6656eU, 0xffaa7ee6U, 0xbc2108cfU, 0x15efe6e8U,
454 | 0xe7bad99bU, 0x6f4ace36U, 0x9fead409U, 0xb029d67cU,
455 | 0xa431afb2U, 0x3f2a3123U, 0xa5c63094U, 0xa235c066U,
456 | 0x4e7437bcU, 0x82fca6caU, 0x90e0b0d0U, 0xa73315d8U,
457 | 0x04f14a98U, 0xec41f7daU, 0xcd7f0e50U, 0x91172ff6U,
458 | 0x4d768dd6U, 0xef434db0U, 0xaacc544dU, 0x96e4df04U,
459 | 0xd19ee3b5U, 0x6a4c1b88U, 0x2cc1b81fU, 0x65467f51U,
460 | 0x5e9d04eaU, 0x8c015d35U, 0x87fa7374U, 0x0bfb2e41U,
461 | 0x67b35a1dU, 0xdb9252d2U, 0x10e93356U, 0xd66d1347U,
462 | 0xd79a8c61U, 0xa1377a0cU, 0xf8598e14U, 0x13eb893cU,
463 | 0xa9ceee27U, 0x61b735c9U, 0x1ce1ede5U, 0x477a3cb1U,
464 | 0xd29c59dfU, 0xf2553f73U, 0x141879ceU, 0xc773bf37U,
465 | 0xf753eacdU, 0xfd5f5baaU, 0x3ddf146fU, 0x447886dbU,
466 | 0xafca81f3U, 0x68b93ec4U, 0x24382c34U, 0xa3c25f40U,
467 | 0x1d1672c3U, 0xe2bc0c25U, 0x3c288b49U, 0x0dff4195U,
468 | 0xa8397101U, 0x0c08deb3U, 0xb4d89ce4U, 0x566490c1U,
469 | 0xcb7b6184U, 0x32d570b6U, 0x6c48745cU, 0xb8d04257U,
470 | 0x5150a7f4U, 0x7e536541U, 0x1ac3a417U, 0x3a965e27U,
471 | 0x3bcb6babU, 0x1ff1459dU, 0xacab58faU, 0x4b9303e3U,
472 | 0x2055fa30U, 0xadf66d76U, 0x889176ccU, 0xf5254c02U,
473 | 0x4ffcd7e5U, 0xc5d7cb2aU, 0x26804435U, 0xb58fa362U,
474 | 0xde495ab1U, 0x25671bbaU, 0x45980eeaU, 0x5de1c0feU,
475 | 0xc302752fU, 0x8112f04cU, 0x8da39746U, 0x6bc6f9d3U,
476 | 0x03e75f8fU, 0x15959c92U, 0xbfeb7a6dU, 0x95da5952U,
477 | 0xd42d83beU, 0x58d32174U, 0x492969e0U, 0x8e44c8c9U,
478 | 0x756a89c2U, 0xf478798eU, 0x996b3e58U, 0x27dd71b9U,
479 | 0xbeb64fe1U, 0xf017ad88U, 0xc966ac20U, 0x7db43aceU,
480 | 0x63184adfU, 0xe582311aU, 0x97603351U, 0x62457f53U,
481 | 0xb1e07764U, 0xbb84ae6bU, 0xfe1ca081U, 0xf9942b08U,
482 | 0x70586848U, 0x8f19fd45U, 0x94876cdeU, 0x52b7f87bU,
483 | 0xab23d373U, 0x72e2024bU, 0xe3578f1fU, 0x662aab55U,
484 | 0xb20728ebU, 0x2f03c2b5U, 0x869a7bc5U, 0xd3a50837U,
485 | 0x30f28728U, 0x23b2a5bfU, 0x02ba6a03U, 0xed5c8216U,
486 | 0x8a2b1ccfU, 0xa792b479U, 0xf3f0f207U, 0x4ea1e269U,
487 | 0x65cdf4daU, 0x06d5be05U, 0xd11f6234U, 0xc48afea6U,
488 | 0x349d532eU, 0xa2a055f3U, 0x0532e18aU, 0xa475ebf6U,
489 | 0x0b39ec83U, 0x40aaef60U, 0x5e069f71U, 0xbd51106eU,
490 | 0x3ef98a21U, 0x963d06ddU, 0xddae053eU, 0x4d46bde6U,
491 | 0x91b58d54U, 0x71055dc4U, 0x046fd406U, 0x60ff1550U,
492 | 0x1924fb98U, 0xd697e9bdU, 0x89cc4340U, 0x67779ed9U,
493 | 0xb0bd42e8U, 0x07888b89U, 0xe7385b19U, 0x79dbeec8U,
494 | 0xa1470a7cU, 0x7ce90f42U, 0xf8c91e84U, 0x00000000U,
495 | 0x09838680U, 0x3248ed2bU, 0x1eac7011U, 0x6c4e725aU,
496 | 0xfdfbff0eU, 0x0f563885U, 0x3d1ed5aeU, 0x3627392dU,
497 | 0x0a64d90fU, 0x6821a65cU, 0x9bd1545bU, 0x243a2e36U,
498 | 0x0cb1670aU, 0x930fe757U, 0xb4d296eeU, 0x1b9e919bU,
499 | 0x804fc5c0U, 0x61a220dcU, 0x5a694b77U, 0x1c161a12U,
500 | 0xe20aba93U, 0xc0e52aa0U, 0x3c43e022U, 0x121d171bU,
501 | 0x0e0b0d09U, 0xf2adc78bU, 0x2db9a8b6U, 0x14c8a91eU,
502 | 0x578519f1U, 0xaf4c0775U, 0xeebbdd99U, 0xa3fd607fU,
503 | 0xf79f2601U, 0x5cbcf572U, 0x44c53b66U, 0x5b347efbU,
504 | 0x8b762943U, 0xcbdcc623U, 0xb668fcedU, 0xb863f1e4U,
505 | 0xd7cadc31U, 0x42108563U, 0x13402297U, 0x842011c6U,
506 | 0x857d244aU, 0xd2f83dbbU, 0xae1132f9U, 0xc76da129U,
507 | 0x1d4b2f9eU, 0xdcf330b2U, 0x0dec5286U, 0x77d0e3c1U,
508 | 0x2b6c16b3U, 0xa999b970U, 0x11fa4894U, 0x472264e9U,
509 | 0xa8c48cfcU, 0xa01a3ff0U, 0x56d82c7dU, 0x22ef9033U,
510 | 0x87c74e49U, 0xd9c1d138U, 0x8cfea2caU, 0x98360bd4U,
511 | 0xa6cf81f5U, 0xa528de7aU, 0xda268eb7U, 0x3fa4bfadU,
512 | 0x2ce49d3aU, 0x500d9278U, 0x6a9bcc5fU, 0x5462467eU,
513 | 0xf6c2138dU, 0x90e8b8d8U, 0x2e5ef739U, 0x82f5afc3U,
514 | 0x9fbe805dU, 0x697c93d0U, 0x6fa92dd5U, 0xcfb31225U,
515 | 0xc83b99acU, 0x10a77d18U, 0xe86e639cU, 0xdb7bbb3bU,
516 | 0xcd097826U, 0x6ef41859U, 0xec01b79aU, 0x83a89a4fU,
517 | 0xe6656e95U, 0xaa7ee6ffU, 0x2108cfbcU, 0xefe6e815U,
518 | 0xbad99be7U, 0x4ace366fU, 0xead4099fU, 0x29d67cb0U,
519 | 0x31afb2a4U, 0x2a31233fU, 0xc63094a5U, 0x35c066a2U,
520 | 0x7437bc4eU, 0xfca6ca82U, 0xe0b0d090U, 0x3315d8a7U,
521 | 0xf14a9804U, 0x41f7daecU, 0x7f0e50cdU, 0x172ff691U,
522 | 0x768dd64dU, 0x434db0efU, 0xcc544daaU, 0xe4df0496U,
523 | 0x9ee3b5d1U, 0x4c1b886aU, 0xc1b81f2cU, 0x467f5165U,
524 | 0x9d04ea5eU, 0x015d358cU, 0xfa737487U, 0xfb2e410bU,
525 | 0xb35a1d67U, 0x9252d2dbU, 0xe9335610U, 0x6d1347d6U,
526 | 0x9a8c61d7U, 0x377a0ca1U, 0x598e14f8U, 0xeb893c13U,
527 | 0xceee27a9U, 0xb735c961U, 0xe1ede51cU, 0x7a3cb147U,
528 | 0x9c59dfd2U, 0x553f73f2U, 0x1879ce14U, 0x73bf37c7U,
529 | 0x53eacdf7U, 0x5f5baafdU, 0xdf146f3dU, 0x7886db44U,
530 | 0xca81f3afU, 0xb93ec468U, 0x382c3424U, 0xc25f40a3U,
531 | 0x1672c31dU, 0xbc0c25e2U, 0x288b493cU, 0xff41950dU,
532 | 0x397101a8U, 0x08deb30cU, 0xd89ce4b4U, 0x6490c156U,
533 | 0x7b6184cbU, 0xd570b632U, 0x48745c6cU, 0xd04257b8U,
534 | };
535 |
// Round keys of the single-round fill kernel (fillAes1Rx4.cl, num_rounds != 4).
// Entries [4 * sub .. 4 * sub + 3] form the 128-bit key XORed in by the work-item
// that owns state column `sub` (0..3).
__constant static const uint AES_KEY_FILL[16] = {
	0x6daca553U, 0x62716609U, 0xdbb5552bU, 0xb4f44917U, // sub 0
	0x6d7caf07U, 0x846a710dU, 0x1725d378U, 0x0da1dc4eU, // sub 1
	0x3f1262f1U, 0x9f947ec6U, 0xf4c0794fU, 0x3e20e345U, // sub 2
	0x6aef8135U, 0xb1ba317cU, 0x16314c88U, 0x49169154U, // sub 3
};
542 |
// Initial state of hashAes1Rx4.
// Entries [4 * sub .. 4 * sub + 3] are the starting 128-bit value of state column `sub` (0..3).
__constant static const uint AES_STATE_HASH[16] = {
	0x92b52c0dU, 0x9fa856deU, 0xcc82db47U, 0xd7983aadU, // sub 0
	0x338d996eU, 0x15c7b798U, 0xf59e125aU, 0xace78057U, // sub 1
	0x6a770017U, 0xae62c7d0U, 0x5079506bU, 0xe8a07ce4U, // sub 2
	0x630a240cU, 0x07ad828dU, 0x79a10005U, 0x7e994948U, // sub 3
};
549 |
// Returns the 8-bit field of `a` that starts at bit `start_bit`
// (start_bit = 0, 8, 16, 24 selects byte 0..3 of the word).
uint get_byte(uint a, uint start_bit)
{
	const uint shifted = a >> start_bit;
	return shifted & 0xFFU;
}
551 |
552 | #include "randomx_constants.h"
553 |
// fillAes1Rx4.cl is a kernel template: every #include emits one kernel whose name,
// output size, output stride, unroll factor and round count come from the macros
// defined immediately before the #include and removed again right after it.

// Kernel fillAes1Rx4_scratchpad: single-round variant, RANDOMX_SCRATCHPAD_L3 bytes
// generated per hash, consecutive hashes placed (outputSize + 64) bytes apart.
#define num_rounds 1
#define unroll_factor 8
#define outputSize RANDOMX_SCRATCHPAD_L3
#define outputSize0 (outputSize + 64)
#define fillAes_name fillAes1Rx4_scratchpad
#include "fillAes1Rx4.cl"
#undef fillAes_name
#undef outputSize0
#undef outputSize
#undef unroll_factor
#undef num_rounds

// Kernel fillAes4Rx4_entropy: four-round variant, ENTROPY_SIZE bytes generated per
// hash, consecutive hashes packed back to back (stride == size).
#define num_rounds 4
#define unroll_factor 2
#define outputSize ENTROPY_SIZE
#define outputSize0 outputSize
#define fillAes_name fillAes4Rx4_entropy
#include "fillAes1Rx4.cl"
#undef fillAes_name
#undef outputSize0
#undef outputSize
#undef unroll_factor
#undef num_rounds
577 |
// Number of bytes of each input buffer that hashAes1Rx4 consumes.
#define inputSize RANDOMX_SCRATCHPAD_L3

/*
 * hashAes1Rx4 - folds each input buffer into a 64-byte AES-based fingerprint.
 *
 * Four consecutive work-items cooperate on one hash: idx = global id / 4 selects the
 * hash, sub = global id % 4 selects the 16-byte state column the work-item owns.
 * A column starts from AES_STATE_HASH[4 * sub ..], absorbs every fourth 16-byte block
 * of the input as a round key (one table-driven AES round per block) and is finished
 * with two more rounds that use fixed constants.
 *
 * Parameters:
 *   input            - batch of input buffers laid out (inputSize + 64) bytes apart;
 *                      the first inputSize bytes of each one are read
 *   hash             - output buffer
 *   hashOffsetBytes  - byte offset of the 64-byte result inside each output record
 *   hashStrideBytes  - distance in bytes between two output records
 *   batch_size       - number of hashes; work-items with global id >= 4 * batch_size
 *                      write nothing
 *
 * Even and odd columns use different lookup-table sets and swapped s1/s3 shifts
 * (presumably the encrypt / decrypt round variants - confirm against AES_TABLE).
 */
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void hashAes1Rx4(__global const void* input, __global void* hash, uint hashOffsetBytes, uint hashStrideBytes, uint batch_size)
{
	__local uint T[2048];

	// Stage the lookup table in local memory BEFORE the range check. Every work-item
	// of the group has to copy its share of the table and reach the barrier: a
	// work-item that returned early would leave its table slots uninitialized and
	// make the barrier divergent, which OpenCL leaves undefined.
	for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
		T[i] = AES_TABLE[i];

	barrier(CLK_LOCAL_MEM_FENCE);

	const uint global_index = get_global_id(0);
	if (global_index >= batch_size * 4)
		return;

	const uint idx = global_index / 4;
	const uint sub = global_index % 4;

	uint x[4] = { AES_STATE_HASH[sub * 4], AES_STATE_HASH[sub * 4 + 1], AES_STATE_HASH[sub * 4 + 2], AES_STATE_HASH[sub * 4 + 3] };

	// Byte positions fed to tables t1/t3; they swap between even and odd columns.
	const uint s1 = ((sub & 1) == 0) ? 8 : 24;
	const uint s3 = ((sub & 1) == 0) ? 24 : 8;

	// First 16-byte block of this column; the four columns of a hash are interleaved.
	__global const uint4* p = ((__global const uint4*) input) + idx * ((inputSize + 64) / sizeof(uint4)) + sub;

	// Four 256-entry sub-tables, taken from one of two table sets by column parity.
	__local const uint* const t0 = ((sub & 1) == 0) ? T : (T + 1024);
	__local const uint* const t1 = ((sub & 1) == 0) ? (T + 256) : (T + 1792);
	__local const uint* const t2 = ((sub & 1) == 0) ? (T + 512) : (T + 1536);
	__local const uint* const t3 = ((sub & 1) == 0) ? (T + 768) : (T + 1280);

	// Absorb the input: each iteration consumes 64 bytes of the buffer, 16 per column.
	#pragma unroll(8)
	for (uint i = 0; i < inputSize / sizeof(uint4); i += 4, p += 4)
	{
		uint k[4], y[4];
		*(uint4*)(k) = *p;
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[3];
		x[0] = y[0];
		x[1] = y[1];
		x[2] = y[2];
		x[3] = y[3];
	}

	uint y[4];

	// Two finishing rounds with fixed keys (identical for all four columns).
	y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0xf6fa8389;
	y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0x8b24949f;
	y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0x90dc56bf;
	y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0x06890201;

	x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x61b263d1;
	x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0x51f4e03c;
	x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0xee1043c6;
	x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0xed18f99b;

	// Store this column at its slot inside the hash's output record.
	*((__global uint4*)(hash) + idx * (hashStrideBytes / sizeof(uint4)) + sub + (hashOffsetBytes / sizeof(uint4))) = *(uint4*)(x);
}
638 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/blake2b.cl:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
// Length in bytes of the message hashed by the single-block BLAKE2b code in this file.
#define BLOCK_TEMPLATE_SIZE 76

// Message schedule: row r holds the 16 indices into m[] consumed, in order, by round r.
// Rows 10 and 11 repeat rows 0 and 1 so that all 12 rounds can index the table directly.
__constant static const uchar blake2b_sigma[12 * 16] = {
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3,
	11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4,
	 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8,
	 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13,
	 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9,
	12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11,
	13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10,
	 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5,
	10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3,
};
36 |
// BLAKE2b initialization vector: the eight 64-bit words that seed the chaining value
// and the upper half of the working vector.
enum Blake2b_IV
{
	iv0 = 0x6A09E667F3BCC908UL,
	iv1 = 0xBB67AE8584CAA73BUL,
	iv2 = 0x3C6EF372FE94F82BUL,
	iv3 = 0xA54FF53A5F1D36F1UL,
	iv4 = 0x510E527FADE682D1UL,
	iv5 = 0x9B05688C2B3E6C1FUL,
	iv6 = 0x1F83D9ABFB41BD6BUL,
	iv7 = 0x5BE0CD19137E2179UL,
};
48 |
// Rotates `a` right by `shift` bits. OpenCL's rotate() rotates left, so the right
// rotation is expressed as a left rotation by the complementary amount.
ulong rotr64(ulong a, ulong shift)
{
	const ulong left_amount = 64 - shift;
	return rotate(a, left_amount);
}
50 |
// G: the BLAKE2b mixing step. r = round number, i = position (0..7) of the step inside
// the round; a, b, c, d are lvalues naming four words of the working vector.
// Expects an array `m` (the 16 message words) to be in scope where it is expanded.
#define G(r, i, a, b, c, d) \
	do { \
		a += b + m[blake2b_sigma[r * 16 + 2 * i + 0]]; \
		d = rotr64(d ^ a, 32); \
		c += d; \
		b = rotr64(b ^ c, 24); \
		a += b + m[blake2b_sigma[r * 16 + 2 * i + 1]]; \
		d = rotr64(d ^ a, 16); \
		c += d; \
		b = rotr64(b ^ c, 63); \
	} while (0)

// ROUND: one full round over the working vector `v` (must be in scope):
// four column steps followed by four diagonal steps.
#define ROUND(r) \
	do { \
		G(r, 0, v[0], v[4], v[ 8], v[12]); \
		G(r, 1, v[1], v[5], v[ 9], v[13]); \
		G(r, 2, v[2], v[6], v[10], v[14]); \
		G(r, 3, v[3], v[7], v[11], v[15]); \
		G(r, 4, v[0], v[5], v[10], v[15]); \
		G(r, 5, v[1], v[6], v[11], v[12]); \
		G(r, 6, v[2], v[7], v[ 8], v[13]); \
		G(r, 7, v[3], v[4], v[ 9], v[14]); \
	} while (0)

// All twelve rounds of the compression function, in order.
#define BLAKE2B_ROUNDS() \
	ROUND(0);ROUND(1);ROUND(2);ROUND(3); \
	ROUND(4);ROUND(5);ROUND(6);ROUND(7); \
	ROUND(8);ROUND(9);ROUND(10);ROUND(11);
76 |
// Hashes a message that fits in one 128-byte block (BLOCK_TEMPLATE_SIZE bytes,
// zero-padded in m[]) and writes the 64-byte digest to h[0..7].
//   h - receives the eight output words
//   m - the sixteen message words; not modified
void blake2b_512_process_single_block(ulong *h, const ulong* m)
{
	// iv0 combined with the parameter word 0x01010040 (low byte 0x40 = 64-byte digest).
	const ulong h0 = iv0 ^ 0x01010040;

	ulong v[16];

	// Lower half: initial chaining value.
	v[0] = h0;
	v[1] = iv1;
	v[2] = iv2;
	v[3] = iv3;
	v[4] = iv4;
	v[5] = iv5;
	v[6] = iv6;
	v[7] = iv7;

	// Upper half: IV with the byte counter folded into word 12 and word 14 inverted,
	// which marks this block as the last (and only) one.
	v[ 8] = iv0;
	v[ 9] = iv1;
	v[10] = iv2;
	v[11] = iv3;
	v[12] = iv4 ^ BLOCK_TEMPLATE_SIZE;
	v[13] = iv5;
	v[14] = ~iv6;
	v[15] = iv7;

	BLAKE2B_ROUNDS();

	// Feed-forward: fold both halves of the working vector into the initial chaining value.
	h[0] = h0  ^ v[0] ^ v[ 8];
	h[1] = iv1 ^ v[1] ^ v[ 9];
	h[2] = iv2 ^ v[2] ^ v[10];
	h[3] = iv3 ^ v[3] ^ v[11];
	h[4] = iv4 ^ v[4] ^ v[12];
	h[5] = iv5 ^ v[5] ^ v[13];
	h[6] = iv6 ^ v[6] ^ v[14];
	h[7] = iv7 ^ v[7] ^ v[15];
}
96 |
// Computes one 64-byte BLAKE2b digest per work-item over the block template with a
// per-work-item nonce patched in.
//   out           - receives 8 ulong words per work-item, at index global id * 8
//   blockTemplate - the BLOCK_TEMPLATE_SIZE-byte message; it is read in whole 8-byte
//                   words, so the last read may extend past the template's final byte
//   start_nonce   - nonce of work-item 0; work-item g hashes with start_nonce + g
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_initial_hash(__global void *out, __global const void* blockTemplate, uint start_nonce)
{
	const uint global_index = get_global_id(0);
	__global const ulong* p = (__global const ulong*) blockTemplate;

	// Load the message; words entirely beyond the template are zero (compile-time choice).
	ulong m[16] = {
		(BLOCK_TEMPLATE_SIZE >   0) ? p[ 0] : 0,
		(BLOCK_TEMPLATE_SIZE >   8) ? p[ 1] : 0,
		(BLOCK_TEMPLATE_SIZE >  16) ? p[ 2] : 0,
		(BLOCK_TEMPLATE_SIZE >  24) ? p[ 3] : 0,
		(BLOCK_TEMPLATE_SIZE >  32) ? p[ 4] : 0,
		(BLOCK_TEMPLATE_SIZE >  40) ? p[ 5] : 0,
		(BLOCK_TEMPLATE_SIZE >  48) ? p[ 6] : 0,
		(BLOCK_TEMPLATE_SIZE >  56) ? p[ 7] : 0,
		(BLOCK_TEMPLATE_SIZE >  64) ? p[ 8] : 0,
		(BLOCK_TEMPLATE_SIZE >  72) ? p[ 9] : 0,
		(BLOCK_TEMPLATE_SIZE >  80) ? p[10] : 0,
		(BLOCK_TEMPLATE_SIZE >  88) ? p[11] : 0,
		(BLOCK_TEMPLATE_SIZE >  96) ? p[12] : 0,
		(BLOCK_TEMPLATE_SIZE > 104) ? p[13] : 0,
		(BLOCK_TEMPLATE_SIZE > 112) ? p[14] : 0,
		(BLOCK_TEMPLATE_SIZE > 120) ? p[15] : 0,
	};

	// Zero the bytes of the last, partially used word that lie past the template end.
	if (BLOCK_TEMPLATE_SIZE % sizeof(ulong))
		m[BLOCK_TEMPLATE_SIZE / sizeof(ulong)] &= (ulong)(-1) >> (64 - (BLOCK_TEMPLATE_SIZE % sizeof(ulong)) * 8);

	// Patch in the 32-bit nonce: its low byte replaces the top byte of m[4], its upper
	// three bytes replace the low three bytes of m[5] (message bytes 39..42, assuming
	// little-endian word layout).
	const ulong keep_low_56_bits = (ulong)(-1) >> 8;
	const ulong keep_high_40_bits = (ulong)(-1) << 24;
	const ulong nonce = start_nonce + global_index;
	m[4] = (m[4] & keep_low_56_bits) | (nonce << 56);
	m[5] = (m[5] & keep_high_40_bits) | (nonce >> 8);

	ulong hash[8];
	blake2b_512_process_single_block(hash, m);

	__global ulong* t = ((__global ulong*) out) + global_index * 8;
	#pragma unroll
	for (uint i = 0; i < 8; ++i)
		t[i] = hash[i];
}
142 |
// Benchmark kernel for the single-block hash: message word 0 is replaced by a
// per-work-item nonce, the rest comes from `in`.
//   out         - receives the nonce (m[0]) of a work-item whose digest has its last
//                 32-bit word equal to zero
//   in          - message template; words 1.. are read in whole 8-byte units
//   start_nonce - nonce of work-item 0
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_512_single_block_bench(__global ulong *out, __global const void* in, ulong start_nonce)
{
	const uint global_index = get_global_id(0);
	__global const ulong* p = (__global const ulong*) in;

	ulong m[16] = {
		start_nonce + global_index,
		(BLOCK_TEMPLATE_SIZE >   8) ? p[ 1] : 0,
		(BLOCK_TEMPLATE_SIZE >  16) ? p[ 2] : 0,
		(BLOCK_TEMPLATE_SIZE >  24) ? p[ 3] : 0,
		(BLOCK_TEMPLATE_SIZE >  32) ? p[ 4] : 0,
		(BLOCK_TEMPLATE_SIZE >  40) ? p[ 5] : 0,
		(BLOCK_TEMPLATE_SIZE >  48) ? p[ 6] : 0,
		(BLOCK_TEMPLATE_SIZE >  56) ? p[ 7] : 0,
		(BLOCK_TEMPLATE_SIZE >  64) ? p[ 8] : 0,
		(BLOCK_TEMPLATE_SIZE >  72) ? p[ 9] : 0,
		(BLOCK_TEMPLATE_SIZE >  80) ? p[10] : 0,
		(BLOCK_TEMPLATE_SIZE >  88) ? p[11] : 0,
		(BLOCK_TEMPLATE_SIZE >  96) ? p[12] : 0,
		(BLOCK_TEMPLATE_SIZE > 104) ? p[13] : 0,
		(BLOCK_TEMPLATE_SIZE > 112) ? p[14] : 0,
		(BLOCK_TEMPLATE_SIZE > 120) ? p[15] : 0,
	};

	// Zero the bytes of the last, partially used word that lie past the message end.
	if (BLOCK_TEMPLATE_SIZE % sizeof(ulong))
		m[BLOCK_TEMPLATE_SIZE / sizeof(ulong)] &= (ulong)(-1) >> (64 - (BLOCK_TEMPLATE_SIZE % sizeof(ulong)) * 8);

	ulong hash[8];
	blake2b_512_process_single_block(hash, m);

	// hash viewed as sixteen 32-bit words; word 15 is the last one.
	const uint last_word = ((uint*) hash)[15];
	if (last_word == 0)
		*out = m[0];
}
177 |
// blake2b_double_block.cl is a template: each #include emits one hashing function and
// one kernel, named by the two *_name macros and specialised by in_len / out_len.

// Input length in bytes shared by both instantiations (two 128-byte blocks).
#define in_len 256

// 32-byte digest variant.
#define blake2b_hash_registers_name blake2b_hash_registers_32
#define blake2b_512_process_double_block_name blake2b_512_process_double_block_32
#define out_len 32
#include "blake2b_double_block.cl"
#undef out_len
#undef blake2b_512_process_double_block_name
#undef blake2b_hash_registers_name

// 64-byte digest variant.
#define blake2b_hash_registers_name blake2b_hash_registers_64
#define blake2b_512_process_double_block_name blake2b_512_process_double_block_64
#define out_len 64
#include "blake2b_double_block.cl"
#undef out_len
#undef blake2b_512_process_double_block_name
#undef blake2b_hash_registers_name
195 |
// Benchmark kernel for the two-block hash: the first message word is replaced by a
// per-work-item nonce, the remaining words of both blocks come from `in`.
//   out         - receives start_nonce + global id of a work-item whose digest has
//                 its last 32-bit word equal to zero
//   in          - in_len-byte message template
//   start_nonce - nonce of work-item 0
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_512_double_block_bench(__global ulong *out, __global const void* in, ulong start_nonce)
{
	const uint global_index = get_global_id(0);
	__global const ulong* p = (__global const ulong*) in;

	// First block: nonce in word 0, template words 1..15 behind it.
	ulong m[16];
	m[0] = start_nonce + global_index;
	#pragma unroll
	for (uint i = 1; i < 16; ++i)
		m[i] = p[i];

	ulong hash[8];
	blake2b_512_process_double_block_64(hash, m, p);

	// hash viewed as sixteen 32-bit words; word 15 is the last one.
	const uint last_word = ((uint*) hash)[15];
	if (last_word == 0)
		*out = start_nonce + global_index;
}
210 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/blake2b_double_block.cl:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
// Hashes an in_len-byte message spanning two 128-byte blocks and writes the first
// out_len bytes of the digest to `out`.
//   out - receives out_len / 8 words
//   m   - on entry the 16 words of the first block; overwritten with the second block
//   in  - the full message in global memory; words 16..31 provide the second block
void blake2b_512_process_double_block_name(ulong *out, ulong* m, __global const ulong* in)
{
	// iv0 combined with the parameter word; its low byte is the digest length.
	const ulong h0_init = iv0 ^ (0x01010000u | out_len);

	// ---- block 1: byte counter 128, not the final block (word 14 left as is) ----
	ulong v[16] =
	{
		h0_init, iv1, iv2, iv3, iv4      , iv5, iv6, iv7,
		iv0    , iv1, iv2, iv3, iv4 ^ 128, iv5, iv6, iv7,
	};

	BLAKE2B_ROUNDS();

	// Chaining value after block 1.
	ulong h[8];
	h[0] = v[0] ^ v[ 8] ^ h0_init;
	h[1] = v[1] ^ v[ 9] ^ iv1;
	h[2] = v[2] ^ v[10] ^ iv2;
	h[3] = v[3] ^ v[11] ^ iv3;
	h[4] = v[4] ^ v[12] ^ iv4;
	h[5] = v[5] ^ v[13] ^ iv5;
	h[6] = v[6] ^ v[14] ^ iv6;
	h[7] = v[7] ^ v[15] ^ iv7;

	// ---- block 2: byte counter in_len, final block (word 14 inverted) ----
	#pragma unroll
	for (uint i = 0; i < 8; ++i)
		v[i] = h[i];

	v[ 8] = iv0;
	v[ 9] = iv1;
	v[10] = iv2;
	v[11] = iv3;
	v[12] = iv4 ^ in_len;
	v[13] = iv5;
	v[14] = ~iv6;
	v[15] = iv7;

	// Load the second block; words entirely beyond in_len are zero.
	#pragma unroll
	for (uint i = 0; i < 16; ++i)
		m[i] = (in_len > 128 + 8 * i) ? in[16 + i] : 0;

	// Zero the bytes of the last, partially used word that lie past the message end.
	if (in_len % sizeof(ulong))
		m[(in_len - 128) / sizeof(ulong)] &= (ulong)(-1) >> (64 - (in_len % sizeof(ulong)) * 8);

	BLAKE2B_ROUNDS();

	// Final feed-forward, truncated to out_len bytes.
	#pragma unroll
	for (uint i = 0; i < 8; ++i)
	{
		if (out_len > 8 * i)
			out[i] = h[i] ^ v[i] ^ v[i + 8];
	}
}
79 |
// One work-item hashes one in_len-byte input record and stores an out_len-byte digest.
//   out           - digests, packed out_len bytes apart
//   in            - input records
//   inStrideBytes - distance in bytes between two input records
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void blake2b_hash_registers_name(__global void *out, __global const void* in, uint inStrideBytes)
{
	const uint global_index = get_global_id(0);

	__global const ulong* p = ((__global const ulong*) in) + global_index * (inStrideBytes / sizeof(ulong));
	__global ulong* h = ((__global ulong*) out) + global_index * (out_len / sizeof(ulong));

	// First 128-byte block of the record; the callee fetches the second one itself.
	ulong m[16];
	#pragma unroll
	for (uint i = 0; i < 16; ++i)
		m[i] = p[i];

	ulong hash[8];
	blake2b_512_process_double_block_name(hash, m, p);

	// Store only the words covered by out_len.
	#pragma unroll
	for (uint i = 0; i < 8; ++i)
	{
		if (out_len > 8 * i)
			h[i] = hash[i];
	}
}
101 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/fillAes1Rx4.cl:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
/*
 * fillAes_name - kernel template that expands a 64-byte state into a stream of
 * pseudo-random bytes using table-driven AES rounds.
 *
 * Instantiated by the including file through these macros:
 *   fillAes_name  - name of the generated kernel
 *   outputSize    - bytes generated per hash
 *   outputSize0   - distance in bytes between the outputs of consecutive hashes
 *   unroll_factor - unroll hint for the generation loop
 *   num_rounds    - 4 selects the four-round variant, anything else the one-round one
 *
 * Four consecutive work-items cooperate on one hash: idx = global id / 4 selects the
 * hash, sub = global id % 4 selects the 16-byte state column the work-item owns. Each
 * loop iteration produces one 16-byte block per column (64 bytes per hash).
 *
 * Parameters:
 *   state      - 64 bytes per hash; read at the start and overwritten with the final
 *                state at the end
 *   out        - output buffer
 *   batch_size - number of hashes; work-items with global id >= 4 * batch_size
 *                write nothing
 */
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void fillAes_name(__global void* state, __global void* out, uint batch_size)
{
	__local uint T[2048];

	// Stage the lookup table in local memory BEFORE the range check. Every work-item
	// of the group has to copy its share of the table and reach the barrier: a
	// work-item that returned early would leave its table slots uninitialized and
	// make the barrier divergent, which OpenCL leaves undefined.
	for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
		T[i] = AES_TABLE[i];

	barrier(CLK_LOCAL_MEM_FENCE);

	const uint global_index = get_global_id(0);
	if (global_index >= batch_size * 4)
		return;

	const uint idx = global_index / 4;
	const uint sub = global_index % 4;

#if num_rounds != 4
	// One-round variant: a single 128-bit key per column.
	const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
#else
	// Four-round variant: four 128-bit keys; columns 0-1 share one key set, columns 2-3 the other.
	const bool b = (sub < 2);
	uint k[16];
	k[ 0] = b ? 0x6421aaddu : 0xb5826f73u;
	k[ 1] = b ? 0xd1833ddbu : 0xe3d6a7a6u;
	k[ 2] = b ? 0x2f546d2bu : 0x3d518b6du;
	k[ 3] = b ? 0x99e5d23fu : 0x229effb4u;
	k[ 4] = b ? 0xb20e3450u : 0xc7566bf3u;
	k[ 5] = b ? 0xb6913f55u : 0x9c10b3d9u;
	k[ 6] = b ? 0x06f79d53u : 0xe9024d4eu;
	k[ 7] = b ? 0xa5dfcde5u : 0xb272b7d2u;
	k[ 8] = b ? 0x5c3ed904u : 0xf273c9e7u;
	k[ 9] = b ? 0x515e7bafu : 0xf765a38bu;
	k[10] = b ? 0x0aa4679fu : 0x2ba9660au;
	k[11] = b ? 0x171c02bfu : 0xf63befa7u;
	k[12] = b ? 0x85623763u : 0x7a7cd609u;
	k[13] = b ? 0xe78f5d08u : 0x915839deu;
	k[14] = b ? 0xcd673785u : 0x0c06d1fdu;
	k[15] = b ? 0xd8ded291u : 0xc0b0762du;
#endif

	// This work-item's 16-byte column of the hash's 64-byte state.
	__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
	uint x[4] = { s[0], s[1], s[2], s[3] };

	// Byte positions fed to tables t1/t3; they swap between odd and even columns.
	const uint s1 = (sub & 1) ? 8 : 24;
	const uint s3 = (sub & 1) ? 24 : 8;

	// First output block of this column; the four columns of a hash are interleaved.
	__global uint4* p = ((__global uint4*) out) + idx * (outputSize0 / sizeof(uint4)) + sub;

	// Four 256-entry sub-tables, taken from one of two table sets by column parity.
	const __local uint* const t0 = (sub & 1) ? T : (T + 1024);
	const __local uint* const t1 = (sub & 1) ? (T + 256) : (T + 1792);
	const __local uint* const t2 = (sub & 1) ? (T + 512) : (T + 1536);
	const __local uint* const t3 = (sub & 1) ? (T + 768) : (T + 1280);

	#pragma unroll(unroll_factor)
	for (uint i = 0; i < outputSize / sizeof(uint4); i += 4, p += 4)
	{
		uint y[4];

#if num_rounds != 4
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[3];

		// The round output is both the next output block and the next state.
		*p = *(uint4*)(y);

		x[0] = y[0];
		x[1] = y[1];
		x[2] = y[2];
		x[3] = y[3];
#else
		// Four chained rounds, ping-ponging between x and y, one key per round.
		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 0];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 1];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[ 2];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[ 3];

		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[ 4];
		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[ 5];
		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[ 6];
		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[ 7];

		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 8];
		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 9];
		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[10];
		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[11];

		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[12];
		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[13];
		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[14];
		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[15];

		*p = *(uint4*)(x);
#endif
	}

	// Persist the final column so a later launch continues from this state.
	*(__global uint4*)(s) = *(uint4*)(x);
}
117 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/randomx_constants.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #pragma once
21 |
// Dataset base size in bytes (2 GiB). Must be a power of 2.
#define RANDOMX_DATASET_BASE_SIZE   2147483648

// Dataset extra size in bytes. Must be divisible by 64.
#define RANDOMX_DATASET_EXTRA_SIZE  33554368

// Scratchpad L3 size in bytes (2 MiB). Must be a power of 2.
#define RANDOMX_SCRATCHPAD_L3       2097152

// Scratchpad L2 size in bytes (256 KiB). Must be a power of 2 and <= RANDOMX_SCRATCHPAD_L3.
#define RANDOMX_SCRATCHPAD_L2       262144

// Scratchpad L1 size in bytes (16 KiB). Must be a power of 2, at least 64 and <= RANDOMX_SCRATCHPAD_L2.
#define RANDOMX_SCRATCHPAD_L1       16384

// Width of the jump condition mask, in bits.
#define RANDOMX_JUMP_BITS           8

// Bit offset of the jump condition mask. RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET must not exceed 16.
#define RANDOMX_JUMP_OFFSET         8
42 |
// Instruction frequencies. The values below add up to 256.

// Integer instructions
#define RANDOMX_FREQ_IADD_RS   16
#define RANDOMX_FREQ_IADD_M    7
#define RANDOMX_FREQ_ISUB_R    16
#define RANDOMX_FREQ_ISUB_M    7
#define RANDOMX_FREQ_IMUL_R    16
#define RANDOMX_FREQ_IMUL_M    4
#define RANDOMX_FREQ_IMULH_R   4
#define RANDOMX_FREQ_IMULH_M   1
#define RANDOMX_FREQ_ISMULH_R  4
#define RANDOMX_FREQ_ISMULH_M  1
#define RANDOMX_FREQ_IMUL_RCP  8
#define RANDOMX_FREQ_INEG_R    2
#define RANDOMX_FREQ_IXOR_R    15
#define RANDOMX_FREQ_IXOR_M    5
#define RANDOMX_FREQ_IROR_R    8
#define RANDOMX_FREQ_IROL_R    2
#define RANDOMX_FREQ_ISWAP_R   4

// Floating point instructions
#define RANDOMX_FREQ_FSWAP_R   4
#define RANDOMX_FREQ_FADD_R    16
#define RANDOMX_FREQ_FADD_M    5
#define RANDOMX_FREQ_FSUB_R    16
#define RANDOMX_FREQ_FSUB_M    5
#define RANDOMX_FREQ_FSCAL_R   6
#define RANDOMX_FREQ_FMUL_R    32
#define RANDOMX_FREQ_FDIV_M    4
#define RANDOMX_FREQ_FSQRT_R   6

// Control instructions
#define RANDOMX_FREQ_CBRANCH   25
#define RANDOMX_FREQ_CFROUND   1

// Store instruction
#define RANDOMX_FREQ_ISTORE    16

// No-op instruction
#define RANDOMX_FREQ_NOP       0
82 |
// Size of one dataset item, in bytes.
#define RANDOMX_DATASET_ITEM_SIZE  64

// Number of instructions in one program.
#define RANDOMX_PROGRAM_SIZE       256

// Buffer sizes in bytes, derived from the program size.
#define HASH_SIZE        64
#define ENTROPY_SIZE     (128 + RANDOMX_PROGRAM_SIZE * 8)
#define REGISTERS_SIZE   256
#define IMM_BUF_SIZE     (RANDOMX_PROGRAM_SIZE * 4 - REGISTERS_SIZE)
#define IMM_INDEX_COUNT  ((IMM_BUF_SIZE / 4) - 2)
#define VM_STATE_SIZE    (REGISTERS_SIZE + IMM_BUF_SIZE + RANDOMX_PROGRAM_SIZE * 4)

// -1 when the CFROUND instruction is enabled (non-zero frequency), 0 otherwise.
#define ROUNDING_MODE    (RANDOMX_FREQ_CFROUND ? -1 : 0)

// Scratchpad L1/L2/L3 bits: 32 minus log2 of the 16 KiB / 256 KiB / 2 MiB level size.
#define LOC_L1  (32 - 14)
#define LOC_L2  (32 - 18)
#define LOC_L3  (32 - 21)
99 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/randomx_constants_jit.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #pragma once
21 |
22 | #define INITIAL_HASH_SIZE 64
23 | #define INTERMEDIATE_PROGRAM_SIZE (RANDOMX_PROGRAM_SIZE * 16) // needs RANDOMX_PROGRAM_SIZE (randomx_constants.h) to be defined at the point of use
24 | #define COMPILED_PROGRAM_SIZE 10048 // byte stride between per-group programs in randomx_run; the GCN assembly hard-codes the same 10048
25 | #define NUM_VGPR_REGISTERS 128 // same value as .vgprsnum in the GCNASM kernels - presumably must be kept in sync, confirm
26 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/CL/randomx_run.cl:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 | Portions Copyright (c) 2018-2019 tevador
4 |
5 | This file is part of RandomX OpenCL.
6 |
7 | RandomX OpenCL is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | RandomX OpenCL is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
19 | */
20 |
21 | #include "randomx_constants.h"
22 | #include "randomx_constants_jit.h"
23 |
24 | #define REGISTERS_COUNT (REGISTERS_SIZE / 8) // number of 64-bit registers stored in REGISTERS_SIZE bytes
25 |
26 | #define CacheLineSize 64U
27 | #define CacheLineAlignMask (((1U << 31) - 1) & ~(CacheLineSize - 1)) // 0x7FFFFFC0; fully parenthesised so it expands safely inside any expression
28 |
29 | #define mantissaSize 52
30 | #define dynamicExponentBits 4
31 | #define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1) // low 56 bits set
32 |
33 | double load_F_E_groups(int value, ulong andMask, ulong orMask)
34 | {
35 | // Convert the integer to double, then patch its raw 64-bit pattern: keep the andMask bits and force the orMask bits on.
36 | const ulong raw = as_ulong(convert_double_rte(value));
37 | const ulong patched = (raw & andMask) | orMask;
38 | return as_double(patched);
39 | }
40 | // randomx_run: reference OpenCL form of the VM main loop. One work-group of 32 work-items serves one VM instance (idx = group id); only work-items 0-7 run the loop.
41 | // This kernel is only used to dump binary and disassemble it into randomx_run.asm
42 | __attribute__((reqd_work_group_size(32, 1, 1)))
43 | __kernel void randomx_run(__global const uchar* dataset, __global uchar* scratchpad, __global ulong* registers, __global uint* rounding_modes, __global uint* programs, uint batch_size, uint rx_parameters)
44 | {
45 | __local ulong2 R_buf[REGISTERS_COUNT / 2]; // local copy of the register file: REGISTERS_COUNT ulongs in total
46 |
47 | const uint idx = get_group_id(0); // selects this group's slot in registers / scratchpad / rounding_modes
48 | const uint sub = get_local_id(0); // work-item index inside the group; also the R[] element this work-item owns
49 |
50 | const uint program_iterations = 1U << (rx_parameters >> 15); // rx_parameters bits 15 and up: log2 of the iteration count
51 | const uint ScratchpadL3Size = 1U << ((rx_parameters >> 10) & 31); // rx_parameters bits 10..14: log2 of the L3 scratchpad size
52 | const uint ScratchpadL3Mask64 = ScratchpadL3Size - 64; // AND-ing with this keeps an offset 64-byte aligned and below the (power-of-two) size
53 |
54 | __local ulong* R = (__local ulong*)(R_buf);
55 |
56 | __local double* F = (__local double*)(R + 8); // F aliases R[8..]
57 | __local double* E = (__local double*)(R + 16); // E aliases R[16..]; those slots initially hold the launch parameters read below
58 |
59 | registers += idx * REGISTERS_COUNT;
60 | scratchpad += idx * (ulong)(ScratchpadL3Size + 64); // per-group stride is the L3 size plus 64 bytes
61 | rounding_modes += idx; // advanced but never dereferenced in this reference kernel
62 | programs += get_group_id(0) * (COMPILED_PROGRAM_SIZE / sizeof(uint)); // each group owns a COMPILED_PROGRAM_SIZE-byte program slot
63 |
64 | // Copy registers (256 bytes) into shared memory: 32 workers, 8 bytes for each worker
65 | ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub];
66 | barrier(CLK_LOCAL_MEM_FENCE);
67 | // NOTE(review): after this early return the barrier inside the loop is reached by only 8 of the 32 work-items; presumably this relies on the group being one wavefront - confirm before reusing the kernel as-is.
68 | if (sub >= 8)
69 | return;
70 |
71 | uint mx = ((__local uint*)(R + 16))[1]; // second 32-bit word of R[16]
72 | uint ma = ((__local uint*)(R + 16))[0]; // first 32-bit word of R[16]
73 | // The four 32-bit words of R[17] and R[18] are used as indices into R[] below.
74 | const uint readReg0 = ((__local uint*)(R + 17))[0];
75 | const uint readReg1 = ((__local uint*)(R + 17))[1];
76 | const uint readReg2 = ((__local uint*)(R + 17))[2];
77 | const uint readReg3 = ((__local uint*)(R + 17))[3];
78 |
79 | const uint datasetOffset = ((__local uint*)(R + 19))[0]; // byte offset added to the dataset base pointer
80 | dataset += datasetOffset;
81 |
82 | uint spAddr0 = mx;
83 | uint spAddr1 = ma;
84 |
85 | const bool f_group = (sub < 4); // work-items 0-3 fill F, work-items 4-7 fill E
86 | __local double* fe = f_group ? (F + sub * 2) : (E + (sub - 4) * 2); // every work-item owns two consecutive doubles
87 |
88 | const ulong andMask = f_group ? (ulong)(-1) : dynamicMantissaMask; // F keeps all bits; E keeps only the low 56 bits
89 | const ulong orMask1 = f_group ? 0 : R[20]; // bits OR-ed into E values come from R[20] / R[21], read before the loop overwrites them
90 | const ulong orMask2 = f_group ? 0 : R[21];
91 |
92 | #pragma unroll(1)
93 | for (uint ic = 0; ic < program_iterations; ++ic)
94 | {
95 | const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
96 | spAddr0 ^= spMix.x;
97 | spAddr0 &= ScratchpadL3Mask64;
98 | spAddr1 ^= spMix.y;
99 | spAddr1 &= ScratchpadL3Mask64;
100 | // p0 / p1: this work-item's 8-byte slice of the 64-byte scratchpad lines at spAddr0 / spAddr1
101 | __global ulong* p0 = (__global ulong*)(scratchpad + (spAddr0 + sub * 8));
102 | __global ulong* p1 = (__global ulong*)(scratchpad + (spAddr1 + sub * 8));
103 |
104 | R[sub] ^= *p0;
105 |
106 | const int2 q = as_int2(*p1); // two signed 32-bit integers, each converted to a double below
107 | fe[0] = load_F_E_groups(q.x, andMask, orMask1);
108 | fe[1] = load_F_E_groups(q.y, andMask, orMask2);
109 |
110 | barrier(CLK_LOCAL_MEM_FENCE);
111 |
112 | // TODO:
113 | //
114 | // 1) Compile with atomic_inc uncommented
115 | // 2) clrxdisasm -C randomx.bin > randomx.asm
116 | // 3) Replace GLOBAL_ATOMIC_ADD in randomx.asm with a call to JIT code (S_SWAPPC_B64 to call, S_SETPC_B64 to return)
117 | // 4) clrxasm randomx.asm -o randomx.bin
118 | // 5) ???
119 | // 6) PROFIT!!!
120 |
121 | atomic_inc(programs); // placeholder marking the spot where the JIT call is patched in (see the steps above)
122 |
123 | mx ^= R[readReg2] ^ R[readReg3];
124 | mx &= CacheLineAlignMask;
125 |
126 | const ulong data = *(__global const ulong*)(dataset + ma + sub * 8); // this work-item's 8 bytes of the dataset line at offset ma
127 |
128 | const ulong next_r = R[sub] ^ data;
129 | R[sub] = next_r;
130 |
131 | *p1 = next_r;
132 | *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]);
133 |
134 | uint tmp = ma; // swap ma and mx for the next iteration
135 | ma = mx;
136 | mx = tmp;
137 |
138 | spAddr0 = 0; // from the second iteration on, only spMix contributes to the scratchpad addresses
139 | spAddr1 = 0;
140 | }
141 | // Write back: registers[0..7] = R, registers[8..15] = F ^ E, registers[16..23] = E
142 | registers[sub] = R[sub];
143 | registers[sub + 8] = as_ulong(F[sub]) ^ as_ulong(E[sub]);
144 | registers[sub + 16] = as_ulong(E[sub]);
145 | }
146 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/GCNASM/randomx_run_gfx1010.asm:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019-2020 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | .rocm
21 | .gpu GFX1010
22 | .arch_minor 1
23 | .arch_stepping 0
24 | .eflags 53
25 | .llvm10binfmt
26 | .metadatav3
27 | .md_version 1, 0
28 | .globaldata
29 | .fill 64, 1, 0
30 | .kernel randomx_run
31 | .config
32 | .dims x
33 | .sgprsnum 96
34 | .vgprsnum 128
35 | .shared_vgprs 0
36 | .dx10clamp
37 | .ieeemode
38 | .floatmode 0xf0
39 | .priority 0
40 | .exceptions 0
41 | .userdatanum 6
42 |
43 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc1-gfx6-gfx10-table
44 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx10-table
45 | # https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc3-gfx10-table
46 | .pgmrsrc1 0x40af0105
47 | .pgmrsrc2 0x0000008c
48 | .pgmrsrc3 0x00000000
49 |
50 | .group_segment_fixed_size 256
51 | .private_segment_fixed_size 0
52 | .kernel_code_entry_offset 0x10c0
53 | .use_private_segment_buffer
54 | .use_kernarg_segment_ptr
55 | .use_wave32
56 | .config
57 | .md_symname "randomx_run.kd"
58 | .md_language "OpenCL C", 1, 2
59 | .reqd_work_group_size 32, 1, 1
60 | .md_kernarg_segment_size 104
61 | .md_kernarg_segment_align 8
62 | .md_group_segment_fixed_size 256
63 | .md_private_segment_fixed_size 0
64 | .md_wavefront_size 32
65 | .md_sgprsnum 96
66 | .md_vgprsnum 128
67 | .spilledsgprs 0
68 | .spilledvgprs 0
69 | .max_flat_work_group_size 32
70 | .arg dataset, "uchar*", 8, 0, globalbuf, u8, global, default const
71 | .arg scratchpad, "uchar*", 8, 8, globalbuf, u8, global, default
72 | .arg registers, "ulong*", 8, 16, globalbuf, u64, global, default
73 | .arg rounding_modes, "uint*", 8, 24, globalbuf, u32, global, default
74 | .arg programs, "uint*", 8, 32, globalbuf, u32, global, default
75 | .arg batch_size, "uint", 4, 40, value, u32
76 | .arg rx_parameters, "uint", 4, 44, value, u32
77 | .arg , "", 8, 48, gox, i64
78 | .arg , "", 8, 56, goy, i64
79 | .arg , "", 8, 64, goz, i64
80 | .arg , "", 8, 72, none, i8
81 | .arg , "", 8, 80, none, i8
82 | .arg , "", 8, 88, none, i8
83 | .arg , "", 8, 96, multigridsyncarg, i8
84 | .text
85 | randomx_run:
86 | # write back the scalar data cache, wait for outstanding memory operations, then invalidate the instruction cache (presumably so freshly written JIT code is fetched - confirm)
87 | s_dcache_wb
88 | s_waitcnt vmcnt(0) & lgkmcnt(0)
89 | s_waitcnt_vscnt null, 0x0
90 | s_icache_inv
91 | s_branch begin
92 |
93 | # pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data
94 | # s6 contains group id
95 | # v0 contains local id
96 | begin:
97 | # s[0:1] - pointer to registers
98 | # s[2:3] - pointer to rounding modes
99 | s_load_dwordx4 s[0:3], s[4:5], 0x10
100 |
101 | # s[8:9] - group_id*group_size (group id << 5, i.e. 32 work-items per group)
102 | s_mov_b32 s9, 0
103 | s_lshl_b32 s8, s6, 5
104 |
105 | # v0 - local id (sub)
106 | # v39 - byte offset of R[sub] in LDS (sub * 8)
107 | v_lshlrev_b32 v39, 3, v0
108 |
109 | s_mov_b32 s12, s7
110 |
111 | # vcc_lo = "if (sub < 8)"
112 | v_cmp_gt_u32 vcc_lo, 8, v0
113 |
114 | s_waitcnt lgkmcnt(0)
115 |
116 | # load rounding mode: s[64:65] = &rounding_modes[group_id], s66 = loaded value (written to MODE bits 2..3), s67 = 0
117 | s_lshl_b32 s16, s6, 2
118 | s_add_u32 s64, s2, s16
119 | s_addc_u32 s65, s3, 0
120 | v_mov_b32 v1, 0
121 | global_load_dword v1, v1, s[64:65]
122 | s_waitcnt vmcnt(0)
123 | v_readlane_b32 s66, v1, 0
124 | s_setreg_b32 hwreg(mode, 2, 2), s66
125 | s_mov_b32 s67, 0
126 |
127 | # ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub];
128 | s_lshl_b64 s[2:3], s[8:9], 3
129 | s_mov_b32 s32, s12
130 | s_add_u32 s0, s0, s2
131 | s_addc_u32 s1, s1, s3
132 | v_add_co_u32 v1, s0, s0, v39
133 | v_add_co_ci_u32 v2, s0, s1, 0, s0
134 | global_load_dwordx2 v[4:5], v[1:2], off
135 | s_waitcnt vmcnt(0)
136 | ds_write_b64 v39, v[4:5]
137 | s_waitcnt vmcnt(0) & lgkmcnt(0)
138 | s_waitcnt_vscnt null, 0x0
139 |
140 | # "if (sub >= 8) return"
141 | s_and_saveexec_b32 s0, vcc_lo
142 | s_cbranch_execz program_end
143 |
144 | # s[8:9] - pointer to dataset
145 | # s[10:11] - pointer to scratchpads
146 | # s[0:1] - pointer to programs
147 | s_load_dwordx4 s[8:11], s[4:5], 0x0
148 | s_load_dwordx2 s[0:1], s[4:5], 0x20
149 |
150 | # rx_parameters
151 | s_load_dword s20, s[4:5], 0x2c
152 | # LDS reads below: v[8:9] = R[sub], v[25:26] = R[16], v[27:28] = R[17], v[20:21] = R[18], v[22:23] = R[20], v[35:36] = R[21], v11 = low dword of R[19]
153 | v_mov_b32 v5, 0
154 | v_mov_b32 v10, 0
155 | s_waitcnt_vscnt null, 0x0
156 | ds_read_b64 v[8:9], v39
157 | v_cmp_gt_u32 vcc_lo, 4, v0
158 | v_lshlrev_b32 v0, 3, v0
159 | ds_read2_b64 v[25:28], v5 offset0:16 offset1:17
160 | ds_read_b32 v11, v5 offset:152
161 | ds_read_b64 v[35:36], v5 offset:168
162 | ds_read2_b64 v[20:23], v5 offset0:18 offset1:20
163 | v_cndmask_b32 v4, 0xffffff, -1, vcc_lo
164 | v_add_nc_u32 v5, v39, v0
165 | s_waitcnt lgkmcnt(0)
166 | v_mov_b32 v13, s11
167 | v_mov_b32 v7, s1
168 | v_mov_b32 v6, s0
169 |
170 | # Scratchpad L1 size
171 | s_bfe_u32 s21, s20, 0x050000
172 | s_lshl_b32 s21, 1, s21
173 |
174 | # Scratchpad L2 size
175 | s_bfe_u32 s22, s20, 0x050005
176 | s_lshl_b32 s22, 1, s22
177 |
178 | # Scratchpad L3 size
179 | s_bfe_u32 s0, s20, 0x05000A
180 | s_lshl_b32 s23, 1, s0
181 |
182 | # program iterations
183 | s_bfe_u32 s24, s20, 0x04000F
184 | s_lshl_b32 s24, 1, s24
185 |
186 | v_mov_b32 v12, s10
187 | v_mad_u64_u32 v[6:7], s2, 10048, s6, v[6:7]
188 |
189 | # s[4:5] - pointer to current program
190 | v_readlane_b32 s4, v6, 0
191 | v_readlane_b32 s5, v7, 0
192 |
193 | s_lshl_b32 s2, 1, s0
194 | v_add_co_u32 v14, s0, s8, v11
195 | v_cndmask_b32 v34, v36, 0, vcc_lo
196 | v_cndmask_b32 v24, v23, 0, vcc_lo
197 | v_cndmask_b32 v3, v22, 0, vcc_lo
198 | s_add_i32 s3, s2, 64
199 | v_add_co_ci_u32 v29, s0, s9, v10, s0
200 | v_cndmask_b32 v35, v35, 0, vcc_lo
201 | v_add_co_u32 v22, vcc_lo, v14, v0
202 |
203 | # v[12:13] - pointer to current scratchpad
204 | v_mad_u64_u32 v[12:13], s2, s3, s6, v[12:13]
205 | v_mov_b32 v10, v26
206 | v_mov_b32 v11, v25
207 | v_lshlrev_b32 v36, 3, v27
208 | v_lshlrev_b32 v37, 3, v28
209 | v_lshlrev_b32 v20, 3, v20
210 | v_lshlrev_b32 v21, 3, v21
211 | v_add_co_ci_u32 v23, vcc_lo, 0, v29, vcc_lo
212 |
213 | # rename registers
214 | # v6 - byte offset of R[sub] in LDS (copied from v39, which is reused as the L2 mask below)
215 | v_mov_b32 v6, v39
216 |
217 | # loop counter (iterations - 1; main_loop exits when the decrement borrows)
218 | s_sub_u32 s2, s24, 1
219 |
220 | # used in IXOR_R instruction
221 | s_mov_b32 s63, -1
222 |
223 | # used in CBRANCH instruction
224 | s_mov_b32 s70, (0xFF << 8)
225 | s_mov_b32 s71, (0xFF << 9)
226 | s_mov_b32 s72, (0xFF << 10)
227 | s_mov_b32 s73, (0xFF << 11)
228 | s_mov_b32 s74, (0xFF << 12)
229 | s_mov_b32 s75, (0xFF << 13)
230 | s_mov_b32 s76, (0xFF << 14)
231 | s_mov_b32 s77, (0xFF << 15)
232 | s_mov_b32 s78, (0xFF << 16)
233 | s_mov_b32 s79, (0xFF << 17)
234 | s_mov_b32 s80, (0xFF << 18)
235 | s_mov_b32 s81, (0xFF << 19)
236 | s_mov_b32 s82, (0xFF << 20)
237 | s_mov_b32 s83, (0xFF << 21)
238 | s_mov_b32 s84, (0xFF << 22)
239 | s_mov_b32 s85, (0xFF << 23)
240 |
241 | # ScratchpadL3Mask64
242 | s_sub_u32 s86, s23, 64
243 |
244 | # Address masks (size - 8) for the L1 / L2 / L3 scratchpad levels: v38 / v39 / v50
245 | v_sub_nc_u32 v38, s21, 8
246 | v_sub_nc_u32 v39, s22, 8
247 | v_sub_nc_u32 v50, s23, 8
248 |
249 | # mask for FSCAL_R
250 | v_mov_b32 v51, 0x80F00000
251 |
252 | # load scratchpad base address
253 | v_readlane_b32 s0, v12, 0
254 | v_readlane_b32 s1, v13, 0
255 |
256 | # v41, v44 = 0
257 | v_mov_b32 v41, 0
258 | v_mov_b32 v44, 0
259 |
260 | # v41 = 0 on lane 0, set it to 8 on lane 1
261 | # v44 = 0 on lane 0, set it to 4 on lane 1
262 | s_mov_b64 exec, 2
263 | v_mov_b32 v41, 8
264 | v_mov_b32 v44, 4
265 |
266 | # load group A registers
267 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
268 | s_mov_b64 exec, 3
269 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26
270 | ds_read2_b64 v[56:59], v41 offset0:28 offset1:30
271 |
272 | # xmantissaMask
273 | v_mov_b32 v77, (1 << 24) - 1
274 |
275 | # xexponentMask
276 | ds_read_b64 v[78:79], v41 offset:160
277 |
278 | # Restore execution mask
279 | s_mov_b64 exec, 255
280 |
281 | # sign mask (used in FSQRT_R)
282 | v_mov_b32 v82, 0x80000000
283 |
284 | # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
285 | s_mov_b32 s68, 256
286 | s_mov_b32 s69, 0
287 |
288 | # High 32 bits of "1.0" constant (used in FDIV_M)
289 | v_mov_b32 v83, (1023 << 20)
290 |
291 | # Used to multiply FP64 values by 0.5
292 | v_mov_b32 v84, (1 << 20)
293 |
294 | s_getpc_b64 s[14:15]
295 | cur_addr:
296 | # s[14:15] holds the address of cur_addr (from the preceding s_getpc_b64); each subroutine address is that base plus a label offset
297 | # get addresses of FSQRT_R subroutines
298 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr
299 | s_addc_u32 s41, s15, 0
300 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr
301 | s_addc_u32 s43, s15, 0
302 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr
303 | s_addc_u32 s45, s15, 0
304 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr
305 | s_addc_u32 s47, s15, 0
306 |
307 | # get addresses of FDIV_M subroutines
308 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr
309 | s_addc_u32 s49, s15, 0
310 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr
311 | s_addc_u32 s51, s15, 0
312 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr
313 | s_addc_u32 s53, s15, 0
314 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr
315 | s_addc_u32 s55, s15, 0
316 |
317 | # get address for ISMULH_R subroutine
318 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr
319 | s_addc_u32 s57, s15, 0
320 |
321 | # get address for IMULH_R subroutine
322 | s_add_u32 s58, s14, imulh_r_sub - cur_addr
323 | s_addc_u32 s59, s15, 0
324 |
325 | /*
326 | used: v0-v6, v8-v37
327 | not used: v7 (NOTE(review): this list looks out of date - the setup code also writes v38, v39, v41, v44, v50-v59, v77-v79 and v82-v84)
328 | */
329 | main_loop:
330 | s_waitcnt_vscnt null, 0x0
331 |
332 | # v[27:28] = R[readReg1] (v37 = readReg1 * 8)
333 | # v[29:30] = R[readReg0] (v36 = readReg0 * 8)
334 | ds_read_b64 v[27:28], v37
335 | ds_read_b64 v[29:30], v36
336 | s_waitcnt lgkmcnt(0)
337 |
338 | # R[readReg0] ^ R[readReg1] (high 32 bits)
339 | v_xor_b32 v28, v28, v30
340 |
341 | # spAddr1
342 | v_xor_b32 v25, v28, v25
343 | v_and_b32 v25, s86, v25
344 | v_add_nc_u32 v25, v25, v0
345 |
346 | v_add_co_u32 v16, vcc_lo, s0, v25
347 |
348 | # R[readReg0] ^ R[readReg1] (low 32 bits)
349 | v_xor_b32 v25, v27, v29
350 |
351 | v_mov_b32 v29, v11
352 | v_add_co_ci_u32 v17, vcc_lo, 0, s1, vcc_lo
353 | v_xor_b32 v25, v25, v26
354 |
355 | # load from spAddr1
356 | global_load_dwordx2 v[27:28], v[16:17], off
357 |
358 | # spAddr0
359 | v_and_b32 v25, s86, v25
360 | v_add_nc_u32 v25, v25, v0
361 |
362 | v_add_co_u32 v31, vcc_lo, s0, v25
363 | v_add_co_ci_u32 v32, vcc_lo, 0, s1, vcc_lo
364 | v_add_co_u32 v29, vcc_lo, v22, v29
365 |
366 | # load from spAddr0
367 | global_load_dwordx2 v[25:26], v[31:32], off
368 |
369 | v_add_co_ci_u32 v30, vcc_lo, 0, v23, vcc_lo
370 | v_mov_b32 v33, v11
371 | s_and_b32 vcc_lo, exec_lo, vcc_lo
372 | s_waitcnt vmcnt(1)
373 | v_cvt_f64_i32 v[14:15], v28
374 | v_cvt_f64_i32 v[12:13], v27
375 | v_or_b32 v14, v14, v35
376 | s_waitcnt vmcnt(0)
377 |
378 | # R[sub] ^= *p0;
379 | v_xor_b32 v8, v25, v8
380 | v_xor_b32 v9, v26, v9
381 |
382 | v_and_b32 v26, v4, v15
383 |
384 | v_and_b32 v19, v4, v13
385 | v_or_b32 v15, v26, v34
386 | v_or_b32 v18, v12, v3
387 | v_mov_b32 v26, 0
388 | v_or_b32 v19, v19, v24
389 | v_mov_b32 v25, v26
390 | ds_write2_b64 v5, v[18:19], v[14:15] offset0:8 offset1:9
391 |
392 | # load from dataset
393 | global_load_dwordx2 v[18:19], v[29:30], off
394 |
395 | # load group F,E registers
396 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
397 | s_mov_b64 exec, 3
398 | s_waitcnt lgkmcnt(0)
399 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10
400 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14
401 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18
402 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22
403 |
404 | # load VM integer registers
405 | v_readlane_b32 s16, v8, 0
406 | v_readlane_b32 s17, v9, 0
407 | v_readlane_b32 s18, v8, 1
408 | v_readlane_b32 s19, v9, 1
409 | v_readlane_b32 s20, v8, 2
410 | v_readlane_b32 s21, v9, 2
411 | v_readlane_b32 s22, v8, 3
412 | v_readlane_b32 s23, v9, 3
413 | v_readlane_b32 s24, v8, 4
414 | v_readlane_b32 s25, v9, 4
415 | v_readlane_b32 s26, v8, 5
416 | v_readlane_b32 s27, v9, 5
417 | v_readlane_b32 s28, v8, 6
418 | v_readlane_b32 s29, v9, 6
419 | v_readlane_b32 s30, v8, 7
420 | v_readlane_b32 s31, v9, 7
421 |
422 | s_waitcnt lgkmcnt(0)
423 |
424 | # Use only first 2 lanes for the program
425 | s_mov_b64 exec, 3
426 |
427 | # call JIT code (jumps to s[4:5]; the return address is saved in s[12:13])
428 | s_swappc_b64 s[12:13], s[4:5]
429 |
430 | # Write out group F,E registers
431 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1
432 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
433 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
434 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
435 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22
436 |
437 | # store VM integer registers
438 | v_writelane_b32 v8, s16, 0
439 | v_writelane_b32 v9, s17, 0
440 | v_writelane_b32 v8, s18, 1
441 | v_writelane_b32 v9, s19, 1
442 | v_writelane_b32 v8, s20, 2
443 | v_writelane_b32 v9, s21, 2
444 | v_writelane_b32 v8, s22, 3
445 | v_writelane_b32 v9, s23, 3
446 | v_writelane_b32 v8, s24, 4
447 | v_writelane_b32 v9, s25, 4
448 | v_writelane_b32 v8, s26, 5
449 | v_writelane_b32 v9, s27, 5
450 | v_writelane_b32 v8, s28, 6
451 | v_writelane_b32 v9, s29, 6
452 | v_writelane_b32 v8, s30, 7
453 | v_writelane_b32 v9, s31, 7
454 |
455 | # Turn back on 8 execution lanes
456 | s_mov_b64 exec, 255
457 |
458 | # Write out VM integer registers
459 | ds_write_b64 v6, v[8:9]
460 | s_waitcnt lgkmcnt(0)
461 |
462 | # R[readReg2], R[readReg3]
463 | ds_read_b32 v11, v21
464 | ds_read_b32 v27, v20
465 | s_waitcnt lgkmcnt(0)
466 |
467 | # mx ^= R[readReg2] ^ R[readReg3];
468 | v_xor_b32 v11, v11, v27
469 | v_xor_b32 v10, v10, v11
470 |
471 | # v[27:28] = R[sub]
472 | # v[29:30] = F[sub]
473 | ds_read2_b64 v[27:30], v6 offset1:8
474 |
475 | # mx &= CacheLineAlignMask;
476 | v_and_b32 v11, 0x7fffffc0, v10
477 | v_mov_b32 v10, v33
478 | s_waitcnt lgkmcnt(0)
479 |
480 | # const ulong next_r = R[sub] ^ data;
481 | s_waitcnt lgkmcnt(0)
482 | v_xor_b32 v8, v27, v18
483 | v_xor_b32 v9, v28, v19
484 |
485 | # *p1 = next_r;
486 | global_store_dwordx2 v[16:17], v[8:9], off
487 |
488 | # v[27:28] = E[sub]
489 | ds_read_b64 v[27:28], v6 offset:128
490 |
491 | # R[sub] = next_r;
492 | ds_write_b64 v6, v[8:9]
493 | s_waitcnt lgkmcnt(1)
494 |
495 | # *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]);
496 | v_xor_b32 v29, v27, v29
497 | v_xor_b32 v30, v28, v30
498 | global_store_dwordx2 v[31:32], v[29:30], off
499 | # s2 started at iterations - 1: keep looping until the decrement borrows (SCC = 1)
500 | s_sub_u32 s2, s2, 1
501 | s_cbranch_scc0 main_loop
502 | main_loop_end:
503 | # v[1:2] = &registers[sub]: store R[sub], F[sub] ^ E[sub] and E[sub] to registers[sub], registers[sub + 8] and registers[sub + 16]
504 | global_store_dwordx2 v[1:2], v[8:9], off
505 | global_store_dwordx2 v[1:2], v[29:30], off inst_offset:64
506 | global_store_dwordx2 v[1:2], v[27:28], off inst_offset:128
507 |
508 | # store rounding mode (s66) back to rounding_modes[group_id] (address in s[64:65])
509 | v_mov_b32 v0, 0
510 | v_mov_b32 v1, s66
511 | global_store_dword v0, v1, s[64:65]
512 |
513 | program_end:
514 | s_endpgm
515 | # fsqrt_r_sub0: v[68:69] = sqrt(v[68:69]) via v_rsq_f64 plus FMA refinement; intermediate steps use rounding mode s67, only the final FMA uses s66; the result is stored only in lanes that pass the v_cmpx_class_f64 test; clobbers v28-v29, v42-v43, v46-v49; returns to s[60:61] with exec = 3
516 | fsqrt_r_sub0:
517 | s_setreg_b32 hwreg(mode, 2, 2), s67
518 | v_rsq_f64 v[28:29], v[68:69]
519 |
520 | # Improve initial approximation (can be skipped)
521 | #v_mul_f64 v[42:43], v[28:29], v[68:69]
522 | #v_mul_f64 v[48:49], v[28:29], -0.5
523 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
524 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
525 |
526 | v_mul_f64 v[42:43], v[28:29], v[68:69]
527 | v_mov_b32 v48, v28
528 | v_sub_nc_u32 v49, v29, v84
529 | v_mov_b32 v46, v28
530 | v_xor_b32 v47, v49, v82
531 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
532 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
533 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
534 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]
535 | s_setreg_b32 hwreg(mode, 2, 2), s66
536 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
537 | v_cmpx_class_f64 v[68:69], s[68:69]
538 | v_mov_b32 v68, v42
539 | v_mov_b32 v69, v43
540 | s_mov_b64 exec, 3
541 | s_setpc_b64 s[60:61]
542 | # fsqrt_r_sub1: same sequence as fsqrt_r_sub0, operating on v[70:71]; clobbers v28-v29, v42-v43, v46-v49; returns to s[60:61] with exec = 3
543 | fsqrt_r_sub1:
544 | s_setreg_b32 hwreg(mode, 2, 2), s67
545 | v_rsq_f64 v[28:29], v[70:71]
546 |
547 | # Improve initial approximation (can be skipped)
548 | #v_mul_f64 v[42:43], v[28:29], v[70:71]
549 | #v_mul_f64 v[48:49], v[28:29], -0.5
550 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
551 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
552 |
553 | v_mul_f64 v[42:43], v[28:29], v[70:71]
554 | v_mov_b32 v48, v28
555 | v_sub_nc_u32 v49, v29, v84
556 | v_mov_b32 v46, v28
557 | v_xor_b32 v47, v49, v82
558 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
559 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
560 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
561 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]
562 | s_setreg_b32 hwreg(mode, 2, 2), s66
563 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
564 | v_cmpx_class_f64 v[70:71], s[68:69]
565 | v_mov_b32 v70, v42
566 | v_mov_b32 v71, v43
567 | s_mov_b64 exec, 3
568 | s_setpc_b64 s[60:61]
569 | # fsqrt_r_sub2: same sequence as fsqrt_r_sub0, operating on v[72:73]; clobbers v28-v29, v42-v43, v46-v49; returns to s[60:61] with exec = 3
570 | fsqrt_r_sub2:
571 | s_setreg_b32 hwreg(mode, 2, 2), s67
572 | v_rsq_f64 v[28:29], v[72:73]
573 |
574 | # Improve initial approximation (can be skipped)
575 | #v_mul_f64 v[42:43], v[28:29], v[72:73]
576 | #v_mul_f64 v[48:49], v[28:29], -0.5
577 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
578 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
579 |
580 | v_mul_f64 v[42:43], v[28:29], v[72:73]
581 | v_mov_b32 v48, v28
582 | v_sub_nc_u32 v49, v29, v84
583 | v_mov_b32 v46, v28
584 | v_xor_b32 v47, v49, v82
585 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
586 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
587 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
588 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]
589 | s_setreg_b32 hwreg(mode, 2, 2), s66
590 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
591 | v_cmpx_class_f64 v[72:73], s[68:69]
592 | v_mov_b32 v72, v42
593 | v_mov_b32 v73, v43
594 | s_mov_b64 exec, 3
595 | s_setpc_b64 s[60:61]
596 | # fsqrt_r_sub3: same sequence as fsqrt_r_sub0, operating on v[74:75]; clobbers v28-v29, v42-v43, v46-v49; returns to s[60:61] with exec = 3
597 | fsqrt_r_sub3:
598 | s_setreg_b32 hwreg(mode, 2, 2), s67
599 | v_rsq_f64 v[28:29], v[74:75]
600 |
601 | # Improve initial approximation (can be skipped)
602 | #v_mul_f64 v[42:43], v[28:29], v[74:75]
603 | #v_mul_f64 v[48:49], v[28:29], -0.5
604 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
605 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
606 |
607 | v_mul_f64 v[42:43], v[28:29], v[74:75]
608 | v_mov_b32 v48, v28
609 | v_sub_nc_u32 v49, v29, v84
610 | v_mov_b32 v46, v28
611 | v_xor_b32 v47, v49, v82
612 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
613 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
614 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
615 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]
616 | s_setreg_b32 hwreg(mode, 2, 2), s66
617 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
618 | v_cmpx_class_f64 v[74:75], s[68:69]
619 | v_mov_b32 v74, v42
620 | v_mov_b32 v75, v43
621 | s_mov_b64 exec, 3
622 | s_setpc_b64 s[60:61]
623 | # fdiv_m_sub0: v[68:69] = v[68:69] / d, where d is the caller-supplied v[28:29] after v28 |= v78 and v29 = (v29 & v77) | v79; v_rcp_f64 plus FMA refinement under rounding mode s67, final FMA and v_div_fixup_f64 under s66; lanes where dividend == d get exactly 1.0; clobbers v28-v29, v42-v43, v48-v49, v80-v81; returns to s[60:61] with exec = 3
624 | fdiv_m_sub0:
625 | v_or_b32 v28, v28, v78
626 | v_and_or_b32 v29, v29, v77, v79
627 | s_setreg_b32 hwreg(mode, 2, 2), s67
628 | v_rcp_f64 v[48:49], v[28:29]
629 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
630 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
631 | v_mul_f64 v[80:81], v[68:69], v[48:49]
632 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]
633 | s_setreg_b32 hwreg(mode, 2, 2), s66
634 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
635 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
636 | v_cmpx_eq_f64 v[68:69], v[28:29]
637 | v_mov_b32 v80, 0
638 | v_mov_b32 v81, v83
639 | s_mov_b64 exec, 3
640 | v_mov_b32 v68, v80
641 | v_mov_b32 v69, v81
642 | s_setpc_b64 s[60:61]
643 | # fdiv_m_sub1: same sequence as fdiv_m_sub0, dividing v[70:71] by the masked divisor in v[28:29]; clobbers v28-v29, v42-v43, v48-v49, v80-v81; returns to s[60:61] with exec = 3
644 | fdiv_m_sub1:
645 | v_or_b32 v28, v28, v78
646 | v_and_or_b32 v29, v29, v77, v79
647 | s_setreg_b32 hwreg(mode, 2, 2), s67
648 | v_rcp_f64 v[48:49], v[28:29]
649 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
650 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
651 | v_mul_f64 v[80:81], v[70:71], v[48:49]
652 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]
653 | s_setreg_b32 hwreg(mode, 2, 2), s66
654 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
655 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
656 | v_cmpx_eq_f64 v[70:71], v[28:29]
657 | v_mov_b32 v80, 0
658 | v_mov_b32 v81, v83
659 | s_mov_b64 exec, 3
660 | v_mov_b32 v70, v80
661 | v_mov_b32 v71, v81
662 | s_setpc_b64 s[60:61]
663 | # fdiv_m_sub2: same sequence as fdiv_m_sub0, dividing v[72:73] by the masked divisor in v[28:29]; clobbers v28-v29, v42-v43, v48-v49, v80-v81; returns to s[60:61] with exec = 3
664 | fdiv_m_sub2:
665 | v_or_b32 v28, v28, v78
666 | v_and_or_b32 v29, v29, v77, v79
667 | s_setreg_b32 hwreg(mode, 2, 2), s67
668 | v_rcp_f64 v[48:49], v[28:29]
669 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
670 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
671 | v_mul_f64 v[80:81], v[72:73], v[48:49]
672 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]
673 | s_setreg_b32 hwreg(mode, 2, 2), s66
674 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
675 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
676 | v_cmpx_eq_f64 v[72:73], v[28:29]
677 | v_mov_b32 v80, 0
678 | v_mov_b32 v81, v83
679 | s_mov_b64 exec, 3
680 | v_mov_b32 v72, v80
681 | v_mov_b32 v73, v81
682 | s_setpc_b64 s[60:61]
683 | # fdiv_m_sub3: same sequence as fdiv_m_sub0, dividing v[74:75] by the masked divisor in v[28:29]; clobbers v28-v29, v42-v43, v48-v49, v80-v81; returns to s[60:61] with exec = 3
684 | fdiv_m_sub3:
685 | v_or_b32 v28, v28, v78
686 | v_and_or_b32 v29, v29, v77, v79
687 | s_setreg_b32 hwreg(mode, 2, 2), s67
688 | v_rcp_f64 v[48:49], v[28:29]
689 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
690 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
691 | v_mul_f64 v[80:81], v[74:75], v[48:49]
692 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]
693 | s_setreg_b32 hwreg(mode, 2, 2), s66
694 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
695 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
696 | v_cmpx_eq_f64 v[74:75], v[28:29]
697 | v_mov_b32 v80, 0
698 | v_mov_b32 v81, v83
699 | s_mov_b64 exec, 3
700 | v_mov_b32 v74, v80
701 | v_mov_b32 v75, v81
702 | s_setpc_b64 s[60:61]
703 | # ismulh_r_sub: s[14:15] = high 64 bits of the signed product s[14:15] * s[38:39]; lane 0 computes the unsigned high product (v41 and v44 must be 0 there), then s[38:39] is subtracted if s15 < 0 and s[14:15] if s39 < 0; clobbers s32-s35, v40, v42-v43, v45-v47, vcc_lo; returns to s[60:61] with exec = 3
704 | ismulh_r_sub:
705 | s_mov_b64 exec, 1
706 | v_mov_b32 v45, s14
707 | v_mul_hi_u32 v40, s38, v45
708 | v_mov_b32 v47, s15
709 | v_mad_u64_u32 v[42:43], s32, s38, v47, v[40:41]
710 | v_mov_b32 v40, v42
711 | v_mad_u64_u32 v[45:46], s32, s39, v45, v[40:41]
712 | v_mad_u64_u32 v[42:43], s32, s39, v47, v[43:44]
713 | v_add_co_u32 v42, vcc_lo, v42, v46
714 | v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
715 | v_readlane_b32 s32, v42, 0
716 | v_readlane_b32 s33, v43, 0
717 | s_cmp_lt_i32 s15, 0
718 | s_cselect_b64 s[34:35], s[38:39], 0
719 | s_sub_u32 s32, s32, s34
720 | s_subb_u32 s33, s33, s35
721 | s_cmp_lt_i32 s39, 0
722 | s_cselect_b64 s[34:35], s[14:15], 0
723 | s_sub_u32 s14, s32, s34
724 | s_subb_u32 s15, s33, s35
725 | s_mov_b64 exec, 3
726 | s_setpc_b64 s[60:61]
727 | # imulh_r_sub: s[14:15] = high 64 bits of the unsigned product s[14:15] * s[38:39], computed on lane 0 (v41 and v44 must be 0 there); clobbers s32, v40, v42-v43, v45-v47, vcc_lo; returns to s[60:61] with exec = 3
728 | imulh_r_sub:
729 | s_mov_b64 exec, 1
730 | v_mov_b32 v45, s38
731 | v_mul_hi_u32 v40, s14, v45
732 | v_mov_b32 v47, s39
733 | v_mad_u64_u32 v[42:43], s32, s14, v47, v[40:41]
734 | v_mov_b32 v40, v42
735 | v_mad_u64_u32 v[45:46], s32, s15, v45, v[40:41]
736 | v_mad_u64_u32 v[42:43], s32, s15, v47, v[43:44]
737 | v_add_co_u32 v42, vcc_lo, v42, v46
738 | v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
739 | v_readlane_b32 s14, v42, 0
740 | v_readlane_b32 s15, v43, 0
741 | s_mov_b64 exec, 3
742 | s_setpc_b64 s[60:61]
743 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/GCNASM/randomx_run_gfx803.asm:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | .amdcl2
21 | .gpu GFX803
22 | .64bit
23 | .arch_minor 0
24 | .arch_stepping 0
25 | .driver_version 203603
26 | .kernel randomx_run
27 | .config
28 | .dims x
29 | .cws 64, 1, 1
30 | .sgprsnum 96
31 | # 6 waves per SIMD: 37-40 VGPRs
32 | # 5 waves per SIMD: 41-48 VGPRs
33 | # 4 waves per SIMD: 49-64 VGPRs
34 | # 3 waves per SIMD: 65-84 VGPRs
35 | # 2 waves per SIMD: 85-128 VGPRs
36 | # 1 wave per SIMD: 129-256 VGPRs
37 | .vgprsnum 128
38 | .localsize 256
39 | .floatmode 0xc0
40 | .pgmrsrc1 0x00ac035f
41 | .pgmrsrc2 0x0000008c
42 | .dx10clamp
43 | .ieeemode
44 | .useargs
45 | .priority 0
46 | .arg _.global_offset_0, "size_t", long
47 | .arg _.global_offset_1, "size_t", long
48 | .arg _.global_offset_2, "size_t", long
49 | .arg _.printf_buffer, "size_t", void*, global, , rdonly
50 | .arg _.vqueue_pointer, "size_t", long
51 | .arg _.aqlwrap_pointer, "size_t", long
52 | .arg dataset, "uchar*", uchar*, global, const, rdonly
53 | .arg scratchpad, "uchar*", uchar*, global,
54 | .arg registers, "ulong*", ulong*, global,
55 | .arg rounding_modes, "uint*", uint*, global,
56 | .arg programs, "uint*", uint*, global,
57 | .arg batch_size, "uint", uint
58 | .arg rx_parameters, "uint", uint
59 | .text
60 | s_mov_b32 m0, 0x10000
61 | s_dcache_wb
62 | s_waitcnt vmcnt(0) & lgkmcnt(0)
63 | s_icache_inv
64 | s_branch begin
65 |
66 | # pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data
67 | # s6 contains group id (copied into s8 at "begin" below)
68 | # v0 contains local id
69 | begin:  # one-time setup: load kernel arguments, copy the VM register file into LDS and precompute the constants used by main_loop
70 | s_mov_b32 s8, s6  # group id arrives in s6; keep it in s8, which the code below uses
71 | v_lshlrev_b32 v1, 6, s8
72 | v_add_u32 v1, vcc, v1, v0  # v1 = group id * 64 + local id
73 | s_load_dwordx2 s[0:1], s[4:5], 0x0  # argument at offset 0x0 (_.global_offset_0 is the first .arg)
74 | s_load_dwordx2 s[2:3], s[4:5], 0x40  # argument at offset 0x40 (NOTE(review): presumably "registers" given the .arg order - confirm)
75 | s_load_dwordx2 s[64:65], s[4:5], 0x48  # argument at offset 0x48 (NOTE(review): presumably "rounding_modes" - confirm)
76 | s_waitcnt lgkmcnt(0)
77 |
78 | # load rounding mode
79 | s_lshl_b32 s16, s8, 2  # byte offset = group id * 4
80 | s_add_u32 s64, s64, s16
81 | s_addc_u32 s65, s65, 0  # s[64:65] = address of this group's rounding mode (reused for the store at the end)
82 | v_mov_b32 v8, s64
83 | v_mov_b32 v9, s65
84 | flat_load_dword v8, v[8:9]
85 | s_waitcnt vmcnt(0)
86 | v_readlane_b32 s66, v8, 0  # s66 = rounding mode value of this group
87 | s_setreg_b32 hwreg(mode, 2, 2), s66  # write it into the 2-bit field at bit offset 2 of the MODE register
88 | s_mov_b32 s67, 0  # s67 = 0: value the FSQRT_R/FDIV_M subroutines temporarily write to the same MODE field
89 |
90 | # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
91 | s_mov_b32 s68, 256
92 | s_mov_b32 s69, 0
93 |
94 | v_add_u32 v1, vcc, s0, v1  # add the low dword of the argument loaded from offset 0x0
95 | v_lshrrev_b32 v2, 6, v1  # v2 = v1 / 64 (index of the 64-lane group)
96 | v_lshlrev_b32 v3, 5, v2
97 | v_and_b32 v1, 63, v1  # v1 = v1 % 64 (called "sub" in the main_loop comments)
98 | v_mov_b32 v4, 0
99 | v_lshlrev_b64 v[3:4], 3, v[3:4]  # v[3:4] = v2 * 256 (byte offset of this group's 256-byte register block)
100 | v_lshlrev_b32 v5, 4, v1  # v5 = sub * 16
101 | v_add_u32 v3, vcc, s2, v3
102 | v_mov_b32 v6, s3
103 | v_addc_u32 v4, vcc, v6, v4, vcc  # v[3:4] = s[2:3] + v2 * 256
104 | v_lshlrev_b32 v41, 2, v1  # v41 = sub * 4
105 | v_add_u32 v6, vcc, v3, v41
106 | v_addc_u32 v7, vcc, v4, 0, vcc
107 | flat_load_dword v6, v[6:7]  # each lane loads one dword of the 256-byte block
108 | v_mov_b32 v0, 0  # v0 = 0 is used as the LDS base address below
109 | s_waitcnt vmcnt(0)
110 | ds_write_b32 v41, v6  # copy the dword into LDS at byte sub * 4
111 | s_waitcnt lgkmcnt(0)
112 | s_mov_b64 s[0:1], exec  # copy of exec; s[0:1] is overwritten further down by the scratchpad base address
113 | v_cmpx_le_u32 s[2:3], v1, 7  # keep only lanes with sub <= 7 active
114 | s_cbranch_execz program_end  # exit if no lane remains active
115 |
116 | # rx_parameters
117 | s_load_dword s20, s[4:5], 0x5c
118 | s_waitcnt lgkmcnt(0)
119 |
120 | # Scratchpad L1 size
121 | s_bfe_u32 s21, s20, 0x050000  # 5-bit field at bit 0 of rx_parameters
122 | s_lshl_b32 s21, 1, s21  # s21 = 1 << field
123 |
124 | # Scratchpad L2 size
125 | s_bfe_u32 s22, s20, 0x050005  # 5-bit field at bit 5
126 | s_lshl_b32 s22, 1, s22  # s22 = 1 << field
127 |
128 | # Scratchpad L3 size
129 | s_bfe_u32 s23, s20, 0x05000A  # 5-bit field at bit 10
130 | s_lshl_b32 s23, 1, s23  # s23 = 1 << field
131 |
132 | # program iterations
133 | s_bfe_u32 s24, s20, 0x04000F  # 4-bit field at bit 15
134 | s_lshl_b32 s24, 1, s24  # s24 = 1 << field
135 |
136 | # Base address for scratchpads
137 | s_add_u32 s2, s23, 64  # per-group stride = L3 size + 64 bytes
138 | v_mul_hi_u32 v20, v2, s2
139 | v_mul_lo_u32 v2, v2, s2  # v20:v2 = 64-bit product group index * stride
140 |
141 | # v41, v44 = 0
142 | v_mov_b32 v41, 0
143 | v_mov_b32 v44, 0
144 |
145 | ds_read_b32 v6, v0 offset:152  # v6 = dword at LDS byte 152 (added to s[8:9] further down)
146 | v_cmp_lt_u32 s[2:3], v1, 4  # s[2:3] = mask of lanes with sub < 4
147 | ds_read2_b64 v[34:37], v0 offset0:18 offset1:16  # v[34:35] = LDS qword 18, v[36:37] = LDS qword 16
148 | ds_read_b64 v[11:12], v0 offset:136  # v[11:12] = LDS qword 17
149 | s_movk_i32 s9, 0x0  # zero s9 so that s[8:9] is the group id as a 64-bit value
150 | s_mov_b64 s[6:7], exec  # save exec
151 | s_andn2_b64 exec, s[6:7], s[2:3]  # activate only lanes with sub >= 4
152 | ds_read_b64 v[13:14], v0 offset:160  # those lanes get the qword at LDS byte 160
153 | s_andn2_b64 exec, s[6:7], exec  # switch to the remaining lanes (sub < 4)
154 | v_mov_b32 v13, 0
155 | v_mov_b32 v14, 0  # those lanes get zero instead
156 | s_mov_b64 exec, s[6:7]  # restore exec
157 |
158 | # compiled program size
159 | s_mov_b64 s[6:7], s[8:9]
160 | s_mulk_i32 s6, 10048  # s[6:7] = group id * 10048 (s7 is zero); added to s[4:5] further down
161 |
162 | v_add_u32 v5, vcc, v0, v5
163 | v_add_u32 v5, vcc, v5, 64  # v5 = 64 + sub * 16 (LDS address written by ds_write2_b64 in main_loop)
164 | s_mov_b64 s[8:9], exec  # same lane split as above, this time for the qword at LDS byte 168
165 | s_andn2_b64 exec, s[8:9], s[2:3]
166 | ds_read_b64 v[15:16], v0 offset:168
167 | s_andn2_b64 exec, s[8:9], exec
168 | v_mov_b32 v15, 0
169 | v_mov_b32 v16, 0
170 | s_mov_b64 exec, s[8:9]
171 | s_load_dwordx4 s[8:11], s[4:5], 0x30  # two 64-bit arguments at 0x30 and 0x38 (NOTE(review): presumably dataset and scratchpad per the .arg order - confirm)
172 |
173 | # batch_size
174 | s_load_dword s16, s[4:5], 0x58
175 |
176 | s_load_dwordx2 s[4:5], s[4:5], 0x50  # argument at offset 0x50 (NOTE(review): presumably "programs"); replaces the argument pointer in s[4:5]
177 | v_lshlrev_b32 v1, 3, v1  # v1 = sub * 8 from here on
178 | v_add_u32 v17, vcc, v0, v1  # v17 = LDS address sub * 8
179 | s_waitcnt lgkmcnt(0)
180 | v_add_u32 v2, vcc, s10, v2
181 | v_mov_b32 v18, s11
182 | v_addc_u32 v18, vcc, v18, v20, vcc  # v18:v2 = s[10:11] + group index * stride (scratchpad base of this group)
183 | v_mov_b32 v19, 0xffffff
184 | v_add_u32 v6, vcc, s8, v6
185 | v_mov_b32 v20, s9
186 | v_addc_u32 v20, vcc, v20, 0, vcc  # v20:v6 = s[8:9] + dword read from LDS byte 152
187 | ds_read_b64 v[21:22], v17  # v[21:22] = LDS qword at sub * 8
188 | s_add_u32 s4, s4, s6
189 | s_addc_u32 s5, s5, s7  # s[4:5] += group id * 10048: address called by s_swappc_b64 in main_loop
190 | v_cndmask_b32 v19, v19, -1, s[2:3]  # v19 = -1 on lanes with sub < 4, 0xffffff on the others
191 | v_lshlrev_b32 v8, 3, v35  # the four values read from LDS qwords 17 and 18 are scaled by 8 ...
192 | v_lshlrev_b32 v7, 3, v34
193 | v_lshlrev_b32 v12, 3, v12
194 | v_lshlrev_b32 v10, 3, v11
195 | v_add_u32 v8, vcc, v8, v0  # ... to form LDS byte addresses (v0 is still 0 here)
196 | v_add_u32 v7, vcc, v7, v0
197 | v_add_u32 v12, vcc, v12, v0
198 | v_add_u32 v0, vcc, v10, v0  # v0 and v12 are the two addresses read at the top of main_loop
199 | v_mov_b32 v10, v36  # v10 is called spAddr1 in the main_loop comments
200 | v_mov_b32 v23, v37  # v23 is called spAddr0 in the main_loop comments
201 |
202 | # loop counter
203 | s_sub_u32 s2, s24, 1
204 |
205 | # batch_size
206 | s_mov_b32 s3, s16
207 |
208 | # Scratchpad masks for scratchpads
209 | v_sub_u32 v38, vcc, s21, 8
210 | v_sub_u32 v39, vcc, s22, 8
211 | v_sub_u32 v50, vcc, s23, 8
212 |
213 | # mask for FSCAL_R
214 | v_mov_b32 v51, 0x80F00000
215 |
216 | # swap v3 and v18
217 | v_mov_b32 v52, v3  # after the swap v3 = high dword of the scratchpad base, v18 = low dword of the address computed into v[3:4] above
218 | v_mov_b32 v3, v18
219 | v_mov_b32 v18, v52
220 |
221 | # load scratchpad base address
222 | v_readlane_b32 s0, v2, 0
223 | v_readlane_b32 s1, v3, 0
224 |
225 | # save current execution mask
226 | s_mov_b64 s[36:37], exec
227 |
228 | # v41 = 0 on lane 0, set it to 8 on lane 1
229 | # v44 = 0 on lane 0, set it to 4 on lane 1
230 | s_mov_b64 exec, 2
231 | v_mov_b32 v41, 8
232 | v_mov_b32 v44, 4
233 |
234 | # load group A registers
235 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
236 | s_mov_b64 exec, 3
237 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26  # LDS qwords 24 and 26, shifted by v41 (0 on lane 0, 8 on lane 1)
238 | ds_read2_b64 v[56:59], v41 offset0:28 offset1:30  # LDS qwords 28 and 30, shifted the same way
239 |
240 | # xmantissaMask
241 | v_mov_b32 v77, (1 << 24) - 1
242 |
243 | # xexponentMask
244 | ds_read_b64 v[78:79], v41 offset:160  # LDS byte 160 on lane 0, byte 168 on lane 1
245 |
246 | # Restore execution mask
247 | s_mov_b64 exec, s[36:37]
248 |
249 | # sign mask (used in FSQRT_R)
250 | v_mov_b32 v82, 0x80000000
251 |
252 | # High 32 bits of "1.0" constant (used in FDIV_M)
253 | v_mov_b32 v83, (1023 << 20)
254 |
255 | # Used to multiply FP64 values by 0.5
256 | v_mov_b32 v84, (1 << 20)
257 |
258 | s_getpc_b64 s[14:15]  # s[14:15] = address of the next instruction, i.e. of the cur_addr label
259 | cur_addr:  # s[14:15] holds the address of this label (s_getpc_b64 just above); subroutine addresses below are computed relative to it
260 |
261 | # get addresses of FSQRT_R subroutines
262 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr  # s[40:41] = address of fsqrt_r_sub0
263 | s_addc_u32 s41, s15, 0
264 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr  # s[42:43] = address of fsqrt_r_sub1
265 | s_addc_u32 s43, s15, 0
266 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr  # s[44:45] = address of fsqrt_r_sub2
267 | s_addc_u32 s45, s15, 0
268 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr  # s[46:47] = address of fsqrt_r_sub3
269 | s_addc_u32 s47, s15, 0
270 |
271 | # get addresses of FDIV_M subroutines
272 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr  # s[48:49] = address of fdiv_m_sub0
273 | s_addc_u32 s49, s15, 0
274 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr  # s[50:51] = address of fdiv_m_sub1
275 | s_addc_u32 s51, s15, 0
276 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr  # s[52:53] = address of fdiv_m_sub2
277 | s_addc_u32 s53, s15, 0
278 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr  # s[54:55] = address of fdiv_m_sub3
279 | s_addc_u32 s55, s15, 0
280 |
281 | # get address for ISMULH_R subroutine
282 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr  # s[56:57] = address of ismulh_r_sub
283 | s_addc_u32 s57, s15, 0
284 |
285 | # get address for IMULH_R subroutine
286 | s_add_u32 s58, s14, imulh_r_sub - cur_addr  # s[58:59] = address of imulh_r_sub
287 | s_addc_u32 s59, s15, 0
288 |
289 | # used in IXOR_R instruction
290 | s_mov_b32 s63, -1
291 |
292 | # used in CBRANCH instruction
293 | s_mov_b32 s70, (0xFF << 8)  # s70..s85 = an 8-bit wide mask at bit positions 8..23
294 | s_mov_b32 s71, (0xFF << 9)
295 | s_mov_b32 s72, (0xFF << 10)
296 | s_mov_b32 s73, (0xFF << 11)
297 | s_mov_b32 s74, (0xFF << 12)
298 | s_mov_b32 s75, (0xFF << 13)
299 | s_mov_b32 s76, (0xFF << 14)
300 | s_mov_b32 s77, (0xFF << 15)
301 | s_mov_b32 s78, (0xFF << 16)
302 | s_mov_b32 s79, (0xFF << 17)
303 | s_mov_b32 s80, (0xFF << 18)
304 | s_mov_b32 s81, (0xFF << 19)
305 | s_mov_b32 s82, (0xFF << 20)
306 | s_mov_b32 s83, (0xFF << 21)
307 | s_mov_b32 s84, (0xFF << 22)
308 | s_mov_b32 s85, (0xFF << 23)
309 |
310 | # ScratchpadL3Mask64
311 | s_sub_u32 s86, s23, 64  # s86 = L3 size - 64
312 |
313 | main_loop:  # one iteration: mix scratchpad data into the registers, call the generated program, write results back; repeats until s2 reaches 0
314 | # const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
315 | ds_read_b64 v[24:25], v0
316 | ds_read_b64 v[26:27], v12
317 | s_waitcnt lgkmcnt(0)
318 | v_xor_b32 v25, v27, v25
319 | v_xor_b32 v24, v26, v24
320 |
321 | # spAddr1 ^= spMix.y;
322 | # spAddr0 ^= spMix.x;
323 | v_xor_b32 v10, v25, v10
324 | v_xor_b32 v23, v24, v23
325 |
326 | # spAddr1 &= ScratchpadL3Mask64;
327 | # spAddr0 &= ScratchpadL3Mask64;
328 | v_and_b32 v10, s86, v10
329 | v_and_b32 v23, s86, v23
330 |
331 | # Offset for scratchpads
332 | # offset1 = spAddr1 + sub * 8
333 | # offset0 = spAddr0 + sub * 8
334 | v_add_u32 v10, vcc, v10, v1
335 | v_add_u32 v23, vcc, v23, v1
336 |
337 | # __global ulong* p1 = (__global ulong*)(scratchpad + offset1);
338 | # __global ulong* p0 = (__global ulong*)(scratchpad + offset0);
339 | v_add_u32 v26, vcc, v2, v10
340 | v_addc_u32 v27, vcc, v3, 0, vcc  # v3 = high dword of the scratchpad base (after the v3/v18 swap in "begin")
341 | v_add_u32 v23, vcc, v2, v23
342 | v_addc_u32 v24, vcc, v3, 0, vcc
343 |
344 | # load from spAddr1
345 | flat_load_dwordx2 v[28:29], v[26:27]
346 |
347 | # load from spAddr0
348 | flat_load_dwordx2 v[30:31], v[23:24]
349 | s_waitcnt vmcnt(1)  # wait for the first load only (*p1)
350 |
351 | v_cvt_f64_i32 v[32:33], v28  # low dword of *p1 converted from int32 to double
352 | v_cvt_f64_i32 v[28:29], v29  # high dword of *p1 converted from int32 to double
353 | s_waitcnt vmcnt(0)
354 |
355 | # R[sub] ^= *p0;
356 | v_xor_b32 v34, v21, v30
357 | v_xor_b32 v35, v22, v31
358 |
359 | v_add_u32 v22, vcc, v6, v36  # address = v20:v6 (set up in "begin") + v36 ...
360 | v_addc_u32 v25, vcc, v20, 0, vcc
361 | v_add_u32 v21, vcc, v22, v1  # ... + sub * 8
362 | v_addc_u32 v22, vcc, v25, 0, vcc
363 | flat_load_dwordx2 v[21:22], v[21:22]  # qword that is xor-ed into the integer registers after the program call
364 | v_or_b32 v30, v32, v13  # combine the converted doubles with v[13:14] / v[15:16] (zero on lanes with sub < 4) ...
365 | v_and_b32 v31, v33, v19  # ... and mask the high dwords with v19 (-1 on lanes with sub < 4, 0xffffff otherwise)
366 | v_or_b32 v31, v31, v14
367 | v_or_b32 v28, v28, v15
368 | v_and_b32 v29, v29, v19
369 | v_or_b32 v29, v29, v16
370 | ds_write2_b64 v5, v[30:31], v[28:29] offset1:1  # store both doubles at LDS bytes v5 and v5 + 8 (v5 = 64 + sub * 16)
371 | s_waitcnt lgkmcnt(0)
372 |
373 | # Program 0
374 |
375 | # load group F,E registers
376 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
377 | s_mov_b64 exec, 3
378 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10
379 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14
380 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18
381 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22
382 |
383 | # load VM integer registers
384 | v_readlane_b32 s16, v34, 0  # lane i of v[34:35] goes to the SGPR pair s[16+2i : 17+2i]
385 | v_readlane_b32 s17, v35, 0
386 | v_readlane_b32 s18, v34, 1
387 | v_readlane_b32 s19, v35, 1
388 | v_readlane_b32 s20, v34, 2
389 | v_readlane_b32 s21, v35, 2
390 | v_readlane_b32 s22, v34, 3
391 | v_readlane_b32 s23, v35, 3
392 | v_readlane_b32 s24, v34, 4
393 | v_readlane_b32 s25, v35, 4
394 | v_readlane_b32 s26, v34, 5
395 | v_readlane_b32 s27, v35, 5
396 | v_readlane_b32 s28, v34, 6
397 | v_readlane_b32 s29, v35, 6
398 | v_readlane_b32 s30, v34, 7
399 | v_readlane_b32 s31, v35, 7
400 |
401 | s_waitcnt lgkmcnt(0)
402 |
403 | # call JIT code
404 | s_swappc_b64 s[12:13], s[4:5]  # jump to s[4:5]; the return address is saved in s[12:13]
405 |
406 | # Write out group F,E registers
407 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1
408 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
409 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
410 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
411 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22
412 |
413 | # store VM integer registers
414 | v_writelane_b32 v28, s16, 0  # SGPR pair s[16+2i : 17+2i] goes back to lane i of v[28:29]
415 | v_writelane_b32 v29, s17, 0
416 | v_writelane_b32 v28, s18, 1
417 | v_writelane_b32 v29, s19, 1
418 | v_writelane_b32 v28, s20, 2
419 | v_writelane_b32 v29, s21, 2
420 | v_writelane_b32 v28, s22, 3
421 | v_writelane_b32 v29, s23, 3
422 | v_writelane_b32 v28, s24, 4
423 | v_writelane_b32 v29, s25, 4
424 | v_writelane_b32 v28, s26, 5
425 | v_writelane_b32 v29, s27, 5
426 | v_writelane_b32 v28, s28, 6
427 | v_writelane_b32 v29, s29, 6
428 | v_writelane_b32 v28, s30, 7
429 | v_writelane_b32 v29, s31, 7
430 |
431 | # Restore execution mask
432 | s_mov_b64 exec, s[36:37]
433 |
434 | # Write out VM integer registers
435 | ds_write_b64 v17, v[28:29]
436 |
437 | s_waitcnt lgkmcnt(0)
438 | v_xor_b32 v21, v28, v21  # v[21:22] = integer register of this lane ^ qword loaded into v[21:22] before the program call
439 | v_xor_b32 v22, v29, v22
440 | ds_read_b32 v28, v7  # low dwords of the LDS qwords addressed by v7 and v8 (set up in "begin")
441 | ds_read_b32 v29, v8
442 | ds_write_b64 v17, v[21:22]  # store the xor-ed register back to LDS at sub * 8
443 | s_waitcnt lgkmcnt(1)
444 | ds_read2_b64 v[30:33], v17 offset0:8 offset1:16  # v[30:31] = LDS qword at v17 + 64, v[32:33] = LDS qword at v17 + 128
445 | v_xor_b32 v10, v28, v37
446 | s_waitcnt lgkmcnt(0)
447 | v_xor_b32 v30, v32, v30  # v[30:31] ^= v[32:33]
448 | v_xor_b32 v31, v33, v31
449 | v_xor_b32 v10, v10, v29  # v10 = v37 ^ the two dwords read through v7 / v8
450 | flat_store_dwordx2 v[26:27], v[21:22]  # write the integer register to p1
451 | v_and_b32 v10, 0x7fffffc0, v10
452 | flat_store_dwordx2 v[23:24], v[30:31]  # write the xor-ed qword to p0
453 | s_cmp_eq_u32 s2, 0  # loop counter exhausted?
454 | s_cbranch_scc1 main_loop_end
455 | s_sub_i32 s2, s2, 1
456 | v_mov_b32 v37, v36  # previous v36 becomes v37, the new masked value becomes v36
457 | v_mov_b32 v23, 0  # spAddr0 restarts from 0 for the next iteration
458 | v_mov_b32 v36, v10
459 | v_mov_b32 v10, 0  # spAddr1 restarts from 0 for the next iteration
460 | s_branch main_loop
461 | main_loop_end:  # write three qwords per lane and the rounding mode back to global memory, then end the program
462 |
463 | v_add_u32 v0, vcc, v18, v1  # v18 = low dword of the address computed into v[3:4] in "begin" (after the v3/v18 swap); v1 = sub * 8
464 | v_addc_u32 v1, vcc, v4, 0, vcc
465 | flat_store_dwordx2 v[0:1], v[21:22]  # stored at base + sub * 8
466 | v_add_u32 v0, vcc, v0, 64
467 | v_addc_u32 v1, vcc, v1, 0, vcc
468 | flat_store_dwordx2 v[0:1], v[30:31]  # stored 64 bytes further
469 | v_add_u32 v0, vcc, v0, 64
470 | v_addc_u32 v1, vcc, v1, 0, vcc
471 | flat_store_dwordx2 v[0:1], v[32:33]  # stored 128 bytes further
472 |
473 | # store rounding mode
474 | v_mov_b32 v0, s64  # s[64:65] = address the rounding mode was loaded from in "begin"
475 | v_mov_b32 v1, s65
476 | v_mov_b32 v2, s66
477 | flat_store_dword v[0:1], v2
478 |
479 | program_end:  # also the target of the early exit in "begin"
480 | s_endpgm
481 |
482 | fsqrt_r_sub0:  # FSQRT_R helper: replaces x = v[68:69] with its square root on lanes where x is a positive normal value, then jumps to s[60:61]
483 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
484 | v_rsq_f64 v[28:29], v[68:69]  # y = approximation of 1 / sqrt(x)
485 |
486 | # Improve initial approximation (can be skipped)
487 | #v_mul_f64 v[42:43], v[28:29], v[68:69]
488 | #v_mul_f64 v[48:49], v[28:29], -0.5
489 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
490 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
491 |
492 | v_mul_f64 v[42:43], v[28:29], v[68:69]  # g = y * x
493 | v_mov_b32 v48, v28
494 | v_sub_u32 v49, vcc, v29, v84  # v[48:49] = h = y with (1 << 20) subtracted from the high dword (v84, "multiply by 0.5")
495 | v_mov_b32 v46, v28
496 | v_xor_b32 v47, v49, v82  # v[46:47] = -h (sign bit flipped with v82)
497 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = 0.5 - h * g
498 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g + g * r
499 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h + h * r
500 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]  # d = x - g * g
501 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
502 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = g + d * h
503 | v_cmpx_class_f64 s[14:15], v[68:69], s[68:69]  # keep only lanes whose x matches class mask s[68:69] (256, "positive normal value")
504 | v_mov_b32 v68, v42  # overwrite x with the result on those lanes
505 | v_mov_b32 v69, v43
506 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
507 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
508 |
509 | fsqrt_r_sub1:  # FSQRT_R helper: replaces x = v[70:71] with its square root on lanes where x is a positive normal value, then jumps to s[60:61]
510 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
511 | v_rsq_f64 v[28:29], v[70:71]  # y = approximation of 1 / sqrt(x)
512 |
513 | # Improve initial approximation (can be skipped)
514 | #v_mul_f64 v[42:43], v[28:29], v[70:71]
515 | #v_mul_f64 v[48:49], v[28:29], -0.5
516 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
517 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
518 |
519 | v_mul_f64 v[42:43], v[28:29], v[70:71]  # g = y * x
520 | v_mov_b32 v48, v28
521 | v_sub_u32 v49, vcc, v29, v84  # v[48:49] = h = y with (1 << 20) subtracted from the high dword (v84, "multiply by 0.5")
522 | v_mov_b32 v46, v28
523 | v_xor_b32 v47, v49, v82  # v[46:47] = -h (sign bit flipped with v82)
524 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = 0.5 - h * g
525 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g + g * r
526 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h + h * r
527 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]  # d = x - g * g
528 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
529 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = g + d * h
530 | v_cmpx_class_f64 s[14:15], v[70:71], s[68:69]  # keep only lanes whose x matches class mask s[68:69] (256, "positive normal value")
531 | v_mov_b32 v70, v42  # overwrite x with the result on those lanes
532 | v_mov_b32 v71, v43
533 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
534 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
535 |
536 | fsqrt_r_sub2:  # FSQRT_R helper: replaces x = v[72:73] with its square root on lanes where x is a positive normal value, then jumps to s[60:61]
537 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
538 | v_rsq_f64 v[28:29], v[72:73]  # y = approximation of 1 / sqrt(x)
539 |
540 | # Improve initial approximation (can be skipped)
541 | #v_mul_f64 v[42:43], v[28:29], v[72:73]
542 | #v_mul_f64 v[48:49], v[28:29], -0.5
543 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
544 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
545 |
546 | v_mul_f64 v[42:43], v[28:29], v[72:73]  # g = y * x
547 | v_mov_b32 v48, v28
548 | v_sub_u32 v49, vcc, v29, v84  # v[48:49] = h = y with (1 << 20) subtracted from the high dword (v84, "multiply by 0.5")
549 | v_mov_b32 v46, v28
550 | v_xor_b32 v47, v49, v82  # v[46:47] = -h (sign bit flipped with v82)
551 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = 0.5 - h * g
552 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g + g * r
553 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h + h * r
554 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]  # d = x - g * g
555 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
556 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = g + d * h
557 | v_cmpx_class_f64 s[14:15], v[72:73], s[68:69]  # keep only lanes whose x matches class mask s[68:69] (256, "positive normal value")
558 | v_mov_b32 v72, v42  # overwrite x with the result on those lanes
559 | v_mov_b32 v73, v43
560 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
561 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
562 |
563 | fsqrt_r_sub3:  # FSQRT_R helper: replaces x = v[74:75] with its square root on lanes where x is a positive normal value, then jumps to s[60:61]
564 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
565 | v_rsq_f64 v[28:29], v[74:75]  # y = approximation of 1 / sqrt(x)
566 |
567 | # Improve initial approximation (can be skipped)
568 | #v_mul_f64 v[42:43], v[28:29], v[74:75]
569 | #v_mul_f64 v[48:49], v[28:29], -0.5
570 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
571 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
572 |
573 | v_mul_f64 v[42:43], v[28:29], v[74:75]  # g = y * x
574 | v_mov_b32 v48, v28
575 | v_sub_u32 v49, vcc, v29, v84  # v[48:49] = h = y with (1 << 20) subtracted from the high dword (v84, "multiply by 0.5")
576 | v_mov_b32 v46, v28
577 | v_xor_b32 v47, v49, v82  # v[46:47] = -h (sign bit flipped with v82)
578 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = 0.5 - h * g
579 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g + g * r
580 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h + h * r
581 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]  # d = x - g * g
582 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
583 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = g + d * h
584 | v_cmpx_class_f64 s[14:15], v[74:75], s[68:69]  # keep only lanes whose x matches class mask s[68:69] (256, "positive normal value")
585 | v_mov_b32 v74, v42  # overwrite x with the result on those lanes
586 | v_mov_b32 v75, v43
587 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
588 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
589 |
590 | fdiv_m_sub0:  # FDIV_M helper: a = v[68:69] is divided by b = masked v[28:29]; the quotient replaces v[68:69], then jumps to s[60:61]
591 | v_or_b32 v28, v28, v78  # b: or in the low dword of xexponentMask (v[78:79])
592 | v_and_b32 v29, v29, v77  # b: keep the low 24 bits of the high dword (xmantissaMask)
593 | v_or_b32 v29, v29, v79  # b: or in the high dword of xexponentMask
594 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
595 | v_rcp_f64 v[48:49], v[28:29]  # r = approximation of 1 / b
596 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
597 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r + r * e
598 | v_mul_f64 v[80:81], v[68:69], v[48:49]  # q = a * r
599 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]  # rem = a - b * q
600 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
601 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = q + rem * r
602 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]  # hardware fixup of the quotient given divisor b and dividend a
603 | v_cmpx_eq_f64 s[14:15], v[68:69], v[28:29]  # on lanes where a == b ...
604 | v_mov_b32 v80, 0
605 | v_mov_b32 v81, v83  # ... force the result to exactly 1.0 (v83 = high dword of 1.0)
606 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
607 | v_mov_b32 v68, v80  # write the quotient back
608 | v_mov_b32 v69, v81
609 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
610 |
611 | fdiv_m_sub1:  # FDIV_M helper: a = v[70:71] is divided by b = masked v[28:29]; the quotient replaces v[70:71], then jumps to s[60:61]
612 | v_or_b32 v28, v28, v78  # b: or in the low dword of xexponentMask (v[78:79])
613 | v_and_b32 v29, v29, v77  # b: keep the low 24 bits of the high dword (xmantissaMask)
614 | v_or_b32 v29, v29, v79  # b: or in the high dword of xexponentMask
615 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
616 | v_rcp_f64 v[48:49], v[28:29]  # r = approximation of 1 / b
617 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
618 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r + r * e
619 | v_mul_f64 v[80:81], v[70:71], v[48:49]  # q = a * r
620 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]  # rem = a - b * q
621 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
622 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = q + rem * r
623 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]  # hardware fixup of the quotient given divisor b and dividend a
624 | v_cmpx_eq_f64 s[14:15], v[70:71], v[28:29]  # on lanes where a == b ...
625 | v_mov_b32 v80, 0
626 | v_mov_b32 v81, v83  # ... force the result to exactly 1.0 (v83 = high dword of 1.0)
627 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
628 | v_mov_b32 v70, v80  # write the quotient back
629 | v_mov_b32 v71, v81
630 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
631 |
632 | fdiv_m_sub2:  # FDIV_M helper: a = v[72:73] is divided by b = masked v[28:29]; the quotient replaces v[72:73], then jumps to s[60:61]
633 | v_or_b32 v28, v28, v78  # b: or in the low dword of xexponentMask (v[78:79])
634 | v_and_b32 v29, v29, v77  # b: keep the low 24 bits of the high dword (xmantissaMask)
635 | v_or_b32 v29, v29, v79  # b: or in the high dword of xexponentMask
636 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
637 | v_rcp_f64 v[48:49], v[28:29]  # r = approximation of 1 / b
638 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
639 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r + r * e
640 | v_mul_f64 v[80:81], v[72:73], v[48:49]  # q = a * r
641 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]  # rem = a - b * q
642 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
643 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = q + rem * r
644 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]  # hardware fixup of the quotient given divisor b and dividend a
645 | v_cmpx_eq_f64 s[14:15], v[72:73], v[28:29]  # on lanes where a == b ...
646 | v_mov_b32 v80, 0
647 | v_mov_b32 v81, v83  # ... force the result to exactly 1.0 (v83 = high dword of 1.0)
648 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
649 | v_mov_b32 v72, v80  # write the quotient back
650 | v_mov_b32 v73, v81
651 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
652 |
653 | fdiv_m_sub3:  # FDIV_M helper: a = v[74:75] is divided by b = masked v[28:29]; the quotient replaces v[74:75], then jumps to s[60:61]
654 | v_or_b32 v28, v28, v78  # b: or in the low dword of xexponentMask (v[78:79])
655 | v_and_b32 v29, v29, v77  # b: keep the low 24 bits of the high dword (xmantissaMask)
656 | v_or_b32 v29, v29, v79  # b: or in the high dword of xexponentMask
657 | s_setreg_b32 hwreg(mode, 2, 2), s67  # temporarily write s67 (0) to the MODE field that normally holds the rounding mode s66
658 | v_rcp_f64 v[48:49], v[28:29]  # r = approximation of 1 / b
659 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
660 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r + r * e
661 | v_mul_f64 v[80:81], v[74:75], v[48:49]  # q = a * r
662 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]  # rem = a - b * q
663 | s_setreg_b32 hwreg(mode, 2, 2), s66  # restore the rounding mode before the final step
664 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = q + rem * r
665 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]  # hardware fixup of the quotient given divisor b and dividend a
666 | v_cmpx_eq_f64 s[14:15], v[74:75], v[28:29]  # on lanes where a == b ...
667 | v_mov_b32 v80, 0
668 | v_mov_b32 v81, v83  # ... force the result to exactly 1.0 (v83 = high dword of 1.0)
669 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
670 | v_mov_b32 v74, v80  # write the quotient back
671 | v_mov_b32 v75, v81
672 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
673 |
674 | ismulh_r_sub:  # ISMULH_R helper: s[14:15] = high 64 bits of the signed 128-bit product s[14:15] * s[38:39]; then jumps to s[60:61]
675 | s_mov_b64 exec, 1  # lane 0 only, where v41 and v44 are 0 and serve as zero high dwords below
676 | v_mov_b32 v45, s14  # a_lo
677 | v_mul_hi_u32 v40, s38, v45  # v40 = high dword of b_lo * a_lo
678 | v_mov_b32 v47, s15  # a_hi
679 | v_mad_u64_u32 v[42:43], s[32:33], s38, v47, v[40:41]  # b_lo * a_hi + v40
680 | v_mov_b32 v40, v42
681 | v_mad_u64_u32 v[45:46], s[32:33], s39, v45, v[40:41]  # b_hi * a_lo + low dword of the previous sum
682 | v_mad_u64_u32 v[42:43], s[32:33], s39, v47, v[43:44]  # b_hi * a_hi + high dword of the first sum
683 | v_add_u32 v42, vcc, v42, v46  # add the high dword of the middle sum
684 | v_addc_u32 v43, vcc, 0, v43, vcc  # v[42:43] = unsigned high 64 bits of the product
685 | v_readlane_b32 s32, v42, 0
686 | v_readlane_b32 s33, v43, 0
687 | s_cmp_lt_i32 s15, 0  # signed correction: if a < 0 subtract b ...
688 | s_cselect_b64 s[34:35], s[38:39], 0
689 | s_sub_u32 s32, s32, s34
690 | s_subb_u32 s33, s33, s35
691 | s_cmp_lt_i32 s39, 0  # ... and if b < 0 subtract a
692 | s_cselect_b64 s[34:35], s[14:15], 0
693 | s_sub_u32 s14, s32, s34
694 | s_subb_u32 s15, s33, s35
695 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
696 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
697 |
698 | imulh_r_sub:  # IMULH_R helper: s[14:15] = high 64 bits of the unsigned 128-bit product s[14:15] * s[38:39]; then jumps to s[60:61]
699 | s_mov_b64 exec, 1  # lane 0 only, where v41 and v44 are 0 and serve as zero high dwords below
700 | v_mov_b32 v45, s38  # b_lo
701 | v_mul_hi_u32 v40, s14, v45  # v40 = high dword of a_lo * b_lo
702 | v_mov_b32 v47, s39  # b_hi
703 | v_mad_u64_u32 v[42:43], s[32:33], s14, v47, v[40:41]  # a_lo * b_hi + v40
704 | v_mov_b32 v40, v42
705 | v_mad_u64_u32 v[45:46], s[32:33], s15, v45, v[40:41]  # a_hi * b_lo + low dword of the previous sum
706 | v_mad_u64_u32 v[42:43], s[32:33], s15, v47, v[43:44]  # a_hi * b_hi + high dword of the first sum
707 | v_add_u32 v42, vcc, v42, v46  # add the high dword of the middle sum
708 | v_addc_u32 v43, vcc, 0, v43, vcc  # v[42:43] = high 64 bits of the product
709 | v_readlane_b32 s14, v42, 0
710 | v_readlane_b32 s15, v43, 0
711 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
712 | s_setpc_b64 s[60:61]  # return (NOTE(review): s[60:61] is presumably set by the generated code before jumping here - not visible in this file)
713 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/GCNASM/randomx_run_gfx900.asm:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | .amdcl2
21 | .gpu GFX900
22 | .64bit
23 | .arch_minor 0
24 | .arch_stepping 0
25 | .driver_version 223600
26 | .kernel randomx_run  # kernel configuration, followed by the entry stub that jumps to "begin"
27 | .config
28 | .dims x
29 | .cws 64, 1, 1
30 | .sgprsnum 96  # the code visible in this file uses SGPRs up to s86
31 | # 6 waves per SIMD: 37-40 VGPRs
32 | # 5 waves per SIMD: 41-48 VGPRs
33 | # 4 waves per SIMD: 49-64 VGPRs
34 | # 3 waves per SIMD: 65-84 VGPRs
35 | # 2 waves per SIMD: 85-128 VGPRs
36 | # 1 wave per SIMD: 129-256 VGPRs
37 | .vgprsnum 128  # 2 waves per SIMD according to the table above
38 | .localsize 256  # "begin" stores one dword per lane of a 64-lane group into LDS (64 * 4 bytes)
39 | .floatmode 0xc0
40 | .pgmrsrc1 0x00ac035f
41 | .pgmrsrc2 0x00000090  # bits 1:5 = 8: s0-s7 hold user data and s8 the group id (see the note before "begin")
42 | .dx10clamp
43 | .ieeemode
44 | .useargs
45 | .priority 0
46 | .arg _.global_offset_0, "size_t", long
47 | .arg _.global_offset_1, "size_t", long
48 | .arg _.global_offset_2, "size_t", long
49 | .arg _.printf_buffer, "size_t", void*, global, , rdonly
50 | .arg _.vqueue_pointer, "size_t", long
51 | .arg _.aqlwrap_pointer, "size_t", long
52 | .arg dataset, "uchar*", uchar*, global, const, rdonly
53 | .arg scratchpad, "uchar*", uchar*, global,
54 | .arg registers, "ulong*", ulong*, global,
55 | .arg rounding_modes, "uint*", uint*, global,
56 | .arg programs, "uint*", uint*, global,
57 | .arg batch_size, "uint", uint  # read from argument offset 0x58 in "begin"
58 | .arg rx_parameters, "uint", uint  # read from argument offset 0x5c in "begin"
59 | .text
60 | s_mov_b32 m0, 0x10000  # initialise M0 (NOTE(review): presumably the LDS access limit needed before DS instructions - confirm against the ISA manual)
61 | s_dcache_wb  # write back the scalar data cache
62 | s_waitcnt vmcnt(0) & lgkmcnt(0)  # wait for all outstanding memory operations
63 | s_icache_inv  # invalidate the instruction cache (NOTE(review): presumably so the generated program called from main_loop is fetched fresh - confirm)
64 | s_branch begin
65 |
66 | # pgmrsrc2 = 0x00000090, bits 1:5 = 8, so first 8 SGPRs (s0-s7) contain user data
67 | # s8 contains group id
68 | # v0 contains local id
69 | begin:  # one-time setup: load kernel arguments, copy the VM register file into LDS and precompute the constants used by main_loop
70 | v_lshl_add_u32 v1, s8, 6, v0  # v1 = group id * 64 + local id
71 | s_load_dwordx2 s[0:1], s[4:5], 0x0  # argument at offset 0x0 (_.global_offset_0 is the first .arg)
72 | s_load_dwordx2 s[2:3], s[4:5], 0x40  # argument at offset 0x40 (NOTE(review): presumably "registers" given the .arg order - confirm)
73 | s_load_dwordx2 s[64:65], s[4:5], 0x48  # argument at offset 0x48 (NOTE(review): presumably "rounding_modes" - confirm)
74 | s_waitcnt lgkmcnt(0)
75 |
76 | # load rounding mode
77 | s_lshl_b32 s16, s8, 2  # byte offset = group id * 4
78 | s_add_u32 s64, s64, s16
79 | s_addc_u32 s65, s65, 0  # s[64:65] = address of this group's rounding mode (reused for the store at the end)
80 | v_mov_b32 v8, 0
81 | global_load_dword v8, v8, s[64:65]
82 | s_waitcnt vmcnt(0)
83 | v_readlane_b32 s66, v8, 0  # s66 = rounding mode value of this group
84 | s_setreg_b32 hwreg(mode, 2, 2), s66  # write it into the 2-bit field at bit offset 2 of the MODE register
85 | s_mov_b32 s67, 0
86 |
87 | # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
88 | s_mov_b32 s68, 256
89 | s_mov_b32 s69, 0
90 |
91 | v_add_u32 v1, s0, v1  # add the low dword of the argument loaded from offset 0x0
92 | v_lshrrev_b32 v2, 6, v1  # v2 = v1 / 64 (index of the 64-lane group)
93 | v_lshlrev_b32 v3, 5, v2
94 | v_and_b32 v1, 63, v1  # v1 = v1 % 64 (called "sub" in the main_loop comments)
95 | v_mov_b32 v4, 0
96 | v_lshlrev_b64 v[3:4], 3, v[3:4]  # v[3:4] = v2 * 256 (byte offset of this group's 256-byte register block)
97 | v_lshlrev_b32 v5, 4, v1  # v5 = sub * 16
98 | v_add_co_u32 v3, vcc, s2, v3
99 | v_mov_b32 v6, s3
100 | v_addc_co_u32 v4, vcc, v6, v4, vcc  # v[3:4] = s[2:3] + v2 * 256
101 | v_lshlrev_b32 v41, 2, v1  # v41 = sub * 4
102 | v_add_co_u32 v6, vcc, v3, v41
103 | v_addc_co_u32 v7, vcc, v4, 0, vcc
104 | global_load_dword v6, v[6:7], off  # each lane loads one dword of the 256-byte block
105 | v_mov_b32 v0, 0  # v0 = 0 is used as the LDS base address below
106 | s_waitcnt vmcnt(0)
107 | ds_write_b32 v41, v6  # copy the dword into LDS at byte sub * 4
108 | s_waitcnt lgkmcnt(0)
109 | s_mov_b64 s[0:1], exec  # copy of exec; s[0:1] is overwritten further down by the scratchpad base address
110 | v_cmpx_le_u32 s[2:3], v1, 7  # keep only lanes with sub <= 7 active
111 | s_cbranch_execz program_end  # exit if no lane remains active
112 |
113 | # rx_parameters
114 | s_load_dword s20, s[4:5], 0x5c
115 | s_waitcnt lgkmcnt(0)
116 |
117 | # Scratchpad L1 size
118 | s_bfe_u32 s21, s20, 0x050000  # 5-bit field at bit 0 of rx_parameters
119 | s_lshl_b32 s21, 1, s21  # s21 = 1 << field
120 |
121 | # Scratchpad L2 size
122 | s_bfe_u32 s22, s20, 0x050005  # 5-bit field at bit 5
123 | s_lshl_b32 s22, 1, s22  # s22 = 1 << field
124 |
125 | # Scratchpad L3 size
126 | s_bfe_u32 s23, s20, 0x05000A  # 5-bit field at bit 10
127 | s_lshl_b32 s23, 1, s23  # s23 = 1 << field
128 |
129 | # program iterations
130 | s_bfe_u32 s24, s20, 0x04000F  # 4-bit field at bit 15
131 | s_lshl_b32 s24, 1, s24  # s24 = 1 << field
132 |
133 | # Base address for scratchpads
134 | s_add_u32 s2, s23, 64  # per-group stride = L3 size + 64 bytes
135 | v_mul_hi_u32 v20, v2, s2
136 | v_mul_lo_u32 v2, v2, s2  # v20:v2 = 64-bit product group index * stride
137 |
138 | # v41, v44 = 0
139 | v_mov_b32 v41, 0
140 | v_mov_b32 v44, 0
141 |
142 | ds_read_b32 v6, v0 offset:152  # v6 = dword at LDS byte 152 (added to s[8:9] further down)
143 | v_cmp_lt_u32 s[2:3], v1, 4  # s[2:3] = mask of lanes with sub < 4
144 | ds_read2_b64 v[34:37], v0 offset0:18 offset1:16  # v[34:35] = LDS qword 18, v[36:37] = LDS qword 16
145 | ds_read_b64 v[11:12], v0 offset:136  # v[11:12] = LDS qword 17
146 | s_movk_i32 s9, 0x0  # zero s9 so that s[8:9] is the group id as a 64-bit value
147 | s_mov_b64 s[6:7], exec  # save exec
148 | s_andn2_b64 exec, s[6:7], s[2:3]  # activate only lanes with sub >= 4
149 | ds_read_b64 v[13:14], v0 offset:160  # those lanes get the qword at LDS byte 160
150 | s_andn2_b64 exec, s[6:7], exec  # switch to the remaining lanes (sub < 4)
151 | v_mov_b32 v13, 0
152 | v_mov_b32 v14, 0  # those lanes get zero instead
153 | s_mov_b64 exec, s[6:7]  # restore exec
154 |
155 | # compiled program size
156 | s_mov_b64 s[6:7], s[8:9]
157 | s_mulk_i32 s6, 10048  # s[6:7] = group id * 10048 (s7 is zero); added to s[4:5] further down
158 |
159 | v_add3_u32 v5, v0, v5, 64  # v5 = 64 + sub * 16 (LDS address written by ds_write2_b64 in main_loop)
160 | s_mov_b64 s[8:9], exec  # same lane split as above, this time for the qword at LDS byte 168
161 | s_andn2_b64 exec, s[8:9], s[2:3]
162 | ds_read_b64 v[15:16], v0 offset:168
163 | s_andn2_b64 exec, s[8:9], exec
164 | v_mov_b32 v15, 0
165 | v_mov_b32 v16, 0
166 | s_mov_b64 exec, s[8:9]
167 | s_load_dwordx4 s[8:11], s[4:5], 0x30  # two 64-bit arguments at 0x30 and 0x38 (NOTE(review): presumably dataset and scratchpad per the .arg order - confirm)
168 |
169 | # batch_size
170 | s_load_dword s16, s[4:5], 0x58
171 |
172 | s_load_dwordx2 s[4:5], s[4:5], 0x50  # argument at offset 0x50 (NOTE(review): presumably "programs"); replaces the argument pointer in s[4:5]
173 | v_lshlrev_b32 v1, 3, v1  # v1 = sub * 8 from here on
174 | v_add_u32 v17, v0, v1  # v17 = LDS address sub * 8
175 | s_waitcnt lgkmcnt(0)
176 | v_add_co_u32 v2, vcc, s10, v2
177 | v_mov_b32 v18, s11
178 | v_addc_co_u32 v18, vcc, v18, v20, vcc  # v18:v2 = s[10:11] + group index * stride (scratchpad base of this group)
179 | v_mov_b32 v19, 0xffffff
180 | v_add_co_u32 v6, vcc, s8, v6
181 | v_mov_b32 v20, s9
182 | v_addc_co_u32 v20, vcc, v20, 0, vcc  # v20:v6 = s[8:9] + dword read from LDS byte 152
183 | ds_read_b64 v[21:22], v17  # v[21:22] = LDS qword at sub * 8
184 | s_add_u32 s4, s4, s6
185 | s_addc_u32 s5, s5, s7  # s[4:5] += group id * 10048: address called by s_swappc_b64 in main_loop
186 | v_cndmask_b32 v19, v19, -1, s[2:3]  # v19 = -1 on lanes with sub < 4, 0xffffff on the others
187 | v_lshl_add_u32 v8, v35, 3, v0  # the four values read from LDS qwords 17 and 18 are scaled by 8 to form LDS byte addresses (v0 is still 0 here)
188 | v_lshl_add_u32 v7, v34, 3, v0
189 | v_lshl_add_u32 v12, v12, 3, v0
190 | v_lshl_add_u32 v0, v11, 3, v0  # v0 and v12 are the two addresses read at the top of main_loop
191 | v_mov_b32 v10, v36  # v10 is called spAddr1 in the main_loop comments
192 | v_mov_b32 v23, v37  # v23 is called spAddr0 in the main_loop comments
193 |
194 | # loop counter
195 | s_sub_u32 s2, s24, 1
196 |
197 | # batch_size
198 | s_mov_b32 s3, s16
199 |
200 | # Scratchpad masks for scratchpads
201 | v_sub_u32 v38, s21, 8
202 | v_sub_u32 v39, s22, 8
203 | v_sub_u32 v50, s23, 8
204 |
205 | # mask for FSCAL_R
206 | v_mov_b32 v51, 0x80F00000
207 |
208 | # load scratchpad base address
209 | v_readlane_b32 s0, v2, 0
210 | v_readlane_b32 s1, v18, 0
211 |
212 | # save current execution mask
213 | s_mov_b64 s[36:37], exec
214 |
215 | # v41 = 0 on lane 0, set it to 8 on lane 1
216 | # v44 = 0 on lane 0, set it to 4 on lane 1
217 | s_mov_b64 exec, 2
218 | v_mov_b32 v41, 8
219 | v_mov_b32 v44, 4
220 |
221 | # load group A registers
222 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
223 | s_mov_b64 exec, 3
224 | ds_read2_b64 v[52:55], v41 offset0:24 offset1:26  # LDS qwords 24 and 26, shifted by v41 (0 on lane 0, 8 on lane 1)
225 | ds_read2_b64 v[56:59], v41 offset0:28 offset1:30  # LDS qwords 28 and 30, shifted the same way
226 |
227 | # xmantissaMask
228 | v_mov_b32 v77, (1 << 24) - 1
229 |
230 | # xexponentMask
231 | ds_read_b64 v[78:79], v41 offset:160  # LDS byte 160 on lane 0, byte 168 on lane 1
232 |
233 | # Restore execution mask
234 | s_mov_b64 exec, s[36:37]
235 |
236 | # sign mask (used in FSQRT_R)
237 | v_mov_b32 v82, 0x80000000
238 |
239 | # High 32 bits of "1.0" constant (used in FDIV_M)
240 | v_mov_b32 v83, (1023 << 20)
241 |
242 | # Used to multiply FP64 values by 0.5
243 | v_mov_b32 v84, (1 << 20)
244 |
245 | s_getpc_b64 s[14:15]  # s[14:15] = address of the next instruction, i.e. of the cur_addr label
246 | cur_addr:  # s[14:15] holds the address of this label (s_getpc_b64 just above); subroutine addresses below are computed relative to it
247 |
248 | # get addresses of FSQRT_R subroutines
249 | s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr  # s[40:41] = address of fsqrt_r_sub0
250 | s_addc_u32 s41, s15, 0
251 | s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr  # s[42:43] = address of fsqrt_r_sub1
252 | s_addc_u32 s43, s15, 0
253 | s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr  # s[44:45] = address of fsqrt_r_sub2
254 | s_addc_u32 s45, s15, 0
255 | s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr  # s[46:47] = address of fsqrt_r_sub3
256 | s_addc_u32 s47, s15, 0
257 |
258 | # get addresses of FDIV_M subroutines
259 | s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr  # s[48:49] = address of fdiv_m_sub0
260 | s_addc_u32 s49, s15, 0
261 | s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr  # s[50:51] = address of fdiv_m_sub1
262 | s_addc_u32 s51, s15, 0
263 | s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr  # s[52:53] = address of fdiv_m_sub2
264 | s_addc_u32 s53, s15, 0
265 | s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr  # s[54:55] = address of fdiv_m_sub3
266 | s_addc_u32 s55, s15, 0
267 |
268 | # get address for ISMULH_R subroutine
269 | s_add_u32 s56, s14, ismulh_r_sub - cur_addr  # s[56:57] = address of ismulh_r_sub
270 | s_addc_u32 s57, s15, 0
271 |
272 | # get address for IMULH_R subroutine
273 | s_add_u32 s58, s14, imulh_r_sub - cur_addr  # s[58:59] = address of imulh_r_sub
274 | s_addc_u32 s59, s15, 0
275 |
276 | # used in IXOR_R instruction
277 | s_mov_b32 s63, -1
278 |
279 | # used in CBRANCH instruction
280 | s_mov_b32 s70, (0xFF << 8)  # s70..s85 = an 8-bit wide mask at bit positions 8..23
281 | s_mov_b32 s71, (0xFF << 9)
282 | s_mov_b32 s72, (0xFF << 10)
283 | s_mov_b32 s73, (0xFF << 11)
284 | s_mov_b32 s74, (0xFF << 12)
285 | s_mov_b32 s75, (0xFF << 13)
286 | s_mov_b32 s76, (0xFF << 14)
287 | s_mov_b32 s77, (0xFF << 15)
288 | s_mov_b32 s78, (0xFF << 16)
289 | s_mov_b32 s79, (0xFF << 17)
290 | s_mov_b32 s80, (0xFF << 18)
291 | s_mov_b32 s81, (0xFF << 19)
292 | s_mov_b32 s82, (0xFF << 20)
293 | s_mov_b32 s83, (0xFF << 21)
294 | s_mov_b32 s84, (0xFF << 22)
295 | s_mov_b32 s85, (0xFF << 23)
296 |
297 | # ScratchpadL3Mask64
298 | s_sub_u32 s86, s23, 64  # s86 = L3 size - 64
299 |
300 | main_loop:  # one iteration: mix scratchpad data into the registers, call the generated program, write results back; repeats until s2 reaches 0
301 | # const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
302 | ds_read_b64 v[24:25], v0
303 | ds_read_b64 v[26:27], v12
304 | s_waitcnt lgkmcnt(0)
305 | v_xor_b32 v25, v27, v25
306 | v_xor_b32 v24, v26, v24
307 |
308 | # spAddr1 ^= spMix.y;
309 | # spAddr0 ^= spMix.x;
310 | v_xor_b32 v10, v25, v10
311 | v_xor_b32 v23, v24, v23
312 |
313 | # spAddr1 &= ScratchpadL3Mask64;
314 | # spAddr0 &= ScratchpadL3Mask64;
315 | v_and_b32 v10, s86, v10
316 | v_and_b32 v23, s86, v23
317 |
318 | # Offset for scratchpads
319 | # offset1 = spAddr1 + sub * 8
320 | # offset0 = spAddr0 + sub * 8
321 | v_add_u32 v10, v10, v1
322 | v_add_u32 v23, v23, v1
323 |
324 | # __global ulong* p1 = (__global ulong*)(scratchpad + offset1);
325 | # __global ulong* p0 = (__global ulong*)(scratchpad + offset0);
326 | v_add_co_u32 v26, vcc, v2, v10
327 | v_addc_co_u32 v27, vcc, v18, 0, vcc  # v18 = high dword of the scratchpad base computed in "begin"
328 | v_add_co_u32 v23, vcc, v2, v23
329 | v_addc_co_u32 v24, vcc, v18, 0, vcc
330 |
331 | # load from spAddr1
332 | global_load_dwordx2 v[28:29], v[26:27], off
333 |
334 | # load from spAddr0
335 | global_load_dwordx2 v[30:31], v[23:24], off
336 | s_waitcnt vmcnt(1)  # wait for the first load only (*p1)
337 |
338 | v_cvt_f64_i32 v[32:33], v28  # low dword of *p1 converted from int32 to double
339 | v_cvt_f64_i32 v[28:29], v29  # high dword of *p1 converted from int32 to double
340 | s_waitcnt vmcnt(0)
341 |
342 | # R[sub] ^= *p0;
343 | v_xor_b32 v34, v21, v30
344 | v_xor_b32 v35, v22, v31
345 |
346 | v_add_co_u32 v22, vcc, v6, v36  # address = v20:v6 (set up in "begin") + v36 ...
347 | v_addc_co_u32 v25, vcc, v20, 0, vcc
348 | v_add_co_u32 v21, vcc, v22, v1  # ... + sub * 8
349 | v_addc_co_u32 v22, vcc, v25, 0, vcc
350 | global_load_dwordx2 v[21:22], v[21:22], off  # qword that is xor-ed into the integer registers after the program call
351 | v_or_b32 v30, v32, v13  # combine the converted doubles with v[13:14] / v[15:16] (zero on lanes with sub < 4) ...
352 | v_and_or_b32 v31, v33, v19, v14  # ... and mask the high dwords with v19 (-1 on lanes with sub < 4, 0xffffff otherwise)
353 | v_or_b32 v28, v28, v15
354 | v_and_or_b32 v29, v29, v19, v16
355 | ds_write2_b64 v5, v[30:31], v[28:29] offset1:1  # store both doubles at LDS bytes v5 and v5 + 8 (v5 = 64 + sub * 16)
356 | s_waitcnt lgkmcnt(0)
357 |
358 | # Program 0
359 |
360 | # load group F,E registers
361 | # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
362 | s_mov_b64 exec, 3
363 | ds_read2_b64 v[60:63], v41 offset0:8 offset1:10
364 | ds_read2_b64 v[64:67], v41 offset0:12 offset1:14
365 | ds_read2_b64 v[68:71], v41 offset0:16 offset1:18
366 | ds_read2_b64 v[72:75], v41 offset0:20 offset1:22
367 |
368 | # load VM integer registers
369 | v_readlane_b32 s16, v34, 0  # lane i of v[34:35] goes to the SGPR pair s[16+2i : 17+2i]
370 | v_readlane_b32 s17, v35, 0
371 | v_readlane_b32 s18, v34, 1
372 | v_readlane_b32 s19, v35, 1
373 | v_readlane_b32 s20, v34, 2
374 | v_readlane_b32 s21, v35, 2
375 | v_readlane_b32 s22, v34, 3
376 | v_readlane_b32 s23, v35, 3
377 | v_readlane_b32 s24, v34, 4
378 | v_readlane_b32 s25, v35, 4
379 | v_readlane_b32 s26, v34, 5
380 | v_readlane_b32 s27, v35, 5
381 | v_readlane_b32 s28, v34, 6
382 | v_readlane_b32 s29, v35, 6
383 | v_readlane_b32 s30, v34, 7
384 | v_readlane_b32 s31, v35, 7
385 |
386 | s_waitcnt lgkmcnt(0)
387 |
388 | # call JIT code
389 | s_swappc_b64 s[12:13], s[4:5]  # jump to s[4:5]; the return address is saved in s[12:13]
390 |
391 | # Write out group F,E registers
392 | # Write low 8 bytes from lane 0 and high 8 bytes from lane 1
393 | ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
394 | ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
395 | ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
396 | ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22
397 |
398 | # store VM integer registers
399 | v_writelane_b32 v28, s16, 0  # SGPR pair s[16+2i : 17+2i] goes back to lane i of v[28:29]
400 | v_writelane_b32 v29, s17, 0
401 | v_writelane_b32 v28, s18, 1
402 | v_writelane_b32 v29, s19, 1
403 | v_writelane_b32 v28, s20, 2
404 | v_writelane_b32 v29, s21, 2
405 | v_writelane_b32 v28, s22, 3
406 | v_writelane_b32 v29, s23, 3
407 | v_writelane_b32 v28, s24, 4
408 | v_writelane_b32 v29, s25, 4
409 | v_writelane_b32 v28, s26, 5
410 | v_writelane_b32 v29, s27, 5
411 | v_writelane_b32 v28, s28, 6
412 | v_writelane_b32 v29, s29, 6
413 | v_writelane_b32 v28, s30, 7
414 | v_writelane_b32 v29, s31, 7
415 |
416 | # Restore execution mask
417 | s_mov_b64 exec, s[36:37]
418 |
419 | # Write out VM integer registers
420 | ds_write_b64 v17, v[28:29]
421 |
422 | s_waitcnt lgkmcnt(0)
423 | v_xor_b32 v21, v28, v21  # v[21:22] = integer register of this lane ^ qword loaded into v[21:22] before the program call
424 | v_xor_b32 v22, v29, v22
425 | ds_read_b32 v28, v7  # low dwords of the LDS qwords addressed by v7 and v8 (set up in "begin")
426 | ds_read_b32 v29, v8
427 | ds_write_b64 v17, v[21:22]  # store the xor-ed register back to LDS at sub * 8
428 | s_waitcnt lgkmcnt(1)
429 | ds_read2_b64 v[30:33], v17 offset0:8 offset1:16  # v[30:31] = LDS qword at v17 + 64, v[32:33] = LDS qword at v17 + 128
430 | v_xor_b32 v10, v28, v37
431 | s_waitcnt lgkmcnt(0)
432 | v_xor_b32 v30, v32, v30  # v[30:31] ^= v[32:33]
433 | v_xor_b32 v31, v33, v31
434 | v_xor_b32 v10, v10, v29  # v10 = v37 ^ the two dwords read through v7 / v8
435 | global_store_dwordx2 v[26:27], v[21:22], off  # write the integer register to p1
436 | v_and_b32 v10, 0x7fffffc0, v10
437 | global_store_dwordx2 v[23:24], v[30:31], off  # write the xor-ed qword to p0
438 | s_cmp_eq_u32 s2, 0  # loop counter exhausted?
439 | s_cbranch_scc1 main_loop_end
440 | s_sub_i32 s2, s2, 1
441 | v_mov_b32 v37, v36  # previous v36 becomes v37, the new masked value becomes v36
442 | v_mov_b32 v23, 0  # spAddr0 restarts from 0 for the next iteration
443 | v_mov_b32 v36, v10
444 | v_mov_b32 v10, 0  # spAddr1 restarts from 0 for the next iteration
445 | s_branch main_loop
446 | main_loop_end:
447 |
448 | v_add_co_u32 v0, vcc, v3, v1
449 | v_addc_co_u32 v1, vcc, v4, 0, vcc
450 | global_store_dwordx2 v[0:1], v[21:22], off
451 | global_store_dwordx2 v[0:1], v[30:31], off inst_offset:64
452 | global_store_dwordx2 v[0:1], v[32:33], off inst_offset:128
453 |
454 | # store rounding mode
455 | v_mov_b32 v0, 0
456 | v_mov_b32 v1, s66
457 | global_store_dword v0, v1, s[64:65]
458 |
459 | program_end:
460 | s_endpgm
461 | # fsqrt_r_sub0: replaces the double in v[68:69] with its square root on lanes 0-1, then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[46:49], s[14:15]; leaves exec = 3.
462 | fsqrt_r_sub0:
463 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
464 | v_rsq_f64 v[28:29], v[68:69]  # y = hardware estimate of 1/sqrt(x), x = v[68:69]
465 |
466 | # Improve initial approximation (can be skipped)
467 | #v_mul_f64 v[42:43], v[28:29], v[68:69]
468 | #v_mul_f64 v[48:49], v[28:29], -0.5
469 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
470 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
471 |
472 | v_mul_f64 v[42:43], v[28:29], v[68:69]  # g = y * x, first approximation of sqrt(x)
473 | v_mov_b32 v48, v28
474 | v_sub_u32 v49, v29, v84  # h = y with v84 subtracted from its high dword (presumably an exponent decrement giving y/2; v84 is set outside this chunk - confirm)
475 | v_mov_b32 v46, v28
476 | v_xor_b32 v47, v49, v82  # t = h with its high dword XORed with v82 (presumably a sign flip giving -h; v82 is set outside this chunk - confirm)
477 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = t * g + 0.5
478 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g * r + g
479 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h * r + h
480 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]  # d = x - g * g, residual of the current estimate
481 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
482 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = d * h + g
483 | v_cmpx_class_f64 s[14:15], v[68:69], s[68:69]  # narrow exec to lanes whose input matches the class mask in s[68:69] (mask is set outside this chunk); the lane mask is also written to s[14:15]
484 | v_mov_b32 v68, v42  # store the result only for the lanes that passed the class test; other lanes keep their input
485 | v_mov_b32 v69, v43
486 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
487 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
488 | # fsqrt_r_sub1: replaces the double in v[70:71] with its square root on lanes 0-1, then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[46:49], s[14:15]; leaves exec = 3.
489 | fsqrt_r_sub1:
490 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
491 | v_rsq_f64 v[28:29], v[70:71]  # y = hardware estimate of 1/sqrt(x), x = v[70:71]
492 |
493 | # Improve initial approximation (can be skipped)
494 | #v_mul_f64 v[42:43], v[28:29], v[70:71]
495 | #v_mul_f64 v[48:49], v[28:29], -0.5
496 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
497 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
498 |
499 | v_mul_f64 v[42:43], v[28:29], v[70:71]  # g = y * x, first approximation of sqrt(x)
500 | v_mov_b32 v48, v28
501 | v_sub_u32 v49, v29, v84  # h = y with v84 subtracted from its high dword (presumably an exponent decrement giving y/2; v84 is set outside this chunk - confirm)
502 | v_mov_b32 v46, v28
503 | v_xor_b32 v47, v49, v82  # t = h with its high dword XORed with v82 (presumably a sign flip giving -h; v82 is set outside this chunk - confirm)
504 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = t * g + 0.5
505 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g * r + g
506 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h * r + h
507 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]  # d = x - g * g, residual of the current estimate
508 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
509 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = d * h + g
510 | v_cmpx_class_f64 s[14:15], v[70:71], s[68:69]  # narrow exec to lanes whose input matches the class mask in s[68:69] (mask is set outside this chunk); the lane mask is also written to s[14:15]
511 | v_mov_b32 v70, v42  # store the result only for the lanes that passed the class test; other lanes keep their input
512 | v_mov_b32 v71, v43
513 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
514 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
515 | # fsqrt_r_sub2: replaces the double in v[72:73] with its square root on lanes 0-1, then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[46:49], s[14:15]; leaves exec = 3.
516 | fsqrt_r_sub2:
517 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
518 | v_rsq_f64 v[28:29], v[72:73]  # y = hardware estimate of 1/sqrt(x), x = v[72:73]
519 |
520 | # Improve initial approximation (can be skipped)
521 | #v_mul_f64 v[42:43], v[28:29], v[72:73]
522 | #v_mul_f64 v[48:49], v[28:29], -0.5
523 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
524 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
525 |
526 | v_mul_f64 v[42:43], v[28:29], v[72:73]  # g = y * x, first approximation of sqrt(x)
527 | v_mov_b32 v48, v28
528 | v_sub_u32 v49, v29, v84  # h = y with v84 subtracted from its high dword (presumably an exponent decrement giving y/2; v84 is set outside this chunk - confirm)
529 | v_mov_b32 v46, v28
530 | v_xor_b32 v47, v49, v82  # t = h with its high dword XORed with v82 (presumably a sign flip giving -h; v82 is set outside this chunk - confirm)
531 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = t * g + 0.5
532 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g * r + g
533 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h * r + h
534 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]  # d = x - g * g, residual of the current estimate
535 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
536 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = d * h + g
537 | v_cmpx_class_f64 s[14:15], v[72:73], s[68:69]  # narrow exec to lanes whose input matches the class mask in s[68:69] (mask is set outside this chunk); the lane mask is also written to s[14:15]
538 | v_mov_b32 v72, v42  # store the result only for the lanes that passed the class test; other lanes keep their input
539 | v_mov_b32 v73, v43
540 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
541 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
542 | # fsqrt_r_sub3: replaces the double in v[74:75] with its square root on lanes 0-1, then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[46:49], s[14:15]; leaves exec = 3.
543 | fsqrt_r_sub3:
544 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
545 | v_rsq_f64 v[28:29], v[74:75]  # y = hardware estimate of 1/sqrt(x), x = v[74:75]
546 |
547 | # Improve initial approximation (can be skipped)
548 | #v_mul_f64 v[42:43], v[28:29], v[74:75]
549 | #v_mul_f64 v[48:49], v[28:29], -0.5
550 | #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
551 | #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
552 |
553 | v_mul_f64 v[42:43], v[28:29], v[74:75]  # g = y * x, first approximation of sqrt(x)
554 | v_mov_b32 v48, v28
555 | v_sub_u32 v49, v29, v84  # h = y with v84 subtracted from its high dword (presumably an exponent decrement giving y/2; v84 is set outside this chunk - confirm)
556 | v_mov_b32 v46, v28
557 | v_xor_b32 v47, v49, v82  # t = h with its high dword XORed with v82 (presumably a sign flip giving -h; v82 is set outside this chunk - confirm)
558 | v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5  # r = t * g + 0.5
559 | v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]  # g = g * r + g
560 | v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]  # h = h * r + h
561 | v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]  # d = x - g * g, residual of the current estimate
562 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
563 | v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]  # result = d * h + g
564 | v_cmpx_class_f64 s[14:15], v[74:75], s[68:69]  # narrow exec to lanes whose input matches the class mask in s[68:69] (mask is set outside this chunk); the lane mask is also written to s[14:15]
565 | v_mov_b32 v74, v42  # store the result only for the lanes that passed the class test; other lanes keep their input
566 | v_mov_b32 v75, v43
567 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
568 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
569 | # fdiv_m_sub0: v[68:69] = v[68:69] / b on lanes 0-1, where b is built in place from the raw operand in v[28:29]; then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[48:49], v[80:81], s[14:15]; leaves exec = 3.
570 | fdiv_m_sub0:
571 | v_or_b32 v28, v28, v78  # low dword of b: OR in v78
572 | v_and_or_b32 v29, v29, v77, v79  # high dword of b: (v29 & v77) | v79 (presumably the divisor mask/exponent fix-up; v77-v79 are set outside this chunk - confirm)
573 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
574 | v_rcp_f64 v[48:49], v[28:29]  # r = hardware estimate of 1/b
575 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
576 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r * e + r (one refinement step)
577 | v_mul_f64 v[80:81], v[68:69], v[48:49]  # q = a * r, a = v[68:69]
578 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]  # d = a - b * q, remainder of the quotient estimate
579 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
580 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = d * r + q
581 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]  # hardware special-case fix-up (operands: quotient, denominator b, numerator a)
582 | v_cmpx_eq_f64 s[14:15], v[68:69], v[28:29]  # narrow exec to lanes where a == b; the lane mask is also written to s[14:15]
583 | v_mov_b32 v80, 0  # for those lanes force the result to {low = 0, high = v83} (presumably exactly 1.0; v83 is set outside this chunk - confirm)
584 | v_mov_b32 v81, v83
585 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
586 | v_mov_b32 v68, v80  # store the quotient back into v[68:69]
587 | v_mov_b32 v69, v81
588 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
589 | # fdiv_m_sub1: v[70:71] = v[70:71] / b on lanes 0-1, where b is built in place from the raw operand in v[28:29]; then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[48:49], v[80:81], s[14:15]; leaves exec = 3.
590 | fdiv_m_sub1:
591 | v_or_b32 v28, v28, v78  # low dword of b: OR in v78
592 | v_and_or_b32 v29, v29, v77, v79  # high dword of b: (v29 & v77) | v79 (presumably the divisor mask/exponent fix-up; v77-v79 are set outside this chunk - confirm)
593 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
594 | v_rcp_f64 v[48:49], v[28:29]  # r = hardware estimate of 1/b
595 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
596 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r * e + r (one refinement step)
597 | v_mul_f64 v[80:81], v[70:71], v[48:49]  # q = a * r, a = v[70:71]
598 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]  # d = a - b * q, remainder of the quotient estimate
599 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
600 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = d * r + q
601 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]  # hardware special-case fix-up (operands: quotient, denominator b, numerator a)
602 | v_cmpx_eq_f64 s[14:15], v[70:71], v[28:29]  # narrow exec to lanes where a == b; the lane mask is also written to s[14:15]
603 | v_mov_b32 v80, 0  # for those lanes force the result to {low = 0, high = v83} (presumably exactly 1.0; v83 is set outside this chunk - confirm)
604 | v_mov_b32 v81, v83
605 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
606 | v_mov_b32 v70, v80  # store the quotient back into v[70:71]
607 | v_mov_b32 v71, v81
608 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
609 | # fdiv_m_sub2: v[72:73] = v[72:73] / b on lanes 0-1, where b is built in place from the raw operand in v[28:29]; then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[48:49], v[80:81], s[14:15]; leaves exec = 3.
610 | fdiv_m_sub2:
611 | v_or_b32 v28, v28, v78  # low dword of b: OR in v78
612 | v_and_or_b32 v29, v29, v77, v79  # high dword of b: (v29 & v77) | v79 (presumably the divisor mask/exponent fix-up; v77-v79 are set outside this chunk - confirm)
613 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
614 | v_rcp_f64 v[48:49], v[28:29]  # r = hardware estimate of 1/b
615 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
616 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r * e + r (one refinement step)
617 | v_mul_f64 v[80:81], v[72:73], v[48:49]  # q = a * r, a = v[72:73]
618 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]  # d = a - b * q, remainder of the quotient estimate
619 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
620 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = d * r + q
621 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]  # hardware special-case fix-up (operands: quotient, denominator b, numerator a)
622 | v_cmpx_eq_f64 s[14:15], v[72:73], v[28:29]  # narrow exec to lanes where a == b; the lane mask is also written to s[14:15]
623 | v_mov_b32 v80, 0  # for those lanes force the result to {low = 0, high = v83} (presumably exactly 1.0; v83 is set outside this chunk - confirm)
624 | v_mov_b32 v81, v83
625 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
626 | v_mov_b32 v72, v80  # store the quotient back into v[72:73]
627 | v_mov_b32 v73, v81
628 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
629 | # fdiv_m_sub3: v[74:75] = v[74:75] / b on lanes 0-1, where b is built in place from the raw operand in v[28:29]; then jumps to the address in s[60:61]. Clobbers v[28:29], v[42:43], v[48:49], v[80:81], s[14:15]; leaves exec = 3.
630 | fdiv_m_sub3:
631 | v_or_b32 v28, v28, v78  # low dword of b: OR in v78
632 | v_and_or_b32 v29, v29, v77, v79  # high dword of b: (v29 & v77) | v79 (presumably the divisor mask/exponent fix-up; v77-v79 are set outside this chunk - confirm)
633 | s_setreg_b32 hwreg(mode, 2, 2), s67  # overwrite MODE bits [3:2] with s67 for the intermediate steps (presumably round-to-nearest; s67 is set outside this chunk - confirm)
634 | v_rcp_f64 v[48:49], v[28:29]  # r = hardware estimate of 1/b
635 | v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0  # e = 1 - b * r
636 | v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]  # r = r * e + r (one refinement step)
637 | v_mul_f64 v[80:81], v[74:75], v[48:49]  # q = a * r, a = v[74:75]
638 | v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]  # d = a - b * q, remainder of the quotient estimate
639 | s_setreg_b32 hwreg(mode, 2, 2), s66  # switch MODE bits [3:2] to s66 (the register stored as "rounding mode" at kernel exit) so only the final step below rounds with it
640 | v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]  # q = d * r + q
641 | v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]  # hardware special-case fix-up (operands: quotient, denominator b, numerator a)
642 | v_cmpx_eq_f64 s[14:15], v[74:75], v[28:29]  # narrow exec to lanes where a == b; the lane mask is also written to s[14:15]
643 | v_mov_b32 v80, 0  # for those lanes force the result to {low = 0, high = v83} (presumably exactly 1.0; v83 is set outside this chunk - confirm)
644 | v_mov_b32 v81, v83
645 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
646 | v_mov_b32 v74, v80  # store the quotient back into v[74:75]
647 | v_mov_b32 v75, v81
648 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
649 | # ismulh_r_sub: s[14:15] = high 64 bits of the signed product s[14:15] * s[38:39], computed on lane 0; then jumps to the address in s[60:61]. Clobbers v40, v42, v43, v45-v47, s[32:35], vcc; leaves exec = 3.
650 | ismulh_r_sub:
651 | s_mov_b64 exec, 1  # run the vector multiplies on lane 0 only
652 | v_mov_b32 v45, s14  # v45 = b.lo (b = s[14:15], a = s[38:39])
653 | v_mul_hi_u32 v40, s38, v45  # v40 = high 32 bits of a.lo * b.lo
654 | v_mov_b32 v47, s15  # v47 = b.hi
655 | v_mad_u64_u32 v[42:43], s[32:33], s38, v47, v[40:41]  # p1 = a.lo * b.hi + v[40:41]; NOTE(review): v41 is not written in this routine, so this looks correct only if v41 is 0 in lane 0 - confirm
656 | v_mov_b32 v40, v42  # v40 = low dword of p1
657 | v_mad_u64_u32 v[45:46], s[32:33], s39, v45, v[40:41]  # p2 = a.hi * b.lo + v[40:41] (same v41 assumption)
658 | v_mad_u64_u32 v[42:43], s[32:33], s39, v47, v[43:44]  # p3 = a.hi * b.hi + v[43:44]; v43 is the high dword of p1, v44 is not written here (presumably 0 in lane 0 - confirm)
659 | v_add_co_u32 v42, vcc, v42, v46  # unsigned high half = p3 + high dword of p2 ...
660 | v_addc_co_u32 v43, vcc, 0, v43, vcc  # ... with the carry propagated into the upper dword
661 | v_readlane_b32 s32, v42, 0  # copy the unsigned high half from lane 0 into s[32:33]
662 | v_readlane_b32 s33, v43, 0
663 | s_cmp_lt_i32 s15, 0  # signed correction 1: if b is negative ...
664 | s_cselect_b64 s[34:35], s[38:39], 0
665 | s_sub_u32 s32, s32, s34  # ... subtract a from the high half
666 | s_subb_u32 s33, s33, s35
667 | s_cmp_lt_i32 s39, 0  # signed correction 2: if a is negative ...
668 | s_cselect_b64 s[34:35], s[14:15], 0
669 | s_sub_u32 s14, s32, s34  # ... subtract b; the final result is written to s[14:15]
670 | s_subb_u32 s15, s33, s35
671 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
672 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
673 | # imulh_r_sub: s[14:15] = high 64 bits of the unsigned product s[14:15] * s[38:39], computed on lane 0; then jumps to the address in s[60:61]. Clobbers v40, v42, v43, v45-v47, s[32:33], vcc; leaves exec = 3.
674 | imulh_r_sub:
675 | s_mov_b64 exec, 1  # run the vector multiplies on lane 0 only
676 | v_mov_b32 v45, s38  # v45 = b.lo (a = s[14:15], b = s[38:39])
677 | v_mul_hi_u32 v40, s14, v45  # v40 = high 32 bits of a.lo * b.lo
678 | v_mov_b32 v47, s39  # v47 = b.hi
679 | v_mad_u64_u32 v[42:43], s[32:33], s14, v47, v[40:41]  # p1 = a.lo * b.hi + v[40:41]; NOTE(review): v41 is not written in this routine, so this looks correct only if v41 is 0 in lane 0 - confirm
680 | v_mov_b32 v40, v42  # v40 = low dword of p1
681 | v_mad_u64_u32 v[45:46], s[32:33], s15, v45, v[40:41]  # p2 = a.hi * b.lo + v[40:41] (same v41 assumption)
682 | v_mad_u64_u32 v[42:43], s[32:33], s15, v47, v[43:44]  # p3 = a.hi * b.hi + v[43:44]; v43 is the high dword of p1, v44 is not written here (presumably 0 in lane 0 - confirm)
683 | v_add_co_u32 v42, vcc, v42, v46  # high half = p3 + high dword of p2 ...
684 | v_addc_co_u32 v43, vcc, 0, v43, vcc  # ... with the carry propagated into the upper dword
685 | v_readlane_b32 s14, v42, 0  # copy the result from lane 0 into s[14:15]
686 | v_readlane_b32 s15, v43, 0
687 | s_mov_b64 exec, 3  # re-enable lanes 0 and 1
688 | s_setpc_b64 s[60:61]  # jump to the address in s[60:61] (presumably the caller's return address)
689 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/RandomX_OpenCL.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include "tests.h"
25 | #include "miner.h"
26 |
27 | // Parses `text` as an unsigned decimal integer no larger than `limit` (limit must be at least 9).
28 | // Unlike atoi(), an empty string, a sign, trailing characters or an out-of-range number is reported
29 | // as a failure instead of silently becoming 0 or triggering undefined behaviour.
30 | // Returns true and stores the number in `value` on success; `value` is left untouched on failure.
31 | static bool parse_number(const char* text, uint64_t limit, uint64_t& value)
32 | {
33 |     if (*text == '\0')
34 |         return false;
35 |
36 |     uint64_t result = 0;
37 |     for (; *text != '\0'; ++text)
38 |     {
39 |         if ((*text < '0') || (*text > '9'))
40 |             return false;
41 |
42 |         const uint64_t digit = static_cast<uint64_t>(*text - '0');
43 |         // Equivalent to "result * 10 + digit > limit", written so that it cannot overflow
44 |         if (result > (limit - digit) / 10)
45 |             return false;
46 |
47 |         result = result * 10 + digit;
48 |     }
49 |
50 |     value = result;
51 |     return true;
52 | }
53 |
54 | // Entry point. argv[1] selects the mode (--mine runs the mining benchmark, --test runs the self-tests);
55 | // the remaining arguments are options. Returns 0 on success or after printing the usage text,
56 | // 1 if the selected mode fails or the command line is invalid.
57 | int main(int argc, char** argv)
58 | {
59 |     if (argc < 2)
60 |     {
61 |         printf("Usage: %s --mine [--validate] [--platform_id N] [--device_id N] [--intensity N] [--portable] [--workers N] [--bfactor N] [--dataset_host]\n\n", argv[0]);
62 |         printf("platform_id 0 if you have only 1 OpenCL platform\n");
63 |         printf("device_id 0 if you have only 1 GPU\n");
64 |         printf("intensity number of scratchpads to allocate, if it's not set then as many as possible will be allocated.\n\n");
65 |         printf("portable use generic OpenCL code that works on all GPUs.\n\n");
66 |         printf("workers number of parallel workers per hash to run in portable mode. Can be 2,4,8,16, default is 8.\n\n");
67 |         printf("bfactor splits main loop into multiple sub-steps. Use it to improve screen responsiveness. Can be 0-10, default is 5.\n\n");
68 |         printf("dataset_host allocate dataset on host. This is required for 2 GB GPUs.\n\n");
69 |         printf("Examples:\n%s --mine --validate --intensity 1984\n", argv[0]);
70 |         return 0;
71 |     }
72 |
73 |     uint32_t platform_id = 0;
74 |     uint32_t device_id = 0;
75 |     size_t intensity = 0;
76 |     uint32_t start_nonce = 0;
77 |     uint32_t workers_per_hash = 8;
78 |     uint32_t bfactor = 5;
79 |     bool portable = false;
80 |     bool dataset_host_allocated = false;
81 |     bool validate = false;
82 |
83 |     // Largest values that the 32-bit options and the size_t intensity can hold
84 |     const uint64_t max_u32 = 0xFFFFFFFFULL;
85 |     const uint64_t max_size = static_cast<uint64_t>(~static_cast<size_t>(0));
86 |
87 |     for (int i = 1; i < argc; ++i)
88 |     {
89 |         const char* arg = argv[i];
90 |
91 |         if (strcmp(arg, "--portable") == 0)
92 |             portable = true;
93 |         else if (strcmp(arg, "--dataset_host") == 0)
94 |             dataset_host_allocated = true;
95 |         else if (strcmp(arg, "--validate") == 0)
96 |             validate = true;
97 |         else
98 |         {
99 |             // Numeric options: find the 32-bit destination, or note that this is --intensity (a size_t)
100 |             uint32_t* target32 = nullptr;
101 |             if (strcmp(arg, "--platform_id") == 0)
102 |                 target32 = &platform_id;
103 |             else if (strcmp(arg, "--device_id") == 0)
104 |                 target32 = &device_id;
105 |             else if (strcmp(arg, "--nonce") == 0)
106 |                 target32 = &start_nonce;
107 |             else if (strcmp(arg, "--workers") == 0)
108 |                 target32 = &workers_per_hash;
109 |             else if (strcmp(arg, "--bfactor") == 0)
110 |                 target32 = &bfactor;
111 |
112 |             const bool is_intensity = (strcmp(arg, "--intensity") == 0);
113 |             if (!target32 && !is_intensity)
114 |                 continue; // the mode switch and unrecognised arguments are skipped, as before
115 |
116 |             if (i + 1 >= argc)
117 |             {
118 |                 fprintf(stderr, "Missing value for %s\n", arg);
119 |                 return 1;
120 |             }
121 |
122 |             const char* text = argv[++i]; // consume the value so it is not re-examined as an option
123 |             uint64_t value = 0;
124 |             if (!parse_number(text, is_intensity ? max_size : max_u32, value))
125 |             {
126 |                 fprintf(stderr, "Invalid value '%s' for %s\n", text, arg);
127 |                 return 1;
128 |             }
129 |
130 |             if (is_intensity)
131 |                 intensity = static_cast<size_t>(value);
132 |             else
133 |                 *target32 = static_cast<uint32_t>(value);
134 |         }
135 |     }
136 |
137 |     if (strcmp(argv[1], "--mine") == 0)
138 |         return test_mining(platform_id, device_id, intensity, start_nonce, workers_per_hash, bfactor, portable, dataset_host_allocated, validate) ? 0 : 1;
139 |
140 |     if (strcmp(argv[1], "--test") == 0)
141 |         return tests(platform_id, device_id, intensity) ? 0 : 1;
142 |
143 |     fprintf(stderr, "Unknown mode '%s': the first argument must be --mine or --test\n", argv[1]);
144 |     return 1;
145 | }
146 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/RandomX_OpenCL.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | x64
7 |
8 |
9 | Release
10 | x64
11 |
12 |
13 |
14 | 15.0
15 | {32C205F3-317C-4EE6-8B8B-E2B6D87B8CC4}
16 | Win32Proj
17 | RandomXOpenCL
18 | 10.0
19 |
20 |
21 |
22 | Application
23 | true
24 | v142
25 | MultiByte
26 |
27 |
28 | Application
29 | false
30 | v142
31 | true
32 | MultiByte
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 | true
48 |
49 |
50 | false
51 | NativeRecommendedRules.ruleset
52 |
53 |
54 |
55 | Level4
56 | Disabled
57 | true
58 | true
59 | $(CUDA_PATH)\include
60 | true
61 | _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)
62 |
63 |
64 | Console
65 | true
66 | OpenCL.lib;%(AdditionalDependencies)
67 | $(CUDA_PATH)\lib\x64
68 |
69 |
70 |
71 |
72 | Level4
73 | MaxSpeed
74 | true
75 | true
76 | true
77 | true
78 | $(CUDA_PATH)\include
79 | true
80 | _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)
81 |
82 |
83 | Console
84 | true
85 | true
86 | true
87 | OpenCL.lib;%(AdditionalDependencies)
88 | $(CUDA_PATH)\lib\x64
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 | {3346a4ad-c438-4324-8b77-47a16452954b}
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 | Document
120 | clrxasm %(Identity) -o %(Filename).bin
121 | %(Filename).bin
122 | clrxasm %(Identity) -o %(Filename).bin
123 | %(Filename).bin
124 |
125 |
126 | Document
127 | clrxasm %(Identity) -o %(Filename).bin
128 | clrxasm %(Identity) -o %(Filename).bin
129 | %(Filename).bin
130 | %(Filename).bin
131 |
132 |
133 | Document
134 | clrxasm %(Identity) -o %(Filename).bin
135 | %(Filename).bin
136 |
137 |
138 |
139 |
140 |
141 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/RandomX_OpenCL.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {7789d323-c959-469b-addc-91336f3201fc}
14 |
15 |
16 | {181c0a82-12c6-4d82-8e7f-51fe2a1cee17}
17 |
18 |
19 |
20 |
21 | Source Files
22 |
23 |
24 | Source Files
25 |
26 |
27 | Source Files
28 |
29 |
30 | Source Files
31 |
32 |
33 |
34 |
35 | Header Files
36 |
37 |
38 | Header Files
39 |
40 |
41 | Header Files
42 |
43 |
44 | Header Files
45 |
46 |
47 | Header Files
48 |
49 |
50 | Header Files
51 |
52 |
53 |
54 |
55 | Source Files\CL
56 |
57 |
58 | Source Files\CL
59 |
60 |
61 | Source Files\CL
62 |
63 |
64 | Source Files\CL
65 |
66 |
67 | Source Files\CL
68 |
69 |
70 | Source Files\CL
71 |
72 |
73 | Source Files\CL
74 |
75 |
76 |
77 |
78 | Source Files\GCNASM
79 |
80 |
81 | Source Files\GCNASM
82 |
83 |
84 | Source Files\GCNASM
85 |
86 |
87 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/RandomX_OpenCL.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | --mine --validate --portable --workers 8 --bfactor 5 --intensity 3584
5 | WindowsLocalDebugger
6 |
7 |
8 | --mine --validate --intensity 1984
9 | WindowsLocalDebugger
10 |
11 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/definitions.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #pragma once
21 |
22 | #include
23 | #include "CL/randomx_constants.h"
24 | #include "CL/randomx_constants_jit.h"
25 |
26 | static const std::string AES_CL = "CL/aes.cl"; // OpenCL source path; miner.cpp passes it to ctx.Compile() for "base_kernels.bin"
27 | static const std::string CL_FILLAES1RX4_SCRATCHPAD = "fillAes1Rx4_scratchpad"; // kernel name; miner.cpp uses these names as keys into ctx.kernels
28 | static const std::string CL_FILLAES4RX4_ENTROPY = "fillAes4Rx4_entropy"; // kernel name
29 | static const std::string CL_HASHAES1RX4 = "hashAes1Rx4"; // kernel name
30 |
31 | static const std::string BLAKE2B_CL = "CL/blake2b.cl"; // OpenCL source path, compiled together with AES_CL in miner.cpp
32 | static const std::string CL_BLAKE2B_INITIAL_HASH = "blake2b_initial_hash"; // kernel name
33 | static const std::string CL_BLAKE2B_HASH_REGISTERS_32 = "blake2b_hash_registers_32"; // kernel name
34 | static const std::string CL_BLAKE2B_HASH_REGISTERS_64 = "blake2b_hash_registers_64"; // kernel name
35 | static const std::string CL_BLAKE2B_512_SINGLE_BLOCK_BENCH = "blake2b_512_single_block_bench"; // kernel name
36 | static const std::string CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH = "blake2b_512_double_block_bench"; // kernel name
37 |
38 | static const std::string RANDOMX_INIT_CL = "CL/randomx_init.cl"; // OpenCL source path; compiled by miner.cpp only when --portable is not used
39 | static const std::string CL_RANDOMX_INIT = "randomx_init"; // kernel name
40 |
41 | static const std::string RANDOMX_RUN_CL = "CL/randomx_run.cl"; // source path passed alongside the pre-assembled GCN binary in the non-portable path of miner.cpp
42 | static const std::string CL_RANDOMX_RUN = "randomx_run"; // kernel name
43 |
44 | static const std::string RANDOMX_VM_CL = "CL/randomx_vm.cl"; // OpenCL source path; compiled by miner.cpp only in --portable mode
45 | static const std::string CL_INIT_VM = "init_vm"; // kernel name
46 | static const std::string CL_EXECUTE_VM = "execute_vm"; // kernel name
47 |
48 | static uint8_t blockTemplate[] = { // 76-byte input blob hashed by miner.cpp; its validation code writes the 32-bit nonce at byte offset 39 of a copy (those four bytes are zero here)
49 | 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
50 | 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
51 | 0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca,
52 | 0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09
53 | };
54 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/makefile:
--------------------------------------------------------------------------------
1 | # Assembles the pre-built GCN kernel for every GPU family miner.cpp can select (it loads
2 | # randomx_run_gfx803.bin, randomx_run_gfx900.bin or randomx_run_gfx1010.bin at run time),
3 | # then builds the host benchmark executable.
4 | release: *.h *.cpp GCNASM/*.asm
5 | 	clrxasm GCNASM/randomx_run_gfx803.asm -o randomx_run_gfx803.bin
6 | 	clrxasm GCNASM/randomx_run_gfx900.asm -o randomx_run_gfx900.bin
7 | 	clrxasm GCNASM/randomx_run_gfx1010.asm -o randomx_run_gfx1010.bin
8 | 	g++ *.cpp -O3 -lOpenCL -lpthread ../RandomX/build/librandomx.a -o opencl_test
9 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/miner.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include "miner.h"
26 | #include "opencl_helpers.h"
27 | #include "definitions.h"
28 |
29 | #include "../RandomX/src/randomx.h"
30 | #include "../RandomX/src/configuration.h"
31 | #include "../RandomX/src/common.hpp"
32 |
33 | using namespace std::chrono;
34 |
35 | bool test_mining(uint32_t platform_id, uint32_t device_id, size_t intensity, uint32_t start_nonce, uint32_t workers_per_hash, uint32_t bfactor, bool portable, bool dataset_host_allocated, bool validate)
36 | {
37 | std::cout << "Initializing GPU #" << device_id << " on OpenCL platform #" << platform_id << std::endl << std::endl;
38 |
39 | OpenCLContext ctx;
40 | if (!ctx.Init(platform_id, device_id))
41 | {
42 | return false;
43 | }
44 |
45 | if (!ctx.Compile("base_kernels.bin",
46 | {
47 | AES_CL,
48 | BLAKE2B_CL
49 | },
50 | {
51 | CL_FILLAES1RX4_SCRATCHPAD,
52 | CL_FILLAES4RX4_ENTROPY,
53 | CL_HASHAES1RX4,
54 | CL_BLAKE2B_INITIAL_HASH,
55 | CL_BLAKE2B_HASH_REGISTERS_32,
56 | CL_BLAKE2B_HASH_REGISTERS_64,
57 | CL_BLAKE2B_512_SINGLE_BLOCK_BENCH,
58 | CL_BLAKE2B_512_DOUBLE_BLOCK_BENCH
59 | },
60 | "", COMPILE_CACHE_BINARY))
61 | {
62 | return false;
63 | }
64 |
65 | int gcn_version = 12;
66 |
67 | if (portable)
68 | {
69 | switch (workers_per_hash)
70 | {
71 | case 2:
72 | case 4:
73 | case 8:
74 | case 16:
75 | break;
76 |
77 | default:
78 | workers_per_hash = 8;
79 | break;
80 | }
81 |
82 | if (bfactor > 10)
83 | bfactor = 10;
84 |
85 | std::stringstream options;
86 | options << "-D WORKERS_PER_HASH=" << workers_per_hash << " -Werror";
87 | if (!ctx.Compile("randomx_vm.bin", { RANDOMX_VM_CL }, { CL_INIT_VM, CL_EXECUTE_VM }, options.str(), COMPILE_CACHE_BINARY))
88 | {
89 | return false;
90 | }
91 | }
92 | else
93 | {
94 | const char* gcn_binary = "randomx_run_gfx803.bin";
95 |
96 | std::vector t;
97 | std::transform(ctx.device_name.begin(), ctx.device_name.end(), std::back_inserter(t), [](char c) { return static_cast(std::toupper(c)); });
98 | if ((strcmp(t.data(), "GFX900") == 0) || (strcmp(t.data(), "GFX906") == 0))
99 | {
100 | gcn_binary = "randomx_run_gfx900.bin";
101 | gcn_version = 14;
102 | }
103 | else if ((strcmp(t.data(), "GFX1010") == 0) || (strcmp(t.data(), "GFX1011") == 0) || (strcmp(t.data(), "GFX1012") == 0))
104 | {
105 | gcn_binary = "randomx_run_gfx1010.bin";
106 | gcn_version = 15;
107 | }
108 |
109 | std::stringstream options;
110 | options << "-D GCN_VERSION=" << gcn_version;
111 | if (!ctx.Compile("randomx_init.bin", { RANDOMX_INIT_CL }, { CL_RANDOMX_INIT }, options.str(), ALWAYS_COMPILE))
112 | {
113 | return false;
114 | }
115 |
116 | options.str("");
117 | options << "-D RANDOMX_PROGRAM_ITERATIONS=" << RANDOMX_PROGRAM_ITERATIONS;
118 | if (!ctx.Compile(gcn_binary, { RANDOMX_RUN_CL }, { CL_RANDOMX_RUN }, options.str(), ALWAYS_USE_BINARY, ctx.elf_binary_flags))
119 | {
120 | return false;
121 | }
122 | }
123 |
124 | if (!intensity)
125 | intensity = std::min(ctx.device_max_alloc_size, ctx.device_global_mem_size) / RANDOMX_SCRATCHPAD_L3;
126 |
127 | intensity -= (intensity & 63);
128 |
129 | const size_t dataset_size = randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE;
130 | cl_int err;
131 | cl_mem dataset_gpu = nullptr;
132 | if (!dataset_host_allocated)
133 | {
134 | dataset_gpu = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, dataset_size, nullptr, &err);
135 | CL_CHECK_RESULT(clCreateBuffer);
136 | std::cout << "Allocated " << (dataset_size / 1048576.0) << " MB dataset on GPU" << std::endl;
137 | }
138 | std::cout << "Initializing dataset...";
139 |
140 | randomx_dataset *myDataset;
141 | bool large_pages_available = true;
142 | {
143 | auto t1 = high_resolution_clock::now();
144 |
145 | myDataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES);
146 | if (!myDataset)
147 | {
148 | std::cout << "\nCouldn't allocate dataset using large pages" << std::endl;
149 | myDataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT);
150 | large_pages_available = false;
151 | }
152 |
153 | char* dataset_memory = reinterpret_cast(randomx_get_dataset_memory(myDataset));
154 | bool read_ok = false;
155 |
156 | FILE* fp = fopen("dataset.bin", "rb");
157 | if (fp)
158 | {
159 | read_ok = (fread(dataset_memory, 1, randomx::DatasetSize, fp) == randomx::DatasetSize);
160 | fclose(fp);
161 | }
162 |
163 | if (!read_ok)
164 | {
165 | randomx_cache *myCache = randomx_alloc_cache((randomx_flags)(RANDOMX_FLAG_JIT | (large_pages_available ? RANDOMX_FLAG_LARGE_PAGES : 0)));
166 | if (!myCache)
167 | {
168 | std::cout << "\nCouldn't allocate cache using large pages" << std::endl;
169 | myCache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
170 | large_pages_available = false;
171 | }
172 |
173 | const char mySeed[] = "RandomX example seed";
174 | randomx_init_cache(myCache, mySeed, sizeof(mySeed));
175 |
176 | std::vector threads;
177 | for (uint32_t i = 0, n = std::thread::hardware_concurrency(); i < n; ++i)
178 | threads.emplace_back([myDataset, myCache, i, n]() { randomx_init_dataset(myDataset, myCache, (i * randomx_dataset_item_count()) / n, ((i + 1) * randomx_dataset_item_count()) / n - (i * randomx_dataset_item_count()) / n); });
179 |
180 | for (auto& t : threads)
181 | t.join();
182 |
183 | randomx_release_cache(myCache);
184 |
185 | fp = fopen("dataset.bin", "wb");
186 | if (fp)
187 | {
188 | fwrite(dataset_memory, 1, randomx::DatasetSize, fp);
189 | fclose(fp);
190 | }
191 | }
192 |
193 | if (!dataset_host_allocated)
194 | {
195 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, dataset_gpu, CL_TRUE, 0, dataset_size, randomx_get_dataset_memory(myDataset), 0, nullptr, nullptr);
196 | }
197 |
198 | std::cout << "done in " << (duration_cast(high_resolution_clock::now() - t1).count() / 1e9) << " seconds" << std::endl;
199 | }
200 |
201 | if (dataset_host_allocated)
202 | {
203 | dataset_gpu = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, randomx_get_dataset_memory(myDataset), &err);
204 | CL_CHECK_RESULT(clCreateBuffer);
205 | std::cout << "Using host-allocated " << (dataset_size / 1048576.0) << " MB dataset" << std::endl;
206 | }
207 |
208 | ALLOCATE_DEVICE_MEMORY(scratchpads_gpu, ctx, intensity * (RANDOMX_SCRATCHPAD_L3 + 64));
209 | std::cout << "Allocated " << intensity << " scratchpads\n" << std::endl;
210 |
211 | ALLOCATE_DEVICE_MEMORY(hashes_gpu, ctx, intensity * INITIAL_HASH_SIZE);
212 | ALLOCATE_DEVICE_MEMORY(entropy_gpu, ctx, intensity * ENTROPY_SIZE);
213 | ALLOCATE_DEVICE_MEMORY(vm_states_gpu, ctx, portable ? (intensity * VM_STATE_SIZE) : (intensity * REGISTERS_SIZE));
214 | ALLOCATE_DEVICE_MEMORY(rounding_gpu, ctx, intensity * sizeof(uint32_t));
215 | ALLOCATE_DEVICE_MEMORY(blocktemplate_gpu, ctx, intensity * sizeof(blockTemplate));
216 | ALLOCATE_DEVICE_MEMORY(intermediate_programs_gpu, ctx, portable ? 0 : (intensity * INTERMEDIATE_PROGRAM_SIZE));
217 | ALLOCATE_DEVICE_MEMORY(compiled_programs_gpu, ctx, portable ? 0 : (intensity * COMPILED_PROGRAM_SIZE));
218 |
219 | CL_CHECKED_CALL(clEnqueueWriteBuffer, ctx.queue, blocktemplate_gpu, CL_TRUE, 0, sizeof(blockTemplate), blockTemplate, 0, nullptr, nullptr);
220 |
221 | auto prev_time = high_resolution_clock::now();
222 |
223 | std::vector hashes, hashes_check;
224 | hashes.resize(intensity * 32);
225 | hashes_check.resize(intensity * 32);
226 |
227 | std::vector threads;
228 | std::atomic nonce_counter;
229 | bool cpu_limited = false;
230 |
231 | uint32_t failed_nonces = 0;
232 |
233 | cl_kernel kernel_blake2b_initial_hash = ctx.kernels[CL_BLAKE2B_INITIAL_HASH];
234 | if (!clSetKernelArgs(kernel_blake2b_initial_hash, hashes_gpu, blocktemplate_gpu, 0U))
235 | {
236 | return false;
237 | }
238 |
239 | cl_kernel kernel_fillaes1rx4_scratchpad = ctx.kernels[CL_FILLAES1RX4_SCRATCHPAD];
240 | if (!clSetKernelArgs(kernel_fillaes1rx4_scratchpad, hashes_gpu, scratchpads_gpu, static_cast(intensity)))
241 | {
242 | return false;
243 | }
244 |
245 | cl_kernel kernel_fillaes1rx4_entropy = ctx.kernels[CL_FILLAES4RX4_ENTROPY];
246 | if (!clSetKernelArgs(kernel_fillaes1rx4_entropy, hashes_gpu, entropy_gpu, static_cast(intensity)))
247 | {
248 | return false;
249 | }
250 |
251 | cl_kernel kernel_randomx_init, kernel_randomx_run;
252 | if (portable)
253 | {
254 | kernel_randomx_init = ctx.kernels[CL_INIT_VM];
255 | if (!clSetKernelArgs(kernel_randomx_init, entropy_gpu, vm_states_gpu))
256 | {
257 | return false;
258 | }
259 |
260 | kernel_randomx_run = ctx.kernels[CL_EXECUTE_VM];
261 | if (!clSetKernelArgs(kernel_randomx_run, vm_states_gpu, rounding_gpu, scratchpads_gpu, dataset_gpu, static_cast(intensity), static_cast(RANDOMX_PROGRAM_ITERATIONS >> bfactor), 1U, 1U))
262 | {
263 | return false;
264 | }
265 | }
266 | else
267 | {
268 | kernel_randomx_init = ctx.kernels[CL_RANDOMX_INIT];
269 | if (!clSetKernelArgs(kernel_randomx_init, entropy_gpu, vm_states_gpu, intermediate_programs_gpu, compiled_programs_gpu, static_cast(intensity)))
270 | {
271 | return false;
272 | }
273 |
274 | kernel_randomx_run = ctx.kernels[CL_RANDOMX_RUN];
275 |
276 | constexpr uint32_t rx_parameters =
277 | (PowerOf2(RANDOMX_SCRATCHPAD_L1) << 0) |
278 | (PowerOf2(RANDOMX_SCRATCHPAD_L2) << 5) |
279 | (PowerOf2(RANDOMX_SCRATCHPAD_L3) << 10) |
280 | (PowerOf2(RANDOMX_PROGRAM_ITERATIONS) << 15);
281 |
282 | if (!clSetKernelArgs(kernel_randomx_run, dataset_gpu, scratchpads_gpu, vm_states_gpu, rounding_gpu, compiled_programs_gpu, static_cast(intensity), rx_parameters))
283 | {
284 | return false;
285 | }
286 | }
287 |
288 | cl_kernel kernel_hashaes1rx4 = ctx.kernels[CL_HASHAES1RX4];
289 | if (!clSetKernelArgs(kernel_hashaes1rx4, scratchpads_gpu, vm_states_gpu, 192U, static_cast(portable ? VM_STATE_SIZE : REGISTERS_SIZE), static_cast(intensity)))
290 | {
291 | return false;
292 | }
293 |
294 | cl_kernel kernel_blake2b_hash_registers_32 = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_32];
295 | if (!clSetKernelArgs(kernel_blake2b_hash_registers_32, hashes_gpu, vm_states_gpu, static_cast(portable ? VM_STATE_SIZE : REGISTERS_SIZE)))
296 | {
297 | return false;
298 | }
299 |
300 | cl_kernel kernel_blake2b_hash_registers_64 = ctx.kernels[CL_BLAKE2B_HASH_REGISTERS_64];
301 | if (!clSetKernelArgs(kernel_blake2b_hash_registers_64, hashes_gpu, vm_states_gpu, static_cast(portable ? VM_STATE_SIZE : REGISTERS_SIZE)))
302 | {
303 | return false;
304 | }
305 |
306 | const size_t global_work_size = intensity;
307 | const size_t global_work_size4 = intensity * 4;
308 | const size_t global_work_size8 = intensity * 8;
309 | const size_t global_work_size16 = intensity * 16;
310 | const size_t global_work_size32 = intensity * 32;
311 | const size_t global_work_size64 = intensity * 64;
312 | const size_t local_work_size = 64;
313 | const size_t local_work_size32 = 32;
314 | const size_t local_work_size16 = 16;
315 | const uint32_t zero = 0;
316 |
317 | for (size_t nonce = start_nonce, k = 0; nonce < 0xFFFFFFFFUL; nonce += intensity, ++k)
318 | {
319 | auto validation_thread = [&nonce_counter, myDataset, &hashes_check, intensity, nonce, &large_pages_available]() {
320 | const randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES);
321 | randomx_vm *myMachine = randomx_create_vm((randomx_flags)(flags | (large_pages_available ? RANDOMX_FLAG_LARGE_PAGES : 0)), nullptr, myDataset);
322 |
323 | if (!myMachine && large_pages_available)
324 | {
325 | large_pages_available = false;
326 | myMachine = randomx_create_vm(flags, nullptr, myDataset);
327 | }
328 |
329 | uint8_t buf[sizeof(blockTemplate)];
330 | memcpy(buf, blockTemplate, sizeof(buf));
331 |
332 | for (;;)
333 | {
334 | const uint32_t i = nonce_counter.fetch_add(1);
335 | if (i >= intensity)
336 | break;
337 |
338 | *(uint32_t*)(buf + 39) = static_cast(nonce + i);
339 |
340 | randomx_calculate_hash(myMachine, buf, sizeof(buf), (hashes_check.data() + i * 32));
341 | }
342 | randomx_destroy_vm(myMachine);
343 | };
344 |
345 | if (validate)
346 | {
347 | nonce_counter = 0;
348 |
349 | const uint32_t n = std::max(std::thread::hardware_concurrency() / 2, 1U);
350 |
351 | threads.clear();
352 | for (uint32_t i = 0; i < n; ++i)
353 | threads.emplace_back(validation_thread);
354 | }
355 |
356 | auto cur_time = high_resolution_clock::now();
357 | if (k > 0)
358 | {
359 | const double dt = duration_cast(cur_time - prev_time).count() / 1e9;
360 |
361 | if (validate)
362 | {
363 | const size_t n = nonce - start_nonce;
364 | printf("%zu (%.3f%%) hashes validated successfully, %u (%.3f%%) hashes failed, %.0f h/s%s\n",
365 | n - failed_nonces,
366 | static_cast(n - failed_nonces) / n * 100.0,
367 | failed_nonces,
368 | static_cast(failed_nonces) / n * 100.0,
369 | intensity / dt,
370 | cpu_limited ? ", limited by CPU" : " "
371 | );
372 | }
373 | else
374 | {
375 | printf("%.0f h/s\t\r", intensity / dt);
376 | }
377 | }
378 | prev_time = cur_time;
379 |
380 | CL_CHECKED_CALL(clSetKernelArg, kernel_blake2b_initial_hash, 2, sizeof(uint32_t), &nonce);
381 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_initial_hash, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr);
382 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_fillaes1rx4_scratchpad, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr);
383 | CL_CHECKED_CALL(clEnqueueFillBuffer, ctx.queue, rounding_gpu, &zero, sizeof(zero), 0, intensity * sizeof(uint32_t), 0, nullptr, nullptr);
384 |
385 | for (size_t i = 0; i < RANDOMX_PROGRAM_COUNT; ++i)
386 | {
387 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_fillaes1rx4_entropy, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr);
388 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_init, 1, nullptr, portable ? &global_work_size8 : &global_work_size32, portable ? &local_work_size32 : &local_work_size, 0, nullptr, nullptr);
389 | if (portable)
390 | {
391 | //if (i == 0)
392 | //{
393 | // CL_CHECKED_CALL(clFinish, ctx.queue);
394 | // std::vector buf(intensity * VM_STATE_SIZE);
395 | // CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, vm_states_gpu, CL_TRUE, 0, buf.size(), buf.data(), 0, nullptr, nullptr);
396 | // FILE* fp;
397 | // fopen_s(&fp, "vm_states.bin", "wb");
398 | // fwrite(buf.data(), 1, buf.size(), fp);
399 | // fclose(fp);
400 | // return false;
401 | //}
402 | uint32_t first = 1;
403 | uint32_t last = 0;
404 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 6, sizeof(uint32_t), &first);
405 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 7, sizeof(uint32_t), &last);
406 | for (int j = 0, n = 1 << bfactor; j < n; ++j)
407 | {
408 | if (j == n - 1)
409 | {
410 | last = 1;
411 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 7, sizeof(uint32_t), &last);
412 | }
413 |
414 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, (workers_per_hash == 16) ? &global_work_size16 : &global_work_size8, (workers_per_hash == 16) ? &local_work_size32 : &local_work_size16, 0, nullptr, nullptr);
415 |
416 | if (j == 0)
417 | {
418 | first = 0;
419 | CL_CHECKED_CALL(clSetKernelArg, kernel_randomx_run, 6, sizeof(uint32_t), &first);
420 | }
421 | }
422 | }
423 | else
424 | {
425 | //if (i == 0)
426 | //{
427 | // CL_CHECKED_CALL(clFinish, ctx.queue);
428 | // std::vector buf(intensity * COMPILED_PROGRAM_SIZE);
429 | // CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, compiled_programs_gpu, CL_TRUE, 0, buf.size(), buf.data(), 0, nullptr, nullptr);
430 | // FILE* fp;
431 | // fopen_s(&fp, "compiled_program.bin", "wb");
432 | // fwrite(buf.data(), 1, buf.size(), fp);
433 | // fclose(fp);
434 | // return false;
435 | //}
436 | CL_CHECKED_CALL(clFinish, ctx.queue);
437 | if (gcn_version == 15)
438 | {
439 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, &global_work_size32, &local_work_size32, 0, nullptr, nullptr);
440 | }
441 | else
442 | {
443 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_randomx_run, 1, nullptr, &global_work_size64, &local_work_size, 0, nullptr, nullptr);
444 | }
445 | }
446 |
447 | if (i == RANDOMX_PROGRAM_COUNT - 1)
448 | {
449 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_hashaes1rx4, 1, nullptr, &global_work_size4, &local_work_size, 0, nullptr, nullptr);
450 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_hash_registers_32, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr);
451 | }
452 | else
453 | {
454 | CL_CHECKED_CALL(clEnqueueNDRangeKernel, ctx.queue, kernel_blake2b_hash_registers_64, 1, nullptr, &global_work_size, &local_work_size, 0, nullptr, nullptr);
455 | }
456 | }
457 |
458 | CL_CHECKED_CALL(clFinish, ctx.queue);
459 |
460 | if (validate)
461 | {
462 | CL_CHECKED_CALL(clEnqueueReadBuffer, ctx.queue, hashes_gpu, CL_TRUE, 0, intensity * 32, hashes.data(), 0, nullptr, nullptr);
463 |
464 | cpu_limited = nonce_counter.load() < intensity;
465 |
466 | for (auto& thread : threads)
467 | thread.join();
468 |
469 | if (memcmp(hashes.data(), hashes_check.data(), intensity * 32) != 0)
470 | {
471 | for (uint32_t i = 0; i < intensity * 32; i += 32)
472 | {
473 | if (memcmp(hashes.data() + i, hashes_check.data() + i, 32))
474 | {
475 | std::cerr << "CPU validation error, failing nonce = " << (nonce + i / 32) << std::endl;
476 | ++failed_nonces;
477 | }
478 | }
479 | }
480 | }
481 | }
482 |
483 | return true;
484 | }
485 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/miner.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #pragma once
21 |
// Runs the RandomX OpenCL hashing loop in batches of `intensity` nonces, beginning at
// `start_nonce` and advancing by `intensity` per batch while the nonce stays below 0xFFFFFFFF.
//
// platform_id, device_id - OpenCL platform / GPU device indices (presumably forwarded to
//                          OpenCLContext::Init - confirm in miner.cpp)
// intensity              - number of hashes computed per batch; also the base global work size
// start_nonce            - nonce of the first hash in the first batch
// workers_per_hash       - portable path only: 16 launches randomx_run with intensity * 16 work
//                          items in groups of 32, any other value with intensity * 8 in groups of 16
// bfactor                - portable path only: each program run is split into (1 << bfactor)
//                          kernel launches of (RANDOMX_PROGRAM_ITERATIONS >> bfactor) iterations
// portable               - true runs the portable randomx_run kernel; false runs the init kernel
//                          that emits compiled programs plus the matching run kernel
// dataset_host_allocated - NOTE(review): not used in the part of the body reviewed here;
//                          presumably controls where the dataset buffer lives - confirm
// validate               - when true, CPU threads recompute every hash of the batch with
//                          randomx_calculate_hash and mismatching nonces are reported and counted
//
// Returns false as soon as a setup step fails, true once the nonce range is exhausted.
22 | bool test_mining(uint32_t platform_id, uint32_t device_id, size_t intensity, uint32_t start_nonce, uint32_t workers_per_hash, uint32_t bfactor, bool portable, bool dataset_host_allocated, bool validate);
23 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/opencl_helpers.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 | #include
23 | #include "opencl_helpers.h"
24 |
25 | OpenCLContext::~OpenCLContext()
26 | {
27 | for (auto& k : kernels)
28 | clReleaseKernel(k.second);
29 |
30 | clReleaseCommandQueue(queue);
31 | clReleaseContext(context);
32 | }
33 |
34 | bool OpenCLContext::Init(uint32_t platform_id, uint32_t device_id)
35 | {
36 | cl_int err;
37 |
38 | cl_platform_id platforms[4];
39 | cl_uint num_platforms;
40 | CL_CHECKED_CALL(clGetPlatformIDs, 4, platforms, &num_platforms);
41 |
42 | if (platform_id >= num_platforms)
43 | {
44 | std::cerr << "Invalid platform ID (" << platform_id << "), " << num_platforms << " OpenCL platforms available" << std::endl;
45 | return false;
46 | }
47 |
48 | cl_device_id devices[32];
49 | cl_uint num_devices;
50 | CL_CHECKED_CALL(clGetDeviceIDs, platforms[platform_id], CL_DEVICE_TYPE_GPU, 32, devices, &num_devices);
51 |
52 | if (device_id >= num_devices)
53 | {
54 | std::cerr << "Invalid device ID (" << device_id << "), " << num_devices << " OpenCL GPU devices available" << std::endl;
55 | return false;
56 | }
57 |
58 | device = devices[device_id];
59 |
60 | size_t size;
61 |
62 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_NAME, 0, nullptr, &size);
63 | device_name.resize(size);
64 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_NAME, size, device_name.data(), nullptr);
65 |
66 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_global_mem_size), &device_global_mem_size, nullptr);
67 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, nullptr);
68 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(device_freq), &device_freq, nullptr);
69 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(device_compute_units), &device_compute_units, nullptr);
70 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(device_max_alloc_size), &device_max_alloc_size, nullptr);
71 |
72 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VENDOR, 0, nullptr, &size);
73 | device_vendor.resize(size);
74 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VENDOR, size, device_vendor.data(), nullptr);
75 |
76 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, 0, nullptr, &size);
77 | device_version.resize(size);
78 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, size, device_version.data(), nullptr);
79 |
80 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DRIVER_VERSION, 0, nullptr, &size);
81 | device_driver_version.resize(size);
82 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DRIVER_VERSION, size, device_driver_version.data(), nullptr);
83 |
84 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, nullptr, &size);
85 | device_extensions.resize(size);
86 | CL_CHECKED_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, size, device_extensions.data(), nullptr);
87 |
88 | std::cout << "Device name: " << device_name.data() << std::endl;
89 | std::cout << "Device vendor: " << device_vendor.data() << std::endl;
90 | std::cout << "Global memory: " << (device_global_mem_size >> 20) << " MB" << std::endl;
91 | std::cout << "Local memory: " << (device_local_mem_size >> 10) << " KB" << std::endl;
92 | std::cout << "Clock speed: " << device_freq << " MHz" << std::endl;
93 | std::cout << "Compute units: " << device_compute_units << std::endl;
94 | std::cout << "OpenCL version: " << device_version.data() << std::endl;
95 | std::cout << "Driver version: " << device_driver_version.data() << std::endl;
96 | std::cout << "Extensions: " << device_extensions.data() << std::endl << std::endl;
97 |
98 | context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
99 | CL_CHECK_RESULT(clCreateContext);
100 |
101 | queue = clCreateCommandQueue(context, device, 0, &err);
102 | CL_CHECK_RESULT(clCreateCommandQueue);
103 |
104 | return true;
105 | }
106 |
107 | bool OpenCLContext::Compile(const char* binary_name, const std::initializer_list& source_files, const std::initializer_list& kernel_names, const std::string& options, CachingParameters caching, uint32_t force_elf_binary_flags)
108 | {
109 | std::vector source;
110 | source.reserve(source_files.size());
111 | for (const std::string& source_file : source_files)
112 | {
113 | std::ifstream f(source_file);
114 | if (!f.is_open())
115 | {
116 | std::cerr << "Couldn't open " << source_file << std::endl;
117 | return false;
118 | }
119 | source.emplace_back((std::istreambuf_iterator(f)), std::istreambuf_iterator());
120 | }
121 |
122 | std::vector data;
123 | data.reserve(source_files.size());
124 | for (const std::string& s : source)
125 | data.emplace_back(s.data());
126 |
127 | const char** p = data.data();
128 | cl_int err;
129 |
130 | cl_program program = nullptr;
131 | bool created_with_binary = false;
132 | if (caching != ALWAYS_COMPILE)
133 | {
134 | std::ifstream f(binary_name, std::ios::binary);
135 | if (f.is_open())
136 | {
137 | std::vector buf;
138 | buf.insert(buf.begin(), std::istreambuf_iterator(f), std::istreambuf_iterator());
139 |
140 | const size_t data_length = buf.size();
141 | if (force_elf_binary_flags)
142 | *(uint32_t*)(buf.data() + 0x30) = force_elf_binary_flags;
143 |
144 | const unsigned char* binary_data = reinterpret_cast(buf.data());
145 |
146 | program = clCreateProgramWithBinary(context, 1, &device, &data_length, &binary_data, nullptr, &err);
147 | CL_CHECK_RESULT(clCreateProgramWithBinary);
148 |
149 | created_with_binary = true;
150 | }
151 | else if (caching == ALWAYS_USE_BINARY)
152 | {
153 | std::cerr << "Couldn't open " << binary_name << std::endl;
154 | return false;
155 | }
156 | }
157 |
158 | if (!program)
159 | {
160 | if (caching == ALWAYS_USE_BINARY)
161 | {
162 | std::cerr << "Couldn't create program from binary " << binary_name << std::endl;
163 | return false;
164 | }
165 | program = clCreateProgramWithSource(context, static_cast(source_files.size()), p, nullptr, &err);
166 | CL_CHECK_RESULT(clCreateProgramWithSource);
167 | }
168 |
169 | std::cout << "Compiling " << binary_name << "...";
170 | std::string s = "-Werror -I CL";
171 | if (!options.empty())
172 | {
173 | s += ' ';
174 | s += options;
175 | }
176 | err = clBuildProgram(program, 1, &device, s.c_str(), nullptr, nullptr);
177 | if (err != CL_SUCCESS)
178 | {
179 | std::cerr << "clBuildProgram failed: error " << err << std::endl;
180 |
181 | size_t size;
182 | CL_CHECKED_CALL(clGetProgramBuildInfo, program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &size);
183 |
184 | std::vector build_log;
185 | build_log.resize(size);
186 | CL_CHECKED_CALL(clGetProgramBuildInfo, program, device, CL_PROGRAM_BUILD_LOG, size, build_log.data(), nullptr);
187 |
188 | std::cerr << build_log.data() << std::endl;
189 |
190 | return false;
191 | }
192 | std::cout << "done" << std::endl;
193 |
194 | size_t bin_size;
195 | CL_CHECKED_CALL(clGetProgramInfo, program, CL_PROGRAM_BINARY_SIZES, sizeof(bin_size), &bin_size, nullptr);
196 |
197 | std::vector binary_data(bin_size);
198 | char* tmp[1] = { binary_data.data() };
199 | CL_CHECKED_CALL(clGetProgramInfo, program, CL_PROGRAM_BINARIES, sizeof(tmp), tmp, NULL);
200 |
201 | elf_binary_flags = (bin_size >= 0x34) ? *(uint32_t*)(binary_data.data() + 0x30) : 0;
202 |
203 | if (!created_with_binary)
204 | {
205 | std::ofstream f(binary_name, std::ios::binary);
206 | f.write(tmp[0], bin_size);
207 | f.close();
208 | }
209 |
210 | for (const std::string& name : kernel_names)
211 | {
212 | cl_kernel kernel = clCreateKernel(program, name.c_str(), &err);
213 | CL_CHECK_RESULT(clCreateKernel);
214 |
215 | kernels.emplace(name, kernel);
216 | }
217 |
218 | CL_CHECKED_CALL(clReleaseProgram, program);
219 | return true;
220 | }
221 |
--------------------------------------------------------------------------------
/RandomX_OpenCL/opencl_helpers.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2019 SChernykh
3 |
4 | This file is part of RandomX OpenCL.
5 |
6 | RandomX OpenCL is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | RandomX OpenCL is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with RandomX OpenCL. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #pragma once
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include