├── keccak ├── keccak256.h └── keccak256.cl ├── settings.0.ini ├── requirements.txt ├── pathutils └── __init__.py ├── README.md ├── LICENSE ├── .vscode └── launch.json ├── kernels ├── gen_pub_key.cl └── gen_eth_addr.cl ├── .gitignore ├── secp256k1 ├── inc_vendor.h ├── inc_ecc_secp256k1.h ├── inc_ecc_secp256k1.cl └── inc_types.h ├── pygeneth.py └── pyvanityeth.py /keccak/keccak256.h: -------------------------------------------------------------------------------- 1 | #define KECCAK256_HASH_LEN 32 2 | #define KECCAK256_BLOCKSIZE (200-KECCAK256_HASH_LEN*2) 3 | #define KECCAK256_STATE_LEN 25 -------------------------------------------------------------------------------- /settings.0.ini: -------------------------------------------------------------------------------- 1 | [settings] 2 | CL_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64 3 | CUDA_DLL_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ecdsa==0.19.1 2 | Mako==1.3.10 3 | MarkupSafe==3.0.2 4 | numpy==2.3.3 5 | platformdirs==4.4.0 6 | pycryptodome==3.23.0 7 | pycuda==2025.1.2 8 | python-decouple==3.8 9 | pytools==2025.2.4 10 | siphash24==1.8 11 | six==1.17.0 12 | typing_extensions==4.15.0 13 | -------------------------------------------------------------------------------- /pathutils/__init__.py: -------------------------------------------------------------------------------- 1 | from decouple import config 2 | import os 3 | 4 | CL_PATH = config('CL_PATH', default='') 5 | if len(CL_PATH) > 0: 6 | os.environ['PATH'] += ';'+CL_PATH 7 | CUDA_DLL_PATH = config('CUDA_DLL_PATH', default='') 8 | if len(CUDA_DLL_PATH) > 0: 9 | os.add_dll_directory(CUDA_DLL_PATH) 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gen_eth 2 | 3 | ## Usage 4 | 5 | Generate vanity Ethereum address with prefix 0x1a2b 6 | 7 | ```bash 8 | python .\pyvanityeth.py --prefix 0x1a2b 9 | ``` 10 | 11 | To display a help message listing the available command line arguments, run: 12 | 13 | ```bash 14 | python .\pyvanityeth.py -h 15 | ``` 16 | 17 | ## Installation 18 | 19 | ```bash 20 | python -m pip install pycuda 21 | python -m pip install numpy 22 | python -m pip install python-decouple 23 | python -m pip install pycryptodome 24 | python -m pip install ecdsa 25 | ``` 26 | 27 | ## Configuration 28 | 29 | Configuration is done through a file called `settings.ini`. 
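The repository tracks a template named `settings.0.ini` (the real `settings.ini` is listed in `.gitignore`), so presumably you copy the template and adjust the paths for your machine. A minimal sketch, assuming that intent, on Windows:

```bash
copy settings.0.ini settings.ini
```
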
Example: 30 | 31 | ```ini 32 | [settings] 33 | CL_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64 34 | CUDA_DLL_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64 35 | ``` 36 | 37 | - CL_PATH - C++ compiler path 38 | - CUDA_DLL_PATH - CUDA Toolkit DLLs path -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vitaly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "pygeneth", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "pygeneth.py", 12 | "console": "integratedTerminal", 13 | "justMyCode": true 14 | }, 15 | { 16 | "name": "pyvanityeth", 17 | "type": "debugpy", 18 | "request": "launch", 19 | "program": "pyvanityeth.py", 20 | "console": "integratedTerminal", 21 | "justMyCode": true, 22 | "args": [ 23 | "--verbose", "--verify", 24 | "--output", "./output/l_1a2b.txt", 25 | "--prefix", "1a2b", 26 | "--blocks", "5000", 27 | "--blockSize", "256", 28 | "--blockIterations", "20" 29 | ] 30 | } 31 | ] 32 | } -------------------------------------------------------------------------------- /keccak/keccak256.cl: -------------------------------------------------------------------------------- 1 | #include "keccak256.h" 2 | 3 | #define NUM_ROUNDS 24 4 | 5 | DECLSPEC u64 rotl64(u64 x, int i) { 6 | return ((0U + x) << i) | (x >> ((64 - i) & 63)); 7 | } 8 | 9 | DECLSPEC void keccak256_absorb(PRIVATE_AS u64* state, PRIVATE_AS const int* rotation) { // u64 state[5 * 5] 10 | u8 r = 1; // LFSR 11 | for (int i = 0; i < NUM_ROUNDS; i++) { 12 | // Theta step 13 | u64 c[5] = {}; 14 | for (int x = 0; x < 5; x++) { 15 | for (int y = 0; y < 5; y++) 16 | c[x] ^= state[x + y + (y << 2)]; // x * 5 + y 17 | } 18 | for (int x = 0; x < 5; x++) { 19 | u64 d = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1); 20 | for (int y = 0; y < 5; y++) 21 | state[x + y + (y << 2)] ^= d; 22 | } 23 | // Rho and pi steps 24 | u64 b[5][5]; 25 | for (int x = 0; x < 5; x++) { 26 | for (int y = 0; y < 5; y++) 27 | b[y][(x * 2 + y * 3) % 5] = rotl64(state[x + y + (y << 2)], rotation[(x << 2) + x + y]); 28 | } 29 | // Chi step 30 | for (int x = 0; x < 5; x++) { 31 | for (int y = 0; y < 5; y++) 32 | state[x + y + (y << 2)] = b[x][y] ^ (~b[(x + 1) % 5][y] & b[(x + 2) % 5][y]); 33 | } 34 | // Iota step 35 | for (int j = 0; j < 7; j++) { 36 | state[0] ^= (u64)(r & 1) << ((1 << j) - 1); 37 | r = (u8)((r << 1) ^ ((r >> 7) * 0x171)); 38 | } 39 | } 40 | } 41 | 42 | DECLSPEC void keccak256_update_state(PRIVATE_AS u64* state, PRIVATE_AS const u8* msg, PRIVATE_AS const u32 len) // u64 state[5 * 5] 43 | { 44 | const int rotation[25] = { 45 | 0, 36, 3, 41, 18, 46 | 1, 44, 10, 45, 2, 47 | 62, 6, 43, 15, 61, 48 | 28, 55, 25, 21, 56, 49 | 27, 20, 39, 8, 14 50 | }; 51 | u32 blockOff = 0; 52 | for (u32 i = 0; i < len; i++) { 53 | u32 j = blockOff >> 3; 54 | u32 xj = j % 5; 55 | u32 yj = j / 5; 56 | state[xj + yj + (yj << 2)] ^= (u64)(msg[i]) << ((blockOff & 7) << 3); 57 | blockOff++; 58 | if (blockOff == KECCAK256_BLOCKSIZE) { 59 | keccak256_absorb(state, rotation); 60 | blockOff = 0; 61 | } 62 | } 63 | // Final block and padding 64 | { 65 | int i = blockOff >> 3; 66 | u32 xi = i % 5; 67 | u32 yi = i / 5; 68 | state[xi + yi + (yi << 2)] ^= UINT64_C(0x01) << ((blockOff & 7) << 3); 69 | blockOff = KECCAK256_BLOCKSIZE - 1; 70 | int j = blockOff >> 3; 71 | u32 xj = j % 5; 72 | u32 yj = j / 5; 73 | state[xj + yj + (yj << 2)] ^= UINT64_C(0x80) << ((blockOff & 7) << 3); 74 | keccak256_absorb(state, rotation); 75 | } 76 | } 77 | 78 | DECLSPEC void keccak256_get_hash(GLOBAL_AS u8* r, GLOBAL_AS const u8* msg, GLOBAL_AS const u32 len) 79 | { 80 | u64 state[25] = {}; 81 | keccak256_update_state(state, (u8*)msg, len); 82 | // Uint64 array to bytes in little endian 83 | for (int i = 0; i < KECCAK256_HASH_LEN; i++) { 84 | int j = i >> 3; 85 | u32 xj = j % 5; 86 | u32 yj = j / 
5; 87 | r[i] = (u8)(state[xj + yj + (yj << 2)] >> ((i & 7) << 3)); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /kernels/gen_pub_key.cl: -------------------------------------------------------------------------------- 1 | // @author: Vitaly | github.com/optinsoft 2 | 3 | // little endian to big endian 4 | DECLSPEC u32 l2be(u32 x) { 5 | return (x & 0xff) << 24 | (x & 0xff00) << 8 | (x & 0xff0000) >> 8 | (x & 0xff000000) >> 24; 6 | } 7 | 8 | __global__ void genPubKey( 9 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, 10 | GLOBAL_AS u32 *r4, GLOBAL_AS u32 *r5, GLOBAL_AS u32 *r6, GLOBAL_AS u32 *r7, 11 | GLOBAL_AS u32 *r8, GLOBAL_AS u32 *r9, GLOBAL_AS u32 *r10, GLOBAL_AS u32 *r11, 12 | GLOBAL_AS u32 *r12, GLOBAL_AS u32 *r13, GLOBAL_AS u32 *r14, GLOBAL_AS u32 *r15, 13 | GLOBAL_AS u32* h0, GLOBAL_AS u32* h1, GLOBAL_AS u32* h2, GLOBAL_AS u32* h3, 14 | GLOBAL_AS u32* h4, GLOBAL_AS u32* h5, GLOBAL_AS u32* h6, GLOBAL_AS u32* h7, 15 | GLOBAL_AS const u32 *k0, GLOBAL_AS const u32 *k1, GLOBAL_AS const u32 *k2, GLOBAL_AS const u32 *k3, 16 | GLOBAL_AS const u32 *k4, GLOBAL_AS const u32 *k5, GLOBAL_AS const u32 *k6, GLOBAL_AS const u32 *k7) 17 | { 18 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 19 | u32 k_local[PRIVATE_KEY_LENGTH]; 20 | secp256k1_t g_xy_local; 21 | u32 return_value; 22 | 23 | int i = threadIdx.x; 24 | 25 | g_local[0] = SECP256K1_G_STRING0; 26 | g_local[1] = SECP256K1_G_STRING1; 27 | g_local[2] = SECP256K1_G_STRING2; 28 | g_local[3] = SECP256K1_G_STRING3; 29 | g_local[4] = SECP256K1_G_STRING4; 30 | g_local[5] = SECP256K1_G_STRING5; 31 | g_local[6] = SECP256K1_G_STRING6; 32 | g_local[7] = SECP256K1_G_STRING7; 33 | g_local[8] = SECP256K1_G_STRING8; 34 | 35 | // global to local 36 | k_local[7] = k0[i]; 37 | k_local[6] = k1[i]; 38 | k_local[5] = k2[i]; 39 | k_local[4] = k3[i]; 40 | k_local[3] = k4[i]; 41 | k_local[2] = k5[i]; 42 | k_local[1] = k6[i]; 43 | k_local[0] = k7[i]; 44 | 45 | return_value = parse_public(&g_xy_local, g_local); 46 | if (return_value != 0) { 47 | return; 48 | } 49 | 50 | u32 x[8]; 51 | u32 y[8]; 52 | point_mul_xy (x, y, k_local, &g_xy_local); 53 | 54 | // local to global 55 | r7[i] = x[0]; 56 | r6[i] = x[1]; 57 | r5[i] = x[2]; 58 | r4[i] = x[3]; 59 | r3[i] = x[4]; 60 | r2[i] = x[5]; 61 | r1[i] = x[6]; 62 | r0[i] = x[7]; 63 | r15[i] = y[0]; 64 | r14[i] = y[1]; 65 | r13[i] = y[2]; 66 | r12[i] = y[3]; 67 | r11[i] = y[4]; 68 | r10[i] = y[5]; 69 | r9[i] = y[6]; 70 | r8[i] = y[7]; 71 | 72 | // keccak256 73 | u64 keccak_state[KECCAK256_STATE_LEN] = {}; 74 | u32 w[16]; 75 | 76 | w[7] = l2be(x[0]); 77 | w[6] = l2be(x[1]); 78 | w[5] = l2be(x[2]); 79 | w[4] = l2be(x[3]); 80 | w[3] = l2be(x[4]); 81 | w[2] = l2be(x[5]); 82 | w[1] = l2be(x[6]); 83 | w[0] = l2be(x[7]); 84 | w[15] = l2be(y[0]); 85 | w[14] = l2be(y[1]); 86 | w[13] = l2be(y[2]); 87 | w[12] = l2be(y[3]); 88 | w[11] = l2be(y[4]); 89 | w[10] = l2be(y[5]); 90 | w[9] = l2be(y[6]); 91 | w[8] = l2be(y[7]); 92 | 93 | keccak256_update_state(keccak_state, (u8*)w, 64); 94 | 95 | h0[i] = l2be((u32)keccak_state[0]); 96 | h1[i] = l2be((u32)(keccak_state[0] >> 32)); 97 | h2[i] = l2be((u32)keccak_state[1]); 98 | h3[i] = l2be((u32)(keccak_state[1] >> 32)); 99 | h4[i] = l2be((u32)keccak_state[2]); 100 | h5[i] = l2be((u32)(keccak_state[2] >> 32)); 101 | h6[i] = l2be((u32)keccak_state[3]); 102 | h7[i] = l2be((u32)(keccak_state[3] >> 32)); 103 | } -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | output/ 2 | temp/ 3 | settings.ini 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | kernel.cl 167 | -------------------------------------------------------------------------------- /secp256k1/inc_vendor.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_VENDOR_H 7 | #define INC_VENDOR_H 8 | 9 | #if defined HC_CPU_OPENCL_EMU_H 10 | #define IS_NATIVE 11 | #elif defined __CUDACC__ 12 | #define IS_CUDA 13 | #elif defined __HIPCC__ 14 | #define IS_HIP 15 | #elif defined __METAL__ || defined __METAL_MACOS__ 16 | #define IS_METAL 17 | #else 18 | #define IS_OPENCL 19 | #endif 20 | 21 | #if defined IS_METAL 22 | #include 23 | 24 | using namespace metal; 25 | #endif 26 | 27 | #if defined IS_NATIVE 28 | #define CONSTANT_VK 29 | #define CONSTANT_AS 30 | #define GLOBAL_AS 31 | #define LOCAL_VK 32 | #define LOCAL_AS 33 | #define PRIVATE_AS 34 | #define KERNEL_FQ 35 | #elif defined IS_CUDA 36 | #define CONSTANT_VK __constant__ 37 | #define CONSTANT_AS 38 | #define GLOBAL_AS 39 | #define LOCAL_VK __shared__ 40 | #define LOCAL_AS 41 | #define PRIVATE_AS 42 | #define KERNEL_FQ extern "C" __global__ 43 | #elif defined IS_HIP 44 | #define CONSTANT_VK __constant__ 45 | #define CONSTANT_AS 46 | #define GLOBAL_AS 47 | #define LOCAL_VK __shared__ 48 | #define LOCAL_AS 49 | #define PRIVATE_AS 50 | #define KERNEL_FQ extern "C" __global__ 51 | #elif defined IS_METAL 52 | #define CONSTANT_VK constant 53 | #define CONSTANT_AS constant 54 | #define GLOBAL_AS device 55 | #define LOCAL_VK threadgroup 56 | #define LOCAL_AS threadgroup 57 | #define PRIVATE_AS thread 58 | #define KERNEL_FQ kernel 59 | #elif defined IS_OPENCL 60 | #define CONSTANT_VK __constant 61 | #define CONSTANT_AS __constant 62 | #define GLOBAL_AS __global 63 | #define LOCAL_VK __local 64 | #define LOCAL_AS __local 65 | #define PRIVATE_AS 66 | #define KERNEL_FQ __kernel 67 | #endif 68 | 69 | #ifndef MAYBE_UNUSED 70 | #define MAYBE_UNUSED 71 | #endif 72 | 73 | /** 74 | * device type 75 | */ 76 | 77 | #define DEVICE_TYPE_CPU 2 78 | #define DEVICE_TYPE_GPU 4 79 | #define DEVICE_TYPE_ACCEL 8 80 | 81 | #if DEVICE_TYPE == DEVICE_TYPE_CPU 82 | #define IS_CPU 83 | #elif DEVICE_TYPE == DEVICE_TYPE_GPU 84 | #define IS_GPU 85 | #elif DEVICE_TYPE == DEVICE_TYPE_ACCEL 86 | #define IS_ACCEL 87 | #endif 88 | 89 | /** 90 | * vendor specific 91 | */ 92 | 93 | #if VENDOR_ID == (1 << 
0) 94 | #define IS_AMD 95 | #elif VENDOR_ID == (1 << 1) 96 | #define IS_APPLE 97 | #define IS_GENERIC 98 | #elif VENDOR_ID == (1 << 2) 99 | #define IS_INTEL_BEIGNET 100 | #define IS_GENERIC 101 | #elif VENDOR_ID == (1 << 3) 102 | #define IS_INTEL_SDK 103 | #define IS_GENERIC 104 | #elif VENDOR_ID == (1 << 4) 105 | #define IS_MESA 106 | #define IS_GENERIC 107 | #elif VENDOR_ID == (1 << 5) 108 | #define IS_NV 109 | #elif VENDOR_ID == (1 << 6) 110 | #define IS_POCL 111 | #define IS_GENERIC 112 | #elif VENDOR_ID == (1 << 8) 113 | #define IS_AMD_USE_HIP 114 | #else 115 | #define IS_GENERIC 116 | #endif 117 | 118 | #if defined IS_AMD && HAS_VPERM == 1 119 | #define IS_ROCM 120 | #endif 121 | 122 | #define LOCAL_MEM_TYPE_LOCAL 1 123 | #define LOCAL_MEM_TYPE_GLOBAL 2 124 | 125 | #if LOCAL_MEM_TYPE == LOCAL_MEM_TYPE_LOCAL 126 | #define REAL_SHM 127 | #endif 128 | 129 | // So far, only used by -m 22100 and only affects NVIDIA on OpenCL. CUDA seems to work fine. 130 | #ifdef FORCE_DISABLE_SHM 131 | #undef REAL_SHM 132 | #endif 133 | 134 | #ifdef REAL_SHM 135 | #define SHM_TYPE LOCAL_AS 136 | #else 137 | #define SHM_TYPE CONSTANT_AS 138 | #endif 139 | 140 | /** 141 | * function declarations can have a large influence depending on the opencl runtime 142 | * fast but pure kernels on rocm is a good example 143 | */ 144 | 145 | #ifdef NO_INLINE 146 | #define HC_INLINE 147 | #else 148 | #define HC_INLINE inline static 149 | #endif 150 | 151 | #if defined IS_AMD && defined IS_GPU 152 | #define DECLSPEC HC_INLINE 153 | #elif defined IS_HIP 154 | #define DECLSPEC __device__ HC_INLINE 155 | #else 156 | #define DECLSPEC __device__ 157 | #endif 158 | 159 | /** 160 | * AMD specific 161 | */ 162 | 163 | #ifdef IS_AMD 164 | #if defined(cl_amd_media_ops) 165 | #pragma OPENCL EXTENSION cl_amd_media_ops : enable 166 | #endif 167 | #if defined(cl_amd_media_ops2) 168 | #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable 169 | #endif 170 | #endif 171 | 172 | // Whitelist some OpenCL specific functions 173 | // This could create more stable kernels on systems with bad OpenCL drivers 174 | 175 | #ifdef IS_CUDA 176 | #define USE_BITSELECT 177 | #define USE_ROTATE 178 | #endif 179 | 180 | #ifdef IS_HIP 181 | #define USE_BITSELECT 182 | #define USE_ROTATE 183 | #endif 184 | 185 | #ifdef IS_ROCM 186 | #define USE_BITSELECT 187 | #define USE_ROTATE 188 | #endif 189 | 190 | #ifdef IS_INTEL_SDK 191 | #ifdef IS_CPU 192 | //#define USE_BITSELECT 193 | //#define USE_ROTATE 194 | #endif 195 | #endif 196 | 197 | #ifdef IS_OPENCL 198 | //#define USE_BITSELECT 199 | //#define USE_ROTATE 200 | //#define USE_SWIZZLE 201 | #endif 202 | 203 | #ifdef IS_METAL 204 | #define USE_ROTATE 205 | 206 | // Metal support max VECT_SIZE = 4 207 | #define s0 x 208 | #define s1 y 209 | #define s2 z 210 | #define s3 w 211 | #endif 212 | 213 | #endif // INC_VENDOR_H 214 | -------------------------------------------------------------------------------- /kernels/gen_eth_addr.cl: -------------------------------------------------------------------------------- 1 | // @author: Vitaly | github.com/optinsoft 2 | 3 | // little endian to big endian 4 | DECLSPEC u32 l2be(u32 x) { 5 | return (x & 0xff) << 24 | (x & 0xff00) << 8 | (x & 0xff0000) >> 8 | (x & 0xff000000) >> 24; 6 | } 7 | 8 | __global__ void genEthAddress( 9 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, GLOBAL_AS u32 *r4, 10 | GLOBAL_AS const u32 *k0, GLOBAL_AS const u32 *k1, GLOBAL_AS const u32 *k2, GLOBAL_AS const u32 *k3, 11 | GLOBAL_AS const u32 *k4, GLOBAL_AS const 
u32 *k5, GLOBAL_AS const u32 *k6, GLOBAL_AS const u32 *k7) 12 | { 13 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 14 | u32 k_local[PRIVATE_KEY_LENGTH]; 15 | secp256k1_t g_xy_local; 16 | u32 return_value; 17 | 18 | int i = threadIdx.x; 19 | 20 | g_local[0] = SECP256K1_G_STRING0; 21 | g_local[1] = SECP256K1_G_STRING1; 22 | g_local[2] = SECP256K1_G_STRING2; 23 | g_local[3] = SECP256K1_G_STRING3; 24 | g_local[4] = SECP256K1_G_STRING4; 25 | g_local[5] = SECP256K1_G_STRING5; 26 | g_local[6] = SECP256K1_G_STRING6; 27 | g_local[7] = SECP256K1_G_STRING7; 28 | g_local[8] = SECP256K1_G_STRING8; 29 | 30 | // global to local 31 | k_local[7] = k0[i]; 32 | k_local[6] = k1[i]; 33 | k_local[5] = k2[i]; 34 | k_local[4] = k3[i]; 35 | k_local[3] = k4[i]; 36 | k_local[2] = k5[i]; 37 | k_local[1] = k6[i]; 38 | k_local[0] = k7[i]; 39 | 40 | return_value = parse_public(&g_xy_local, g_local); 41 | if (return_value != 0) { 42 | return; 43 | } 44 | 45 | u32 x[8]; 46 | u32 y[8]; 47 | point_mul_xy (x, y, k_local, &g_xy_local); 48 | 49 | // keccak256 50 | u64 keccak_state[KECCAK256_STATE_LEN] = {}; 51 | u32 w[16]; 52 | 53 | w[7] = l2be(x[0]); 54 | w[6] = l2be(x[1]); 55 | w[5] = l2be(x[2]); 56 | w[4] = l2be(x[3]); 57 | w[3] = l2be(x[4]); 58 | w[2] = l2be(x[5]); 59 | w[1] = l2be(x[6]); 60 | w[0] = l2be(x[7]); 61 | w[15] = l2be(y[0]); 62 | w[14] = l2be(y[1]); 63 | w[13] = l2be(y[2]); 64 | w[12] = l2be(y[3]); 65 | w[11] = l2be(y[4]); 66 | w[10] = l2be(y[5]); 67 | w[9] = l2be(y[6]); 68 | w[8] = l2be(y[7]); 69 | 70 | keccak256_update_state(keccak_state, (u8*)w, 64); 71 | 72 | r0[i] = l2be((u32)(keccak_state[1] >> 32)); 73 | r1[i] = l2be((u32)keccak_state[2]); 74 | r2[i] = l2be((u32)(keccak_state[2] >> 32)); 75 | r3[i] = l2be((u32)keccak_state[3]); 76 | r4[i] = l2be((u32)(keccak_state[3] >> 32)); 77 | } 78 | 79 | __global__ void genEthAddressWithPrefix( 80 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, GLOBAL_AS u32 *r4, GLOBAL_AS u32 *rp, 81 | GLOBAL_AS u32 *k0, GLOBAL_AS u32 *k1, GLOBAL_AS u32 *k2, GLOBAL_AS u32 *k3, 82 | GLOBAL_AS u32 *k4, GLOBAL_AS u32 *k5, GLOBAL_AS u32 *k6, GLOBAL_AS u32 *k7, 83 | GLOBAL_AS const u32 p[5], GLOBAL_AS const u32 plen, GLOBAL_AS const u32 n) 84 | { 85 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 86 | u32 k_local[PRIVATE_KEY_LENGTH]; 87 | secp256k1_t g_xy_local; 88 | u32 return_value; 89 | u32 p_local[5]; 90 | u32 m_local[5]; 91 | u32 r_local[5]; 92 | u32 rp_local = 0; 93 | 94 | int i = threadIdx.x; 95 | 96 | // global to local 97 | k_local[7] = k0[i]; 98 | k_local[6] = k1[i]; 99 | k_local[5] = k2[i]; 100 | k_local[4] = k3[i]; 101 | k_local[3] = k4[i]; 102 | k_local[2] = k5[i]; 103 | k_local[1] = k6[i]; 104 | k_local[0] = k7[i]; 105 | 106 | u32 l = plen; 107 | m_local[0] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 108 | l = (l >= 4) ? l-4 : 0; 109 | p_local[0] = p[0] & m_local[0]; 110 | m_local[1] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 111 | l = (l >= 4) ? l-4 : 0; 112 | p_local[1] = p[1] & m_local[1]; 113 | m_local[2] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 114 | l = (l >= 4) ? l-4 : 0; 115 | p_local[2] = p[2] & m_local[2]; 116 | m_local[3] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 117 | l = (l >= 4) ? l-4 : 0; 118 | p_local[3] = p[3] & m_local[3]; 119 | m_local[4] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 120 | l = (l >= 4) ? l-4 : 0; 121 | p_local[4] = p[4] & m_local[4]; 122 | 123 | u32 n_local = n > 0 ? 
n : 1; 124 | 125 | u32 x[8]; 126 | u32 y[8]; 127 | u32 w[16]; 128 | u32 ni = 0; 129 | 130 | while (1) { 131 | 132 | g_local[0] = SECP256K1_G_STRING0; 133 | g_local[1] = SECP256K1_G_STRING1; 134 | g_local[2] = SECP256K1_G_STRING2; 135 | g_local[3] = SECP256K1_G_STRING3; 136 | g_local[4] = SECP256K1_G_STRING4; 137 | g_local[5] = SECP256K1_G_STRING5; 138 | g_local[6] = SECP256K1_G_STRING6; 139 | g_local[7] = SECP256K1_G_STRING7; 140 | g_local[8] = SECP256K1_G_STRING8; 141 | 142 | return_value = parse_public(&g_xy_local, g_local); 143 | if (return_value != 0) { 144 | return; 145 | } 146 | 147 | point_mul_xy (x, y, k_local, &g_xy_local); 148 | 149 | // keccak256 150 | u64 keccak_state[KECCAK256_STATE_LEN] = {0}; 151 | 152 | w[7] = l2be(x[0]); 153 | w[6] = l2be(x[1]); 154 | w[5] = l2be(x[2]); 155 | w[4] = l2be(x[3]); 156 | w[3] = l2be(x[4]); 157 | w[2] = l2be(x[5]); 158 | w[1] = l2be(x[6]); 159 | w[0] = l2be(x[7]); 160 | w[15] = l2be(y[0]); 161 | w[14] = l2be(y[1]); 162 | w[13] = l2be(y[2]); 163 | w[12] = l2be(y[3]); 164 | w[11] = l2be(y[4]); 165 | w[10] = l2be(y[5]); 166 | w[9] = l2be(y[6]); 167 | w[8] = l2be(y[7]); 168 | 169 | keccak256_update_state(keccak_state, (u8*)w, 64); 170 | 171 | ni++; 172 | 173 | r_local[0] = l2be((u32)(keccak_state[1] >> 32)); 174 | r_local[1] = l2be((u32)keccak_state[2]); 175 | r_local[2] = l2be((u32)(keccak_state[2] >> 32)); 176 | r_local[3] = l2be((u32)keccak_state[3]); 177 | r_local[4] = l2be((u32)(keccak_state[3] >> 32)); 178 | rp_local = (((r_local[0] & m_local[0]) == p_local[0]) && 179 | ((r_local[1] & m_local[1]) == p_local[1]) && 180 | ((r_local[2] & m_local[2]) == p_local[2]) && 181 | ((r_local[3] & m_local[3]) == p_local[3]) && 182 | ((r_local[4] & m_local[4]) == p_local[4])) 183 | ? ni : 0; 184 | 185 | if (ni >= n_local || rp_local) break; 186 | 187 | k_local[(ni & 7)] += 479001599; 188 | } 189 | 190 | //save results 191 | r0[i] = r_local[0]; 192 | r1[i] = r_local[1]; 193 | r2[i] = r_local[2]; 194 | r3[i] = r_local[3]; 195 | r4[i] = r_local[4]; 196 | rp[i] = rp_local; 197 | 198 | k0[i] = k_local[7]; 199 | k1[i] = k_local[6]; 200 | k2[i] = k_local[5]; 201 | k3[i] = k_local[4]; 202 | k4[i] = k_local[3]; 203 | k5[i] = k_local[2]; 204 | k6[i] = k_local[1]; 205 | k7[i] = k_local[0]; 206 | } -------------------------------------------------------------------------------- /pygeneth.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Vitaly | github.com/optinsoft 3 | """ 4 | import pathutils 5 | import pycuda.driver as cuda 6 | from pycuda.compiler import SourceModule 7 | import pycuda.gpuarray as gpuarray 8 | import pycuda.autoinit 9 | import numpy as np 10 | from decouple import config 11 | import os 12 | from functools import reduce 13 | import ecdsa 14 | from Crypto.Hash import keccak 15 | 16 | def randomUInt32() -> int: 17 | return int.from_bytes(np.random.bytes(4), byteorder='little', signed=False) 18 | 19 | ''' 20 | test private key: 0x68e23530deb6d5011ab56d8ad9f7b4a3b424f1112f08606357497495929f72dc 21 | test public key: 0x5d99d81d9e731e0d7eebd1c858b1155da7981b1f0a16d322a361f8b589ad2e3bde53dc614e3a84164dab3f5899abde3b09553dca10c9716fa623a5942b9ea420 22 | test keccak256: 0x4c84817f57c18372837905af33f4b63eb1c5a9966a31cebc302f563685695506 23 | test eth address: 0x33f4b63eb1c5a9966a31cebc302f563685695506 24 | ''' 25 | 26 | def testUInt32(idx: int) -> int: 27 | r = [0x68e23530, 0xdeb6d501, 0x1ab56d8a, 0xd9f7b4a3, 0xb424f111, 0x2f086063, 0x57497495, 0x929f72dc][idx] 28 | return r 29 | 30 | def 
randomUInt32Array(count: int) -> list[int]: 31 | return [randomUInt32() for i in range(count)] 32 | 33 | def randomWithTestUInt32Array(count: int, idx: int) -> list[int]: 34 | return [testUInt32(idx) if i == 0 else randomUInt32() for i in range(count)] 35 | 36 | def constUInt32Array(count: int, v: int) -> list[int]: 37 | return [v for i in range(count)] 38 | 39 | def public_key_to_address(public_key, i, print_keccak): 40 | keccak_hash = keccak.new(digest_bits=256) 41 | keccak_hash.update(public_key) 42 | keccak_digest = keccak_hash.digest() 43 | if print_keccak: 44 | print(f'keccak[{i}] (verification): 0x{keccak_digest.hex()}') 45 | address = '0x' + keccak_digest[-20:].hex() 46 | return address 47 | 48 | def key_to_hex(k: list[int]) -> str: 49 | return reduce(lambda s, t: str(s) + t.to_bytes(4, byteorder='big').hex(), k[1:], k[0].to_bytes(4, byteorder='big').hex()) 50 | 51 | def main_genPubKey(keyCount: int, verify: bool): 52 | kernel_code = ''' 53 | 54 | ''' 55 | def load_code(path: str) -> str: 56 | with open(path, 'r') as text_file: 57 | code_text = text_file.read() 58 | lines = code_text.splitlines() 59 | result = reduce(lambda t, l: 60 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 61 | lines, '') 62 | return result 63 | dirSecp256k1 = './secp256k1/' 64 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 65 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 66 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 67 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 68 | dirKeccak = './keccak/' 69 | kernel_code += load_code(dirKeccak + 'keccak256.h') 70 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 71 | dirKernels = './kernels/' 72 | kernel_code += load_code(dirKernels + 'gen_pub_key.cl') 73 | 74 | # with open('./kernel.cl', 'w') as f: 75 | # f.write(kernel_code) 76 | 77 | k = [np.array(randomUInt32Array(keyCount), dtype=np.uint32) for i in range(8)] 78 | xy = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(16)] 79 | h = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(8)] 80 | 81 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 82 | xy_gpu = [gpuarray.to_gpu(xy[i]) for i in range(16)] 83 | h_gpu = [gpuarray.to_gpu(h[i]) for i in range(8)] 84 | 85 | mod = SourceModule(kernel_code) 86 | genPubKey = mod.get_function('genPubKey') 87 | 88 | genPubKey(xy_gpu[0], xy_gpu[1], xy_gpu[2], xy_gpu[3], xy_gpu[4], xy_gpu[5], xy_gpu[6], xy_gpu[7], 89 | xy_gpu[8], xy_gpu[9], xy_gpu[10], xy_gpu[11], xy_gpu[12], xy_gpu[13], xy_gpu[14], xy_gpu[15], 90 | h_gpu[0], h_gpu[1], h_gpu[2], h_gpu[3], h_gpu[4], h_gpu[5], h_gpu[6], h_gpu[7], 91 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 92 | block=(keyCount, 1, 1)) 93 | 94 | for i in range(keyCount): 95 | # print(f'--- [{i}] ---') 96 | _k = [k_gpu[j][i].get().item() for j in range(8)] 97 | priv = key_to_hex(_k) 98 | print(f"priv[{i}]: 0x{priv}") 99 | xy = [xy_gpu[j][i].get().item() for j in range(16)] 100 | pub = key_to_hex(xy) 101 | print(f"pub[{i}]: 0x{pub}") 102 | _h = [h_gpu[j][i].get().item() for j in range(8)] 103 | keccak = key_to_hex(_h) 104 | print(f"keccak[{i}]: 0x{keccak}") 105 | if verify: 106 | pk_bytes = bytes.fromhex(priv) 107 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 108 | print(f"public Key[{i}] (verification): 0x{public_key.hex()}") 109 | address = public_key_to_address(public_key, i, True) 110 | # print(f"Address[{i}]: 
{address}") 111 | 112 | def main_genEthAddress(keyCount: int, verify: bool): 113 | kernel_code = ''' 114 | 115 | ''' 116 | def load_code(path: str) -> str: 117 | with open(path, 'r') as text_file: 118 | code_text = text_file.read() 119 | lines = code_text.splitlines() 120 | result = reduce(lambda t, l: 121 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 122 | lines, '') 123 | return result 124 | dirSecp256k1 = './secp256k1/' 125 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 126 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 127 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 128 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 129 | dirKeccak = './keccak/' 130 | kernel_code += load_code(dirKeccak + 'keccak256.h') 131 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 132 | dirKernels = './kernels/' 133 | kernel_code += load_code(dirKernels + 'gen_eth_addr.cl') 134 | 135 | # with open('./kernel.cl', 'w') as f: 136 | # f.write(kernel_code) 137 | 138 | k = [np.array(randomUInt32Array(keyCount), dtype=np.uint32) for i in range(8)] 139 | a = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(5)] 140 | 141 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 142 | a_gpu = [gpuarray.to_gpu(a[i]) for i in range(5)] 143 | 144 | mod = SourceModule(kernel_code) 145 | genEthAddress = mod.get_function('genEthAddress') 146 | 147 | genEthAddress( 148 | a_gpu[0], a_gpu[1], a_gpu[2], a_gpu[3], a_gpu[4], 149 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 150 | block=(keyCount, 1, 1)) 151 | 152 | for i in range(keyCount): 153 | # print(f'--- [{i}] ---') 154 | _k = [k_gpu[j][i].get().item() for j in range(8)] 155 | priv = key_to_hex(_k) 156 | if verify: 157 | print(f"priv[{i}]: 0x{priv}") 158 | _a = [a_gpu[j][i].get().item() for j in range(5)] 159 | eth_address = key_to_hex(_a) 160 | if verify: 161 | print(f"eth address[{i}]: 0x{eth_address}") 162 | pk_bytes = bytes.fromhex(priv) 163 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 164 | address = public_key_to_address(public_key, i, False) 165 | print(f"eth address[{i}] (verification): {address}") 166 | else: 167 | print(f"0x{priv},0x{eth_address}") 168 | 169 | if __name__ == "__main__": 170 | # main_genPubKey(keyCount=32, verify=True) 171 | main_genEthAddress(keyCount=32, verify=True) -------------------------------------------------------------------------------- /pyvanityeth.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Vitaly | github.com/optinsoft 3 | """ 4 | import pathutils 5 | import pycuda.driver as cuda 6 | from pycuda.compiler import SourceModule 7 | import pycuda.gpuarray as gpuarray 8 | import pycuda.autoinit 9 | import numpy as np 10 | from decouple import config 11 | import os 12 | from functools import reduce 13 | import ecdsa 14 | from Crypto.Hash import keccak 15 | import argparse 16 | import time 17 | 18 | def randomUInt32() -> int: 19 | return int.from_bytes(os.urandom(4), byteorder='little', signed=False) 20 | 21 | def randomUInt32Array(count: int) -> list[int]: 22 | return [randomUInt32() for i in range(count)] 23 | 24 | def constUInt32Array(count: int, v: int) -> list[int]: 25 | return [v for i in range(count)] 26 | 27 | def prefixUInt32(prefixBytes: bytes) -> int: 28 | pl = len(prefixBytes) 29 | p = [prefixBytes[i] if i < pl else 0 for i in range(4)] 30 | return int.from_bytes(p, 
byteorder='big', signed=False) 31 | 32 | def prefixUInt32Array(prefixBytes: bytes) -> list[int]: 33 | return [prefixUInt32(prefixBytes[i*4:i*4+4]) for i in range(5)] 34 | 35 | def public_key_to_address(public_key, i, print_keccak): 36 | keccak_hash = keccak.new(digest_bits=256) 37 | keccak_hash.update(public_key) 38 | keccak_digest = keccak_hash.digest() 39 | if print_keccak: 40 | print(f'Keccak[{i}] (verification): 0x{keccak_digest.hex()}') 41 | address = '0x' + keccak_digest[-20:].hex() 42 | return address 43 | 44 | def key_to_hex(k: list[int]) -> str: 45 | return reduce(lambda s, t: str(s) + t.to_bytes(4, byteorder='big').hex(), k[1:], k[0].to_bytes(4, byteorder='big').hex()) 46 | 47 | def main_vanityEthAddress(prefixBytes: bytes, keyBlockCount: int, maxBlocks: int, blockIterations: int, verify: bool, verbose: bool, outputFile: str) -> int: 48 | kernel_code = ''' 49 | 50 | ''' 51 | def load_code(path: str) -> str: 52 | with open(path, 'r') as text_file: 53 | code_text = text_file.read() 54 | lines = code_text.splitlines() 55 | result = reduce(lambda t, l: 56 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 57 | lines, '') 58 | return result 59 | dirSecp256k1 = './secp256k1/' 60 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 61 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 62 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 63 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 64 | dirKeccak = './keccak/' 65 | kernel_code += load_code(dirKeccak + 'keccak256.h') 66 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 67 | dirKernels = './kernels/' 68 | kernel_code += load_code(dirKernels + 'gen_eth_addr.cl') 69 | 70 | # with open('./kernel.cl', 'w') as f: 71 | # f.write(kernel_code) 72 | 73 | if verbose: 74 | print("Building kernel...") 75 | 76 | mod = SourceModule(kernel_code) 77 | genEthAddressWithPrefix = mod.get_function('genEthAddressWithPrefix') 78 | 79 | prefix = prefixBytes.hex() 80 | 81 | if verbose: 82 | print(f'Searching vanity address with prefix "{prefix}"...') 83 | 84 | start_time = time.time() 85 | 86 | a = [np.array(constUInt32Array(keyBlockCount, 0), dtype=np.uint32) for i in range(5)] 87 | a_gpu = [gpuarray.to_gpu(a[i]) for i in range(5)] 88 | ap_gpu = gpuarray.to_gpu(np.array(constUInt32Array(keyBlockCount, 0), dtype=np.uint32)) 89 | 90 | p = np.array(prefixUInt32Array(prefixBytes), dtype=np.uint32) 91 | p_gpu = gpuarray.to_gpu(p) 92 | p_len = np.int32(len(prefixBytes)) 93 | n_iterations = np.int32(blockIterations) 94 | 95 | for n in range(maxBlocks): 96 | k = [np.array(randomUInt32Array(keyBlockCount), dtype=np.uint32) for i in range(8)] 97 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 98 | 99 | genEthAddressWithPrefix( 100 | a_gpu[0], a_gpu[1], a_gpu[2], a_gpu[3], a_gpu[4], ap_gpu, 101 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 102 | p_gpu, p_len, n_iterations, 103 | block=(keyBlockCount, 1, 1)) 104 | 105 | for i in range(keyBlockCount): 106 | # print(f'--- [{i}] ---') 107 | _ap = ap_gpu[i].get().item() 108 | if _ap != 0: 109 | _a = [a_gpu[j][i].get().item() for j in range(5)] 110 | eth_address = '0x'+key_to_hex(_a) 111 | if eth_address.startswith('0x'+prefix): 112 | if verbose: 113 | end_time = time.time() # end time 114 | elapsed_time = end_time - start_time 115 | print(f"Vanity address found in block # {n+1} iteration # {_ap}, {elapsed_time:.2f} seconds") 116 | count = (n + 1) * keyBlockCount * (blockIterations if blockIterations > 0 else 1) 117 
| print(f"Generated {count} ethereum addresses, {count/elapsed_time:.2f} addresses/second") 118 | _k = [k_gpu[j][i].get().item() for j in range(8)] 119 | priv = key_to_hex(_k) 120 | if verify and verbose: 121 | print(f"private key[{i}]: 0x{priv}") 122 | print(f"eth address[{i}]: {eth_address}") 123 | if verify: 124 | pk_bytes = bytes.fromhex(priv) 125 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 126 | address = public_key_to_address(public_key, i, False) 127 | if verbose: 128 | print(f"eth address[{i}] (verification): {address}") 129 | if address != eth_address: 130 | print(f"Verification failed: _as[{i}]={_ap}, eth_address[{i}]={eth_address}, verification={address}") 131 | else: 132 | print(f"0x{priv},{eth_address}") 133 | if outputFile: 134 | with open(outputFile, "a") as of: 135 | of.write(f"0x{priv},{eth_address}\n") 136 | else: 137 | print(f"0x{priv},{eth_address}") 138 | if outputFile: 139 | with open(outputFile, "a") as of: 140 | of.write(f"0x{priv},{eth_address}\n") 141 | return 1 142 | else: 143 | print(f"Unexpected result: _ap[{i}]={_ap}, eth_address[{i}]={eth_address}") 144 | if verbose: 145 | end_time = time.time() # end time 146 | elapsed_time = end_time - start_time 147 | print(f"Not found, {elapsed_time:.2f} seconds") 148 | count = maxBlocks * keyBlockCount * (blockIterations if blockIterations > 0 else 1) 149 | print(f"Generated {count} ethereum addresses, {count/elapsed_time:.2f} addresses/second") 150 | return 0 151 | 152 | def hexPrefix(s: str) -> bytes: 153 | if s.startswith('0x'): 154 | return bytes.fromhex(s[2:]) 155 | return bytes.fromhex(s) 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser(description="pyvanityeth.py") 159 | parser.add_argument('-v', '--verbose', action='store_true', help='verbose') 160 | parser.add_argument('--verify', action='store_true', help='verify found ethereum address') 161 | parser.add_argument("--prefix", required=True, type=hexPrefix, help="vanity ethereum address PREFIX (without leading 0x)") 162 | parser.add_argument("--blocks", required=False, type=int, default=1000, help="try find vanity ethereum address within BLOCKS blocks (default: 1000)") 163 | parser.add_argument("--blockSize", required=False, type=int, default=128, help="generate block of BLOCKSIZE ethereum addresses by using GPU (default: 128)") 164 | parser.add_argument("--blockIterations", required=False, type=int, default=1, help="attempts to find vanity ethereum address within each block") 165 | parser.add_argument("--output", required=False, type=str, default="", help="output found ethereum address to file") 166 | args = parser.parse_args() 167 | main_vanityEthAddress(args.prefix, args.blockSize, args.blocks, args.blockIterations, args.verify, args.verbose, args.output) 168 | -------------------------------------------------------------------------------- /secp256k1/inc_ecc_secp256k1.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_ECC_SECP256K1_H 7 | #define INC_ECC_SECP256K1_H 8 | 9 | // y^2 = x^3 + ax + b with a = 0 and b = 7 => y^2 = x^3 + 7: 10 | 11 | #define SECP256K1_B 7 12 | 13 | // finite field Fp 14 | // p = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F 15 | #define SECP256K1_P0 0xfffffc2f 16 | #define SECP256K1_P1 0xfffffffe 17 | #define SECP256K1_P2 0xffffffff 18 | #define SECP256K1_P3 0xffffffff 19 | #define 
SECP256K1_P4 0xffffffff 20 | #define SECP256K1_P5 0xffffffff 21 | #define SECP256K1_P6 0xffffffff 22 | #define SECP256K1_P7 0xffffffff 23 | 24 | // prime order N 25 | // n = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE BAAEDCE6 AF48A03B BFD25E8C D0364141 26 | #define SECP256K1_N0 0xd0364141 27 | #define SECP256K1_N1 0xbfd25e8c 28 | #define SECP256K1_N2 0xaf48a03b 29 | #define SECP256K1_N3 0xbaaedce6 30 | #define SECP256K1_N4 0xfffffffe 31 | #define SECP256K1_N5 0xffffffff 32 | #define SECP256K1_N6 0xffffffff 33 | #define SECP256K1_N7 0xffffffff 34 | 35 | // the base point G in compressed form for transform_public 36 | // G = 02 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 37 | #define SECP256K1_G_PARITY 0x00000002 38 | #define SECP256K1_G0 0x16f81798 39 | #define SECP256K1_G1 0x59f2815b 40 | #define SECP256K1_G2 0x2dce28d9 41 | #define SECP256K1_G3 0x029bfcdb 42 | #define SECP256K1_G4 0xce870b07 43 | #define SECP256K1_G5 0x55a06295 44 | #define SECP256K1_G6 0xf9dcbbac 45 | #define SECP256K1_G7 0x79be667e 46 | 47 | // the base point G in compressed form for parse_public 48 | // parity and reversed byte/char (8 bit) byte order 49 | // G = 02 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 50 | #define SECP256K1_G_STRING0 0x66be7902 51 | #define SECP256K1_G_STRING1 0xbbdcf97e 52 | #define SECP256K1_G_STRING2 0x62a055ac 53 | #define SECP256K1_G_STRING3 0x0b87ce95 54 | #define SECP256K1_G_STRING4 0xfc9b0207 55 | #define SECP256K1_G_STRING5 0x28ce2ddb 56 | #define SECP256K1_G_STRING6 0x81f259d9 57 | #define SECP256K1_G_STRING7 0x17f8165b 58 | #define SECP256K1_G_STRING8 0x00000098 59 | 60 | // pre computed values, can be verified using private keys for 61 | // x1 is the same as the basepoint g 62 | // x1 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU73sVHnoWn 63 | // x3 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU74sHUHy8S 64 | // x5 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU75s2EPgZf 65 | // x7 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU76rnZwVdz 66 | 67 | // x1: 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 68 | // x1: 79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798 69 | #define SECP256K1_G_PRE_COMPUTED_00 0x16f81798 70 | #define SECP256K1_G_PRE_COMPUTED_01 0x59f2815b 71 | #define SECP256K1_G_PRE_COMPUTED_02 0x2dce28d9 72 | #define SECP256K1_G_PRE_COMPUTED_03 0x029bfcdb 73 | #define SECP256K1_G_PRE_COMPUTED_04 0xce870b07 74 | #define SECP256K1_G_PRE_COMPUTED_05 0x55a06295 75 | #define SECP256K1_G_PRE_COMPUTED_06 0xf9dcbbac 76 | #define SECP256K1_G_PRE_COMPUTED_07 0x79be667e 77 | 78 | // y1: 483ADA77 26A3C465 5DA4FBFC 0E1108A8 FD17B448 A6855419 9C47D08F FB10D4B8 79 | // y1: 483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8 80 | #define SECP256K1_G_PRE_COMPUTED_08 0xfb10d4b8 81 | #define SECP256K1_G_PRE_COMPUTED_09 0x9c47d08f 82 | #define SECP256K1_G_PRE_COMPUTED_10 0xa6855419 83 | #define SECP256K1_G_PRE_COMPUTED_11 0xfd17b448 84 | #define SECP256K1_G_PRE_COMPUTED_12 0x0e1108a8 85 | #define SECP256K1_G_PRE_COMPUTED_13 0x5da4fbfc 86 | #define SECP256K1_G_PRE_COMPUTED_14 0x26a3c465 87 | #define SECP256K1_G_PRE_COMPUTED_15 0x483ada77 88 | 89 | // -y1: B7C52588 D95C3B9A A25B0403 F1EEF757 02E84BB7 597AABE6 63B82F6F 04EF2777 90 | // -y1: B7C52588D95C3B9AA25B0403F1EEF75702E84BB7597AABE663B82F6F04EF2777 91 | #define SECP256K1_G_PRE_COMPUTED_16 0x04ef2777 92 | #define SECP256K1_G_PRE_COMPUTED_17 0x63b82f6f 93 | #define SECP256K1_G_PRE_COMPUTED_18 0x597aabe6 94 | #define 
SECP256K1_G_PRE_COMPUTED_19 0x02e84bb7 95 | #define SECP256K1_G_PRE_COMPUTED_20 0xf1eef757 96 | #define SECP256K1_G_PRE_COMPUTED_21 0xa25b0403 97 | #define SECP256K1_G_PRE_COMPUTED_22 0xd95c3b9a 98 | #define SECP256K1_G_PRE_COMPUTED_23 0xb7c52588 99 | 100 | // x3: F9308A01 9258C310 49344F85 F89D5229 B531C845 836F99B0 8601F113 BCE036F9 101 | // x3: F9308A019258C31049344F85F89D5229B531C845836F99B08601F113BCE036F9 102 | #define SECP256K1_G_PRE_COMPUTED_24 0xbce036f9 103 | #define SECP256K1_G_PRE_COMPUTED_25 0x8601f113 104 | #define SECP256K1_G_PRE_COMPUTED_26 0x836f99b0 105 | #define SECP256K1_G_PRE_COMPUTED_27 0xb531c845 106 | #define SECP256K1_G_PRE_COMPUTED_28 0xf89d5229 107 | #define SECP256K1_G_PRE_COMPUTED_29 0x49344f85 108 | #define SECP256K1_G_PRE_COMPUTED_30 0x9258c310 109 | #define SECP256K1_G_PRE_COMPUTED_31 0xf9308a01 110 | 111 | // y3: 388F7B0F 632DE814 0FE337E6 2A37F356 6500A999 34C2231B 6CB9FD75 84B8E672 112 | // y3: 388F7B0F632DE8140FE337E62A37F3566500A99934C2231B6CB9FD7584B8E672 113 | #define SECP256K1_G_PRE_COMPUTED_32 0x84b8e672 114 | #define SECP256K1_G_PRE_COMPUTED_33 0x6cb9fd75 115 | #define SECP256K1_G_PRE_COMPUTED_34 0x34c2231b 116 | #define SECP256K1_G_PRE_COMPUTED_35 0x6500a999 117 | #define SECP256K1_G_PRE_COMPUTED_36 0x2a37f356 118 | #define SECP256K1_G_PRE_COMPUTED_37 0x0fe337e6 119 | #define SECP256K1_G_PRE_COMPUTED_38 0x632de814 120 | #define SECP256K1_G_PRE_COMPUTED_39 0x388f7b0f 121 | 122 | // -y3: C77084F0 9CD217EB F01CC819 D5C80CA9 9AFF5666 CB3DDCE4 93460289 7B4715BD 123 | // -y3: C77084F09CD217EBF01CC819D5C80CA99AFF5666CB3DDCE4934602897B4715BD 124 | #define SECP256K1_G_PRE_COMPUTED_40 0x7b4715bd 125 | #define SECP256K1_G_PRE_COMPUTED_41 0x93460289 126 | #define SECP256K1_G_PRE_COMPUTED_42 0xcb3ddce4 127 | #define SECP256K1_G_PRE_COMPUTED_43 0x9aff5666 128 | #define SECP256K1_G_PRE_COMPUTED_44 0xd5c80ca9 129 | #define SECP256K1_G_PRE_COMPUTED_45 0xf01cc819 130 | #define SECP256K1_G_PRE_COMPUTED_46 0x9cd217eb 131 | #define SECP256K1_G_PRE_COMPUTED_47 0xc77084f0 132 | 133 | // x5: 2F8BDE4D 1A072093 55B4A725 0A5C5128 E88B84BD DC619AB7 CBA8D569 B240EFE4 134 | // x5: 2F8BDE4D1A07209355B4A7250A5C5128E88B84BDDC619AB7CBA8D569B240EFE4 135 | #define SECP256K1_G_PRE_COMPUTED_48 0xb240efe4 136 | #define SECP256K1_G_PRE_COMPUTED_49 0xcba8d569 137 | #define SECP256K1_G_PRE_COMPUTED_50 0xdc619ab7 138 | #define SECP256K1_G_PRE_COMPUTED_51 0xe88b84bd 139 | #define SECP256K1_G_PRE_COMPUTED_52 0x0a5c5128 140 | #define SECP256K1_G_PRE_COMPUTED_53 0x55b4a725 141 | #define SECP256K1_G_PRE_COMPUTED_54 0x1a072093 142 | #define SECP256K1_G_PRE_COMPUTED_55 0x2f8bde4d 143 | 144 | // y5: D8AC2226 36E5E3D6 D4DBA9DD A6C9C426 F788271B AB0D6840 DCA87D3A A6AC62D6 145 | // y5: D8AC222636E5E3D6D4DBA9DDA6C9C426F788271BAB0D6840DCA87D3AA6AC62D6 146 | #define SECP256K1_G_PRE_COMPUTED_56 0xa6ac62d6 147 | #define SECP256K1_G_PRE_COMPUTED_57 0xdca87d3a 148 | #define SECP256K1_G_PRE_COMPUTED_58 0xab0d6840 149 | #define SECP256K1_G_PRE_COMPUTED_59 0xf788271b 150 | #define SECP256K1_G_PRE_COMPUTED_60 0xa6c9c426 151 | #define SECP256K1_G_PRE_COMPUTED_61 0xd4dba9dd 152 | #define SECP256K1_G_PRE_COMPUTED_62 0x36e5e3d6 153 | #define SECP256K1_G_PRE_COMPUTED_63 0xd8ac2226 154 | 155 | // -y5: 2753DDD9 C91A1C29 2B245622 59363BD9 0877D8E4 54F297BF 235782C4 59539959 156 | // -y5: 2753DDD9C91A1C292B24562259363BD90877D8E454F297BF235782C459539959 157 | #define SECP256K1_G_PRE_COMPUTED_64 0x59539959 158 | #define SECP256K1_G_PRE_COMPUTED_65 0x235782c4 159 | #define SECP256K1_G_PRE_COMPUTED_66 0x54f297bf 160 | 
#define SECP256K1_G_PRE_COMPUTED_67 0x0877d8e4 161 | #define SECP256K1_G_PRE_COMPUTED_68 0x59363bd9 162 | #define SECP256K1_G_PRE_COMPUTED_69 0x2b245622 163 | #define SECP256K1_G_PRE_COMPUTED_70 0xc91a1c29 164 | #define SECP256K1_G_PRE_COMPUTED_71 0x2753ddd9 165 | 166 | // x7: 5CBDF064 6E5DB4EA A398F365 F2EA7A0E 3D419B7E 0330E39C E92BDDED CAC4F9BC 167 | // x7: 5CBDF0646E5DB4EAA398F365F2EA7A0E3D419B7E0330E39CE92BDDEDCAC4F9BC 168 | #define SECP256K1_G_PRE_COMPUTED_72 0xcac4f9bc 169 | #define SECP256K1_G_PRE_COMPUTED_73 0xe92bdded 170 | #define SECP256K1_G_PRE_COMPUTED_74 0x0330e39c 171 | #define SECP256K1_G_PRE_COMPUTED_75 0x3d419b7e 172 | #define SECP256K1_G_PRE_COMPUTED_76 0xf2ea7a0e 173 | #define SECP256K1_G_PRE_COMPUTED_77 0xa398f365 174 | #define SECP256K1_G_PRE_COMPUTED_78 0x6e5db4ea 175 | #define SECP256K1_G_PRE_COMPUTED_79 0x5cbdf064 176 | 177 | // y7: 6AEBCA40 BA255960 A3178D6D 861A54DB A813D0B8 13FDE7B5 A5082628 087264DA 178 | // y7: 6AEBCA40BA255960A3178D6D861A54DBA813D0B813FDE7B5A5082628087264DA 179 | #define SECP256K1_G_PRE_COMPUTED_80 0x087264da 180 | #define SECP256K1_G_PRE_COMPUTED_81 0xa5082628 181 | #define SECP256K1_G_PRE_COMPUTED_82 0x13fde7b5 182 | #define SECP256K1_G_PRE_COMPUTED_83 0xa813d0b8 183 | #define SECP256K1_G_PRE_COMPUTED_84 0x861a54db 184 | #define SECP256K1_G_PRE_COMPUTED_85 0xa3178d6d 185 | #define SECP256K1_G_PRE_COMPUTED_86 0xba255960 186 | #define SECP256K1_G_PRE_COMPUTED_87 0x6aebca40 187 | 188 | // -y7: 951435BF 45DAA69F 5CE87292 79E5AB24 57EC2F47 EC02184A 5AF7D9D6 F78D9755 189 | // -y7: 951435BF45DAA69F5CE8729279E5AB2457EC2F47EC02184A5AF7D9D6F78D9755 190 | #define SECP256K1_G_PRE_COMPUTED_88 0xf78d9755 191 | #define SECP256K1_G_PRE_COMPUTED_89 0x5af7d9d6 192 | #define SECP256K1_G_PRE_COMPUTED_90 0xec02184a 193 | #define SECP256K1_G_PRE_COMPUTED_91 0x57ec2f47 194 | #define SECP256K1_G_PRE_COMPUTED_92 0x79e5ab24 195 | #define SECP256K1_G_PRE_COMPUTED_93 0x5ce87292 196 | #define SECP256K1_G_PRE_COMPUTED_94 0x45daa69f 197 | #define SECP256K1_G_PRE_COMPUTED_95 0x951435bf 198 | 199 | #define SECP256K1_PRE_COMPUTED_XY_SIZE 96 200 | #define SECP256K1_NAF_SIZE 33 // 32+1, we need one extra slot 201 | 202 | #define PUBLIC_KEY_LENGTH_WITHOUT_PARITY 8 203 | #define PUBLIC_KEY_LENGTH_X_Y_WITHOUT_PARITY 16 204 | // 8+1 to make room for the parity 205 | #define PUBLIC_KEY_LENGTH_WITH_PARITY 9 206 | 207 | // (32*8 == 256) 208 | #define PRIVATE_KEY_LENGTH 8 209 | 210 | // change the type of input/tmps in your kernel (e.g. 
PRIVATE_AS / CONSTANT_AS): 211 | #ifndef SECP256K1_TMPS_TYPE 212 | #define SECP256K1_TMPS_TYPE GLOBAL_AS 213 | #endif 214 | 215 | typedef struct secp256k1 216 | { 217 | u32 xy[SECP256K1_PRE_COMPUTED_XY_SIZE]; // pre-computed points: (x1,y1,-y1),(x3,y3,-y3),(x5,y5,-y5),(x7,y7,-y7) 218 | 219 | } secp256k1_t; 220 | 221 | 222 | DECLSPEC u32 transform_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, const u32 first_byte); 223 | DECLSPEC u32 parse_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *k); 224 | 225 | DECLSPEC void point_mul_xy (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps); 226 | DECLSPEC void point_mul (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps); 227 | 228 | DECLSPEC void set_precomputed_basepoint_g (PRIVATE_AS secp256k1_t *r); 229 | 230 | #endif // INC_ECC_SECP256K1_H 231 | -------------------------------------------------------------------------------- /secp256k1/inc_ecc_secp256k1.cl: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | * 5 | * Furthermore, since elliptic curve operations are highly researched and optimized, 6 | * we've consulted a lot of online resources to implement this, including several papers and 7 | * example code. 8 | * 9 | * Credits where credits are due: there are a lot of nice projects that explain and/or optimize 10 | * elliptic curve operations (especially elliptic curve multiplications by a scalar). 11 | * 12 | * We want to shout out following projects, which were quite helpful when implementing this: 13 | * - secp256k1 by Pieter Wuille (https://github.com/bitcoin-core/secp256k1/, MIT) 14 | * - secp256k1-cl by hhanh00 (https://github.com/hhanh00/secp256k1-cl/, MIT) 15 | * - ec_pure_c by masterzorag (https://github.com/masterzorag/ec_pure_c/) 16 | * - ecc-gmp by leivaburto (https://github.com/leivaburto/ecc-gmp) 17 | * - micro-ecc by Ken MacKay (https://github.com/kmackay/micro-ecc/, BSD) 18 | * - curve_example by willem (https://gist.github.com/nlitsme/c9031c7b9bf6bb009e5a) 19 | * - py_ecc by Vitalik Buterin (https://github.com/ethereum/py_ecc/, MIT) 20 | * 21 | * 22 | * Some BigNum operations are implemented similar to micro-ecc which is licensed under these terms: 23 | * Copyright 2014 Ken MacKay, 2-Clause BSD License 24 | * 25 | * Redistribution and use in source and binary forms, with or without modification, are permitted 26 | * provided that the following conditions are met: 27 | * 28 | * 1. Redistributions of source code must retain the above copyright notice, this list of 29 | * conditions and the following disclaimer. 30 | * 31 | * 2. Redistributions in binary form must reproduce the above copyright notice, this list of 32 | * conditions and the following disclaimer in the documentation and/or other materials 33 | * provided with the distribution. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 36 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 37 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 38 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 39 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 40 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 41 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 42 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 43 | * POSSIBILITY OF SUCH DAMAGE. 44 | */ 45 | 46 | /* 47 | * ATTENTION: this code is NOT meant to be used in security critical environments that are at risk 48 | * of side-channel or timing attacks etc, it's only purpose is to make it work fast for GPGPU 49 | * (OpenCL/CUDA). Some attack vectors like side-channel and timing-attacks might be possible, 50 | * because of some optimizations used within this code (non-constant time etc). 51 | */ 52 | 53 | /* 54 | * Implementation considerations: 55 | * point double and point add are implemented similar to algorithms mentioned in this 2011 paper: 56 | * http://eprint.iacr.org/2011/338.pdf 57 | * (Fast and Regular Algorithms for Scalar Multiplication over Elliptic Curves by Matthieu Rivain) 58 | * 59 | * In theory we could use the Jacobian Co-Z enhancement to get rid of the larger buffer caused by 60 | * the z coordinates (and in this way reduce register pressure etc). 61 | * For the Co-Z improvement there are a lot of fast algorithms, but we might still be faster 62 | * with this implementation (b/c we allow non-constant time) without the Brier/Joye Montgomery-like 63 | * ladder. Of course, this claim would need to be verified and tested to see which one is faster 64 | * for our specific scenario at the end. 65 | * 66 | * We accomplish a "little" speedup by using scalars converted to w-NAF (non-adjacent form): 67 | * The general idea of w-NAF is to pre-compute some zi coefficients like below to reduce the 68 | * costly point additions by using a non-binary ("signed") number system (values other than just 69 | * 0 and 1, but ranging from -2^(w-1)-1 to 2^(w-1)-1). This works best with the left-to-right 70 | * binary algorithm such that we just add zi * P when adding point P (we pre-compute all the 71 | * possible zi * P values because the x/y coordinates are known before the kernel starts): 72 | * 73 | * // Example with window size w = 2 (i.e. 
mod 4 => & 3): 74 | * // 173 => 1 0 -1 0 -1 0 -1 0 1 = 2^8 - 2^6 - 2^4 - 2^2 + 1 75 | * int e = 0b10101101; // 173 76 | * int z[8 + 1] = { 0 }; // our zi/di, we need one extra slot to make the subtraction work 77 | * 78 | * int i = 0; 79 | * 80 | * while (e) 81 | * { 82 | * if (e & 1) 83 | * { 84 | * // for window size w = 3 it would be: 85 | * // => 2^(w-0) = 2^3 = 8 86 | * // => 2^(w-1) = 2^2 = 4 87 | * 88 | * int bit; // = 2 - (e & 3) for w = 2 89 | * 90 | * if ((e & 3) >= 2) // e % 4 == e & 3, use (e & 7) >= 4 for w = 3 91 | * bit = (e & 3) - 4; // (e & 7) - 8 for w = 3 92 | * else 93 | * bit = e & 3; // e & 7 for w = 3 94 | * 95 | * z[i] = bit; 96 | * e -= bit; 97 | * } 98 | * 99 | * e >>= 1; // e / 2 100 | * i++; 101 | * } 102 | */ 103 | 104 | #include "inc_ecc_secp256k1.h" 105 | 106 | DECLSPEC u32 sub (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 107 | { 108 | u32 c = 0; // carry/borrow 109 | 110 | #if defined IS_NV && HAS_SUB == 1 && HAS_SUBC == 1 111 | asm volatile 112 | ( 113 | "sub.cc.u32 %0, %9, %17;" 114 | "subc.cc.u32 %1, %10, %18;" 115 | "subc.cc.u32 %2, %11, %19;" 116 | "subc.cc.u32 %3, %12, %20;" 117 | "subc.cc.u32 %4, %13, %21;" 118 | "subc.cc.u32 %5, %14, %22;" 119 | "subc.cc.u32 %6, %15, %23;" 120 | "subc.cc.u32 %7, %16, %24;" 121 | "subc.u32 %8, 0, 0;" 122 | : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), 123 | "=r"(c) 124 | : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), 125 | "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) 126 | ); 127 | // HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm 128 | //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 129 | #elif 0 130 | __asm__ __volatile__ 131 | ( 132 | "V_SUB_U32 %0, %9, %17;" 133 | "V_SUBB_U32 %1, %10, %18;" 134 | "V_SUBB_U32 %2, %11, %19;" 135 | "V_SUBB_U32 %3, %12, %20;" 136 | "V_SUBB_U32 %4, %13, %21;" 137 | "V_SUBB_U32 %5, %14, %22;" 138 | "V_SUBB_U32 %6, %15, %23;" 139 | "V_SUBB_U32 %7, %16, %24;" 140 | "V_SUBB_U32 %8, 0, 0;" 141 | : "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]), 142 | "=v"(c) 143 | : "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]), 144 | "v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7]) 145 | ); 146 | #else 147 | for (u32 i = 0; i < 8; i++) 148 | { 149 | const u32 diff = a[i] - b[i] - c; 150 | 151 | if (diff != a[i]) c = (diff > a[i]); 152 | 153 | r[i] = diff; 154 | } 155 | #endif 156 | 157 | return c; 158 | } 159 | 160 | DECLSPEC u32 add (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 161 | { 162 | u32 c = 0; // carry/borrow 163 | 164 | #if defined IS_NV && HAS_ADD == 1 && HAS_ADDC == 1 165 | asm volatile 166 | ( 167 | "add.cc.u32 %0, %9, %17;" 168 | "addc.cc.u32 %1, %10, %18;" 169 | "addc.cc.u32 %2, %11, %19;" 170 | "addc.cc.u32 %3, %12, %20;" 171 | "addc.cc.u32 %4, %13, %21;" 172 | "addc.cc.u32 %5, %14, %22;" 173 | "addc.cc.u32 %6, %15, %23;" 174 | "addc.cc.u32 %7, %16, %24;" 175 | "addc.u32 %8, 0, 0;" 176 | : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), 177 | "=r"(c) 178 | : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), 179 | "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) 180 | ); 181 | // HIP doesnt 
support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm 182 | //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 183 | #elif 0 184 | __asm__ __volatile__ 185 | ( 186 | "V_ADD_U32 %0, %9, %17;" 187 | "V_ADDC_U32 %1, %10, %18;" 188 | "V_ADDC_U32 %2, %11, %19;" 189 | "V_ADDC_U32 %3, %12, %20;" 190 | "V_ADDC_U32 %4, %13, %21;" 191 | "V_ADDC_U32 %5, %14, %22;" 192 | "V_ADDC_U32 %6, %15, %23;" 193 | "V_ADDC_U32 %7, %16, %24;" 194 | "V_ADDC_U32 %8, 0, 0;" 195 | : "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]), 196 | "=v"(c) 197 | : "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]), 198 | "v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7]) 199 | ); 200 | #else 201 | for (u32 i = 0; i < 8; i++) 202 | { 203 | const u32 t = a[i] + b[i] + c; 204 | 205 | if (t != a[i]) c = (t < a[i]); 206 | 207 | r[i] = t; 208 | } 209 | #endif 210 | 211 | return c; 212 | } 213 | 214 | DECLSPEC void sub_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 215 | { 216 | const u32 c = sub (r, a, b); // carry 217 | 218 | if (c) 219 | { 220 | u32 t[8]; 221 | 222 | t[0] = SECP256K1_P0; 223 | t[1] = SECP256K1_P1; 224 | t[2] = SECP256K1_P2; 225 | t[3] = SECP256K1_P3; 226 | t[4] = SECP256K1_P4; 227 | t[5] = SECP256K1_P5; 228 | t[6] = SECP256K1_P6; 229 | t[7] = SECP256K1_P7; 230 | 231 | add (r, r, t); 232 | } 233 | } 234 | 235 | DECLSPEC void add_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 236 | { 237 | const u32 c = add (r, a, b); // carry 238 | 239 | /* 240 | * Modulo operation: 241 | */ 242 | 243 | // note: we could have an early exit in case of c == 1 => sub () 244 | 245 | u32 t[8]; 246 | 247 | t[0] = SECP256K1_P0; 248 | t[1] = SECP256K1_P1; 249 | t[2] = SECP256K1_P2; 250 | t[3] = SECP256K1_P3; 251 | t[4] = SECP256K1_P4; 252 | t[5] = SECP256K1_P5; 253 | t[6] = SECP256K1_P6; 254 | t[7] = SECP256K1_P7; 255 | 256 | // check if modulo operation is needed 257 | 258 | u32 mod = 1; 259 | 260 | if (c == 0) 261 | { 262 | for (int i = 7; i >= 0; i--) 263 | { 264 | if (r[i] < t[i]) 265 | { 266 | mod = 0; 267 | 268 | break; // or return ! 
(check if faster) 269 | } 270 | 271 | if (r[i] > t[i]) break; 272 | } 273 | } 274 | 275 | if (mod == 1) 276 | { 277 | sub (r, r, t); 278 | } 279 | } 280 | 281 | DECLSPEC void mod_512 (PRIVATE_AS u32 *n) 282 | { 283 | // we need to perform a modulo operation with 512-bit % 256-bit (bignum modulo): 284 | // the modulus is the secp256k1 group order 285 | 286 | // ATTENTION: for this function the byte-order is reversed (most significant bytes 287 | // at the left) 288 | 289 | /* 290 | the general modulo by shift and substract code (a = a % b): 291 | 292 | x = b; 293 | 294 | t = a >> 1; 295 | 296 | while (x <= t) x <<= 1; 297 | 298 | while (a >= b) 299 | { 300 | if (a >= x) a -= x; 301 | 302 | x >>= 1; 303 | } 304 | 305 | return a; // remainder 306 | */ 307 | 308 | u32 a[16]; 309 | 310 | a[ 0] = n[ 0]; 311 | a[ 1] = n[ 1]; 312 | a[ 2] = n[ 2]; 313 | a[ 3] = n[ 3]; 314 | a[ 4] = n[ 4]; 315 | a[ 5] = n[ 5]; 316 | a[ 6] = n[ 6]; 317 | a[ 7] = n[ 7]; 318 | a[ 8] = n[ 8]; 319 | a[ 9] = n[ 9]; 320 | a[10] = n[10]; 321 | a[11] = n[11]; 322 | a[12] = n[12]; 323 | a[13] = n[13]; 324 | a[14] = n[14]; 325 | a[15] = n[15]; 326 | 327 | u32 b[16]; 328 | 329 | b[ 0] = 0x00000000; 330 | b[ 1] = 0x00000000; 331 | b[ 2] = 0x00000000; 332 | b[ 3] = 0x00000000; 333 | b[ 4] = 0x00000000; 334 | b[ 5] = 0x00000000; 335 | b[ 6] = 0x00000000; 336 | b[ 7] = 0x00000000; 337 | b[ 8] = SECP256K1_N7; 338 | b[ 9] = SECP256K1_N6; 339 | b[10] = SECP256K1_N5; 340 | b[11] = SECP256K1_N4; 341 | b[12] = SECP256K1_N3; 342 | b[13] = SECP256K1_N2; 343 | b[14] = SECP256K1_N1; 344 | b[15] = SECP256K1_N0; 345 | 346 | /* 347 | * Start: 348 | */ 349 | 350 | // x = b (but with a fast "shift" trick to avoid the while loop) 351 | 352 | u32 x[16]; 353 | 354 | x[ 0] = b[ 8]; // this is a trick: we just put the group order's most significant bit all the 355 | x[ 1] = b[ 9]; // way to the top to avoid doing the initial: while (x <= t) x <<= 1 356 | x[ 2] = b[10]; 357 | x[ 3] = b[11]; 358 | x[ 4] = b[12]; 359 | x[ 5] = b[13]; 360 | x[ 6] = b[14]; 361 | x[ 7] = b[15]; 362 | x[ 8] = 0x00000000; 363 | x[ 9] = 0x00000000; 364 | x[10] = 0x00000000; 365 | x[11] = 0x00000000; 366 | x[12] = 0x00000000; 367 | x[13] = 0x00000000; 368 | x[14] = 0x00000000; 369 | x[15] = 0x00000000; 370 | 371 | // a >= b 372 | 373 | while (a[0] >= b[0]) 374 | { 375 | u32 l00 = a[ 0] < b[ 0]; 376 | u32 l01 = a[ 1] < b[ 1]; 377 | u32 l02 = a[ 2] < b[ 2]; 378 | u32 l03 = a[ 3] < b[ 3]; 379 | u32 l04 = a[ 4] < b[ 4]; 380 | u32 l05 = a[ 5] < b[ 5]; 381 | u32 l06 = a[ 6] < b[ 6]; 382 | u32 l07 = a[ 7] < b[ 7]; 383 | u32 l08 = a[ 8] < b[ 8]; 384 | u32 l09 = a[ 9] < b[ 9]; 385 | u32 l10 = a[10] < b[10]; 386 | u32 l11 = a[11] < b[11]; 387 | u32 l12 = a[12] < b[12]; 388 | u32 l13 = a[13] < b[13]; 389 | u32 l14 = a[14] < b[14]; 390 | u32 l15 = a[15] < b[15]; 391 | 392 | u32 e00 = a[ 0] == b[ 0]; 393 | u32 e01 = a[ 1] == b[ 1]; 394 | u32 e02 = a[ 2] == b[ 2]; 395 | u32 e03 = a[ 3] == b[ 3]; 396 | u32 e04 = a[ 4] == b[ 4]; 397 | u32 e05 = a[ 5] == b[ 5]; 398 | u32 e06 = a[ 6] == b[ 6]; 399 | u32 e07 = a[ 7] == b[ 7]; 400 | u32 e08 = a[ 8] == b[ 8]; 401 | u32 e09 = a[ 9] == b[ 9]; 402 | u32 e10 = a[10] == b[10]; 403 | u32 e11 = a[11] == b[11]; 404 | u32 e12 = a[12] == b[12]; 405 | u32 e13 = a[13] == b[13]; 406 | u32 e14 = a[14] == b[14]; 407 | 408 | if (l00) break; 409 | if (l01 && e00) break; 410 | if (l02 && e00 && e01) break; 411 | if (l03 && e00 && e01 && e02) break; 412 | if (l04 && e00 && e01 && e02 && e03) break; 413 | if (l05 && e00 && e01 && e02 && e03 && e04) break; 414 | if 
(l06 && e00 && e01 && e02 && e03 && e04 && e05) break; 415 | if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) break; 416 | if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) break; 417 | if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) break; 418 | if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) break; 419 | if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) break; 420 | if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) break; 421 | if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) break; 422 | if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) break; 423 | if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) break; 424 | 425 | // r = x (copy it to have the original values for the subtraction) 426 | 427 | u32 r[16]; 428 | 429 | r[ 0] = x[ 0]; 430 | r[ 1] = x[ 1]; 431 | r[ 2] = x[ 2]; 432 | r[ 3] = x[ 3]; 433 | r[ 4] = x[ 4]; 434 | r[ 5] = x[ 5]; 435 | r[ 6] = x[ 6]; 436 | r[ 7] = x[ 7]; 437 | r[ 8] = x[ 8]; 438 | r[ 9] = x[ 9]; 439 | r[10] = x[10]; 440 | r[11] = x[11]; 441 | r[12] = x[12]; 442 | r[13] = x[13]; 443 | r[14] = x[14]; 444 | r[15] = x[15]; 445 | 446 | // x <<= 1 447 | 448 | x[15] = x[15] >> 1 | x[14] << 31; 449 | x[14] = x[14] >> 1 | x[13] << 31; 450 | x[13] = x[13] >> 1 | x[12] << 31; 451 | x[12] = x[12] >> 1 | x[11] << 31; 452 | x[11] = x[11] >> 1 | x[10] << 31; 453 | x[10] = x[10] >> 1 | x[ 9] << 31; 454 | x[ 9] = x[ 9] >> 1 | x[ 8] << 31; 455 | x[ 8] = x[ 8] >> 1 | x[ 7] << 31; 456 | x[ 7] = x[ 7] >> 1 | x[ 6] << 31; 457 | x[ 6] = x[ 6] >> 1 | x[ 5] << 31; 458 | x[ 5] = x[ 5] >> 1 | x[ 4] << 31; 459 | x[ 4] = x[ 4] >> 1 | x[ 3] << 31; 460 | x[ 3] = x[ 3] >> 1 | x[ 2] << 31; 461 | x[ 2] = x[ 2] >> 1 | x[ 1] << 31; 462 | x[ 1] = x[ 1] >> 1 | x[ 0] << 31; 463 | x[ 0] = x[ 0] >> 1; 464 | 465 | // if (a >= r) a -= r; 466 | 467 | l00 = a[ 0] < r[ 0]; 468 | l01 = a[ 1] < r[ 1]; 469 | l02 = a[ 2] < r[ 2]; 470 | l03 = a[ 3] < r[ 3]; 471 | l04 = a[ 4] < r[ 4]; 472 | l05 = a[ 5] < r[ 5]; 473 | l06 = a[ 6] < r[ 6]; 474 | l07 = a[ 7] < r[ 7]; 475 | l08 = a[ 8] < r[ 8]; 476 | l09 = a[ 9] < r[ 9]; 477 | l10 = a[10] < r[10]; 478 | l11 = a[11] < r[11]; 479 | l12 = a[12] < r[12]; 480 | l13 = a[13] < r[13]; 481 | l14 = a[14] < r[14]; 482 | l15 = a[15] < r[15]; 483 | 484 | e00 = a[ 0] == r[ 0]; 485 | e01 = a[ 1] == r[ 1]; 486 | e02 = a[ 2] == r[ 2]; 487 | e03 = a[ 3] == r[ 3]; 488 | e04 = a[ 4] == r[ 4]; 489 | e05 = a[ 5] == r[ 5]; 490 | e06 = a[ 6] == r[ 6]; 491 | e07 = a[ 7] == r[ 7]; 492 | e08 = a[ 8] == r[ 8]; 493 | e09 = a[ 9] == r[ 9]; 494 | e10 = a[10] == r[10]; 495 | e11 = a[11] == r[11]; 496 | e12 = a[12] == r[12]; 497 | e13 = a[13] == r[13]; 498 | e14 = a[14] == r[14]; 499 | 500 | if (l00) continue; 501 | if (l01 && e00) continue; 502 | if (l02 && e00 && e01) continue; 503 | if (l03 && e00 && e01 && e02) continue; 504 | if (l04 && e00 && e01 && e02 && e03) continue; 505 | if (l05 && e00 && e01 && e02 && e03 && e04) continue; 506 | if (l06 && e00 && e01 && e02 && e03 && e04 && e05) continue; 507 | if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) continue; 508 | if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) continue; 509 | if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) continue; 510 | if (l10 && e00 && e01 && e02 && 
e03 && e04 && e05 && e06 && e07 && e08 && e09) continue; 511 | if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) continue; 512 | if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) continue; 513 | if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) continue; 514 | if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) continue; 515 | if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) continue; 516 | 517 | // substract (a -= r): 518 | 519 | if ((r[ 0] | r[ 1] | r[ 2] | r[ 3] | r[ 4] | r[ 5] | r[ 6] | r[ 7] | 520 | r[ 8] | r[ 9] | r[10] | r[11] | r[12] | r[13] | r[14] | r[15]) == 0) break; 521 | 522 | r[ 0] = a[ 0] - r[ 0]; 523 | r[ 1] = a[ 1] - r[ 1]; 524 | r[ 2] = a[ 2] - r[ 2]; 525 | r[ 3] = a[ 3] - r[ 3]; 526 | r[ 4] = a[ 4] - r[ 4]; 527 | r[ 5] = a[ 5] - r[ 5]; 528 | r[ 6] = a[ 6] - r[ 6]; 529 | r[ 7] = a[ 7] - r[ 7]; 530 | r[ 8] = a[ 8] - r[ 8]; 531 | r[ 9] = a[ 9] - r[ 9]; 532 | r[10] = a[10] - r[10]; 533 | r[11] = a[11] - r[11]; 534 | r[12] = a[12] - r[12]; 535 | r[13] = a[13] - r[13]; 536 | r[14] = a[14] - r[14]; 537 | r[15] = a[15] - r[15]; 538 | 539 | // take care of the "borrow" (we can't do it the other way around 15...1 because r[x] is changed!) 540 | 541 | if (r[ 1] > a[ 1]) r[ 0]--; 542 | if (r[ 2] > a[ 2]) r[ 1]--; 543 | if (r[ 3] > a[ 3]) r[ 2]--; 544 | if (r[ 4] > a[ 4]) r[ 3]--; 545 | if (r[ 5] > a[ 5]) r[ 4]--; 546 | if (r[ 6] > a[ 6]) r[ 5]--; 547 | if (r[ 7] > a[ 7]) r[ 6]--; 548 | if (r[ 8] > a[ 8]) r[ 7]--; 549 | if (r[ 9] > a[ 9]) r[ 8]--; 550 | if (r[10] > a[10]) r[ 9]--; 551 | if (r[11] > a[11]) r[10]--; 552 | if (r[12] > a[12]) r[11]--; 553 | if (r[13] > a[13]) r[12]--; 554 | if (r[14] > a[14]) r[13]--; 555 | if (r[15] > a[15]) r[14]--; 556 | 557 | a[ 0] = r[ 0]; 558 | a[ 1] = r[ 1]; 559 | a[ 2] = r[ 2]; 560 | a[ 3] = r[ 3]; 561 | a[ 4] = r[ 4]; 562 | a[ 5] = r[ 5]; 563 | a[ 6] = r[ 6]; 564 | a[ 7] = r[ 7]; 565 | a[ 8] = r[ 8]; 566 | a[ 9] = r[ 9]; 567 | a[10] = r[10]; 568 | a[11] = r[11]; 569 | a[12] = r[12]; 570 | a[13] = r[13]; 571 | a[14] = r[14]; 572 | a[15] = r[15]; 573 | } 574 | 575 | n[ 0] = a[ 0]; 576 | n[ 1] = a[ 1]; 577 | n[ 2] = a[ 2]; 578 | n[ 3] = a[ 3]; 579 | n[ 4] = a[ 4]; 580 | n[ 5] = a[ 5]; 581 | n[ 6] = a[ 6]; 582 | n[ 7] = a[ 7]; 583 | n[ 8] = a[ 8]; 584 | n[ 9] = a[ 9]; 585 | n[10] = a[10]; 586 | n[11] = a[11]; 587 | n[12] = a[12]; 588 | n[13] = a[13]; 589 | n[14] = a[14]; 590 | n[15] = a[15]; 591 | } 592 | 593 | DECLSPEC void mul_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) // TODO get rid of u64 ? 
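The mod_512 () routine above unrolls the shift-and-subtract reduction sketched in its leading comment across sixteen u32 limbs, which makes the control flow hard to follow. As a point of reference, here is the same loop structure on ordinary 64-bit integers — a host-side sketch of my own, not part of the kernel; mod_shift_subtract () is a hypothetical helper and the b == 0 guard is an addition for this sketch only.

```c
#include <stdint.h>
#include <stdio.h>

/* generic a = a % b by shifting the divisor up and subtracting it back down,
 * exactly as described in the comment at the top of mod_512 () */
static uint64_t mod_shift_subtract (uint64_t a, const uint64_t b)
{
  if (b == 0) return a;       // guard for this sketch only; the kernel never divides by zero

  uint64_t x = b;

  const uint64_t t = a >> 1;

  while (x <= t) x <<= 1;     // align the divisor just below a (mod_512 () avoids this loop with its copy trick)

  while (a >= b)
  {
    if (a >= x) a -= x;       // subtract the current shifted divisor whenever it still fits

    x >>= 1;
  }

  return a;                   // remainder
}

int main (void)
{
  // 50000 % 977 = 173 (977 = 0x3d1, the same constant that shows up again in mul_mod ())
  printf ("%llu\n", (unsigned long long) mod_shift_subtract (50000, 977));

  return 0;
}
```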
594 | { 595 | u32 t[16] = { 0 }; // we need up to double the space (2 * 8) 596 | 597 | /* 598 | * First start with the basic a * b multiplication: 599 | */ 600 | 601 | u32 t0 = 0; 602 | u32 t1 = 0; 603 | u32 c = 0; 604 | 605 | for (u32 i = 0; i < 8; i++) 606 | { 607 | for (u32 j = 0; j <= i; j++) 608 | { 609 | u64 p = ((u64) a[j]) * b[i - j]; 610 | 611 | u64 d = ((u64) t1) << 32 | t0; 612 | 613 | d += p; 614 | 615 | t0 = (u32) d; 616 | t1 = d >> 32; 617 | 618 | c += d < p; // carry 619 | } 620 | 621 | t[i] = t0; 622 | 623 | t0 = t1; 624 | t1 = c; 625 | 626 | c = 0; 627 | } 628 | 629 | for (u32 i = 8; i < 15; i++) 630 | { 631 | for (u32 j = i - 7; j < 8; j++) 632 | { 633 | u64 p = ((u64) a[j]) * b[i - j]; 634 | 635 | u64 d = ((u64) t1) << 32 | t0; 636 | 637 | d += p; 638 | 639 | t0 = (u32) d; 640 | t1 = d >> 32; 641 | 642 | c += d < p; 643 | } 644 | 645 | t[i] = t0; 646 | 647 | t0 = t1; 648 | t1 = c; 649 | 650 | c = 0; 651 | } 652 | 653 | t[15] = t0; 654 | 655 | 656 | 657 | /* 658 | * Now do the modulo operation: 659 | * (r = t % p) 660 | * 661 | * http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf (p.354 or p.9 in that document) 662 | */ 663 | 664 | u32 tmp[16] = { 0 }; 665 | 666 | // c = 0; 667 | 668 | // Note: SECP256K1_P = 2^256 - 2^32 - 977 (0x03d1 = 977) 669 | // multiply t[8]...t[15] by omega: 670 | 671 | for (u32 i = 0, j = 8; i < 8; i++, j++) 672 | { 673 | u64 p = ((u64) 0x03d1) * t[j] + c; 674 | 675 | tmp[i] = (u32) p; 676 | 677 | c = p >> 32; 678 | } 679 | 680 | tmp[8] = c; 681 | 682 | c = add (tmp + 1, tmp + 1, t + 8); // modifies tmp[1]...tmp[8] 683 | 684 | tmp[9] = c; 685 | 686 | 687 | // r = t + tmp 688 | 689 | c = add (r, t, tmp); 690 | 691 | // multiply t[0]...t[7] by omega: 692 | 693 | u32 c2 = 0; 694 | 695 | // memset (t, 0, sizeof (t)); 696 | 697 | for (u32 i = 0, j = 8; i < 8; i++, j++) 698 | { 699 | u64 p = ((u64) 0x3d1) * tmp[j] + c2; 700 | 701 | t[i] = (u32) p; 702 | 703 | c2 = p >> 32; 704 | } 705 | 706 | t[8] = c2; 707 | 708 | c2 = add (t + 1, t + 1, tmp + 8); // modifies t[1]...t[8] 709 | 710 | t[9] = c2; 711 | 712 | 713 | // r = r + t 714 | 715 | c2 = add (r, r, t); 716 | 717 | c += c2; 718 | 719 | t[0] = SECP256K1_P0; 720 | t[1] = SECP256K1_P1; 721 | t[2] = SECP256K1_P2; 722 | t[3] = SECP256K1_P3; 723 | t[4] = SECP256K1_P4; 724 | t[5] = SECP256K1_P5; 725 | t[6] = SECP256K1_P6; 726 | t[7] = SECP256K1_P7; 727 | 728 | for (u32 i = c; i > 0; i--) 729 | { 730 | sub (r, r, t); 731 | } 732 | 733 | for (int i = 7; i >= 0; i--) 734 | { 735 | if (r[i] < t[i]) break; 736 | 737 | if (r[i] > t[i]) 738 | { 739 | sub (r, r, t); 740 | 741 | break; 742 | } 743 | } 744 | } 745 | 746 | DECLSPEC void sqrt_mod (PRIVATE_AS u32 *r) 747 | { 748 | // Fermat's Little Theorem 749 | // secp256k1: y^2 = x^3 + 7 % p 750 | // y ^ (p - 1) = 1 751 | // y ^ (p - 1) = (y^2) ^ ((p - 1) / 2) = 1 => y^2 = (y^2) ^ (((p - 1) / 2) + 1) 752 | // => y = (y^2) ^ ((((p - 1) / 2) + 1) / 2) 753 | // y = (y^2) ^ (((p - 1 + 2) / 2) / 2) = (y^2) ^ ((p + 1) / 4) 754 | 755 | // y1 = (x^3 + 7) ^ ((p + 1) / 4) 756 | // y2 = p - y1 (or y2 = y1 * -1 % p) 757 | 758 | u32 s[8]; 759 | 760 | s[0] = SECP256K1_P0 + 1; // because of (p + 1) / 4 or use add (s, s, 1) 761 | s[1] = SECP256K1_P1; 762 | s[2] = SECP256K1_P2; 763 | s[3] = SECP256K1_P3; 764 | s[4] = SECP256K1_P4; 765 | s[5] = SECP256K1_P5; 766 | s[6] = SECP256K1_P6; 767 | s[7] = SECP256K1_P7; 768 | 769 | u32 t[8] = { 0 }; 770 | 771 | t[0] = 1; 772 | 773 | for (u32 i = 255; i > 1; i--) // we just skip the last 2 multiplications (=> exp / 4) 774 | { 775 | mul_mod (t, 
t, t); // r * r 776 | 777 | u32 idx = i >> 5; 778 | u32 mask = 1 << (i & 0x1f); 779 | 780 | if (s[idx] & mask) 781 | { 782 | mul_mod (t, t, r); // t * r 783 | } 784 | } 785 | 786 | r[0] = t[0]; 787 | r[1] = t[1]; 788 | r[2] = t[2]; 789 | r[3] = t[3]; 790 | r[4] = t[4]; 791 | r[5] = t[5]; 792 | r[6] = t[6]; 793 | r[7] = t[7]; 794 | } 795 | 796 | // (inverse (a, p) * a) % p == 1 (or think of a * a^-1 = a / a = 1) 797 | 798 | DECLSPEC void inv_mod (PRIVATE_AS u32 *a) 799 | { 800 | // How often does this really happen? it should "almost" never happen (but would be safer) 801 | // if ((a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0) return; 802 | 803 | u32 t0[8]; 804 | 805 | t0[0] = a[0]; 806 | t0[1] = a[1]; 807 | t0[2] = a[2]; 808 | t0[3] = a[3]; 809 | t0[4] = a[4]; 810 | t0[5] = a[5]; 811 | t0[6] = a[6]; 812 | t0[7] = a[7]; 813 | 814 | u32 p[8]; 815 | 816 | p[0] = SECP256K1_P0; 817 | p[1] = SECP256K1_P1; 818 | p[2] = SECP256K1_P2; 819 | p[3] = SECP256K1_P3; 820 | p[4] = SECP256K1_P4; 821 | p[5] = SECP256K1_P5; 822 | p[6] = SECP256K1_P6; 823 | p[7] = SECP256K1_P7; 824 | 825 | u32 t1[8]; 826 | 827 | t1[0] = SECP256K1_P0; 828 | t1[1] = SECP256K1_P1; 829 | t1[2] = SECP256K1_P2; 830 | t1[3] = SECP256K1_P3; 831 | t1[4] = SECP256K1_P4; 832 | t1[5] = SECP256K1_P5; 833 | t1[6] = SECP256K1_P6; 834 | t1[7] = SECP256K1_P7; 835 | 836 | u32 t2[8] = { 0 }; 837 | 838 | t2[0] = 0x00000001; 839 | 840 | u32 t3[8] = { 0 }; 841 | 842 | u32 b = (t0[0] != t1[0]) 843 | | (t0[1] != t1[1]) 844 | | (t0[2] != t1[2]) 845 | | (t0[3] != t1[3]) 846 | | (t0[4] != t1[4]) 847 | | (t0[5] != t1[5]) 848 | | (t0[6] != t1[6]) 849 | | (t0[7] != t1[7]); 850 | 851 | while (b) 852 | { 853 | if ((t0[0] & 1) == 0) // even 854 | { 855 | t0[0] = t0[0] >> 1 | t0[1] << 31; 856 | t0[1] = t0[1] >> 1 | t0[2] << 31; 857 | t0[2] = t0[2] >> 1 | t0[3] << 31; 858 | t0[3] = t0[3] >> 1 | t0[4] << 31; 859 | t0[4] = t0[4] >> 1 | t0[5] << 31; 860 | t0[5] = t0[5] >> 1 | t0[6] << 31; 861 | t0[6] = t0[6] >> 1 | t0[7] << 31; 862 | t0[7] = t0[7] >> 1; 863 | 864 | u32 c = 0; 865 | 866 | if (t2[0] & 1) c = add (t2, t2, p); 867 | 868 | t2[0] = t2[0] >> 1 | t2[1] << 31; 869 | t2[1] = t2[1] >> 1 | t2[2] << 31; 870 | t2[2] = t2[2] >> 1 | t2[3] << 31; 871 | t2[3] = t2[3] >> 1 | t2[4] << 31; 872 | t2[4] = t2[4] >> 1 | t2[5] << 31; 873 | t2[5] = t2[5] >> 1 | t2[6] << 31; 874 | t2[6] = t2[6] >> 1 | t2[7] << 31; 875 | t2[7] = t2[7] >> 1 | c << 31; 876 | } 877 | else if ((t1[0] & 1) == 0) 878 | { 879 | t1[0] = t1[0] >> 1 | t1[1] << 31; 880 | t1[1] = t1[1] >> 1 | t1[2] << 31; 881 | t1[2] = t1[2] >> 1 | t1[3] << 31; 882 | t1[3] = t1[3] >> 1 | t1[4] << 31; 883 | t1[4] = t1[4] >> 1 | t1[5] << 31; 884 | t1[5] = t1[5] >> 1 | t1[6] << 31; 885 | t1[6] = t1[6] >> 1 | t1[7] << 31; 886 | t1[7] = t1[7] >> 1; 887 | 888 | u32 c = 0; 889 | 890 | if (t3[0] & 1) c = add (t3, t3, p); 891 | 892 | t3[0] = t3[0] >> 1 | t3[1] << 31; 893 | t3[1] = t3[1] >> 1 | t3[2] << 31; 894 | t3[2] = t3[2] >> 1 | t3[3] << 31; 895 | t3[3] = t3[3] >> 1 | t3[4] << 31; 896 | t3[4] = t3[4] >> 1 | t3[5] << 31; 897 | t3[5] = t3[5] >> 1 | t3[6] << 31; 898 | t3[6] = t3[6] >> 1 | t3[7] << 31; 899 | t3[7] = t3[7] >> 1 | c << 31; 900 | } 901 | else 902 | { 903 | u32 gt = 0; 904 | 905 | for (int i = 7; i >= 0; i--) 906 | { 907 | if (t0[i] > t1[i]) 908 | { 909 | gt = 1; 910 | 911 | break; 912 | } 913 | 914 | if (t0[i] < t1[i]) break; 915 | } 916 | 917 | if (gt) 918 | { 919 | sub (t0, t0, t1); 920 | 921 | t0[0] = t0[0] >> 1 | t0[1] << 31; 922 | t0[1] = t0[1] >> 1 | t0[2] << 31; 923 | t0[2] = t0[2] >> 1 | 
t0[3] << 31; 924 | t0[3] = t0[3] >> 1 | t0[4] << 31; 925 | t0[4] = t0[4] >> 1 | t0[5] << 31; 926 | t0[5] = t0[5] >> 1 | t0[6] << 31; 927 | t0[6] = t0[6] >> 1 | t0[7] << 31; 928 | t0[7] = t0[7] >> 1; 929 | 930 | u32 lt = 0; 931 | 932 | for (int i = 7; i >= 0; i--) 933 | { 934 | if (t2[i] < t3[i]) 935 | { 936 | lt = 1; 937 | 938 | break; 939 | } 940 | 941 | if (t2[i] > t3[i]) break; 942 | } 943 | 944 | if (lt) add (t2, t2, p); 945 | 946 | sub (t2, t2, t3); 947 | 948 | u32 c = 0; 949 | 950 | if (t2[0] & 1) c = add (t2, t2, p); 951 | 952 | t2[0] = t2[0] >> 1 | t2[1] << 31; 953 | t2[1] = t2[1] >> 1 | t2[2] << 31; 954 | t2[2] = t2[2] >> 1 | t2[3] << 31; 955 | t2[3] = t2[3] >> 1 | t2[4] << 31; 956 | t2[4] = t2[4] >> 1 | t2[5] << 31; 957 | t2[5] = t2[5] >> 1 | t2[6] << 31; 958 | t2[6] = t2[6] >> 1 | t2[7] << 31; 959 | t2[7] = t2[7] >> 1 | c << 31; 960 | } 961 | else 962 | { 963 | sub (t1, t1, t0); 964 | 965 | t1[0] = t1[0] >> 1 | t1[1] << 31; 966 | t1[1] = t1[1] >> 1 | t1[2] << 31; 967 | t1[2] = t1[2] >> 1 | t1[3] << 31; 968 | t1[3] = t1[3] >> 1 | t1[4] << 31; 969 | t1[4] = t1[4] >> 1 | t1[5] << 31; 970 | t1[5] = t1[5] >> 1 | t1[6] << 31; 971 | t1[6] = t1[6] >> 1 | t1[7] << 31; 972 | t1[7] = t1[7] >> 1; 973 | 974 | u32 lt = 0; 975 | 976 | for (int i = 7; i >= 0; i--) 977 | { 978 | if (t3[i] < t2[i]) 979 | { 980 | lt = 1; 981 | 982 | break; 983 | } 984 | 985 | if (t3[i] > t2[i]) break; 986 | } 987 | 988 | if (lt) add (t3, t3, p); 989 | 990 | sub (t3, t3, t2); 991 | 992 | u32 c = 0; 993 | 994 | if (t3[0] & 1) c = add (t3, t3, p); 995 | 996 | t3[0] = t3[0] >> 1 | t3[1] << 31; 997 | t3[1] = t3[1] >> 1 | t3[2] << 31; 998 | t3[2] = t3[2] >> 1 | t3[3] << 31; 999 | t3[3] = t3[3] >> 1 | t3[4] << 31; 1000 | t3[4] = t3[4] >> 1 | t3[5] << 31; 1001 | t3[5] = t3[5] >> 1 | t3[6] << 31; 1002 | t3[6] = t3[6] >> 1 | t3[7] << 31; 1003 | t3[7] = t3[7] >> 1 | c << 31; 1004 | } 1005 | } 1006 | 1007 | // update b: 1008 | 1009 | b = (t0[0] != t1[0]) 1010 | | (t0[1] != t1[1]) 1011 | | (t0[2] != t1[2]) 1012 | | (t0[3] != t1[3]) 1013 | | (t0[4] != t1[4]) 1014 | | (t0[5] != t1[5]) 1015 | | (t0[6] != t1[6]) 1016 | | (t0[7] != t1[7]); 1017 | } 1018 | 1019 | // set result: 1020 | 1021 | a[0] = t2[0]; 1022 | a[1] = t2[1]; 1023 | a[2] = t2[2]; 1024 | a[3] = t2[3]; 1025 | a[4] = t2[4]; 1026 | a[5] = t2[5]; 1027 | a[6] = t2[6]; 1028 | a[7] = t2[7]; 1029 | } 1030 | 1031 | /* 1032 | // everything from the formulas below of course MOD the prime: 1033 | 1034 | // we use this formula: 1035 | 1036 | X = (3/2 * x^2)^2 - 2 * x * y^2 1037 | Y = (3/2 * x^2) * (x * y^2 - X) - y^4 1038 | Z = y * z 1039 | 1040 | this is identical to the more frequently used form: 1041 | 1042 | X = (3 * x^2)^2 - 8 * x * y^2 1043 | Y = 3 * x^2 * (4 * x * y^2 - X) - 8 * y^4 1044 | Z = 2 * y * z 1045 | */ 1046 | 1047 | DECLSPEC void point_double (PRIVATE_AS u32 *x, PRIVATE_AS u32 *y, PRIVATE_AS u32 *z) 1048 | { 1049 | // How often does this really happen? 
it should "almost" never happen (but would be safer) 1050 | 1051 | /* 1052 | if ((y[0] | y[1] | y[2] | y[3] | y[4] | y[5] | y[6] | y[7]) == 0) 1053 | { 1054 | x[0] = 0; 1055 | x[1] = 0; 1056 | x[2] = 0; 1057 | x[3] = 0; 1058 | x[4] = 0; 1059 | x[5] = 0; 1060 | x[6] = 0; 1061 | x[7] = 0; 1062 | 1063 | y[0] = 0; 1064 | y[1] = 0; 1065 | y[2] = 0; 1066 | y[3] = 0; 1067 | y[4] = 0; 1068 | y[5] = 0; 1069 | y[6] = 0; 1070 | y[7] = 0; 1071 | 1072 | z[0] = 0; 1073 | z[1] = 0; 1074 | z[2] = 0; 1075 | z[3] = 0; 1076 | z[4] = 0; 1077 | z[5] = 0; 1078 | z[6] = 0; 1079 | z[7] = 0; 1080 | 1081 | return; 1082 | } 1083 | */ 1084 | 1085 | u32 t1[8]; 1086 | 1087 | t1[0] = x[0]; 1088 | t1[1] = x[1]; 1089 | t1[2] = x[2]; 1090 | t1[3] = x[3]; 1091 | t1[4] = x[4]; 1092 | t1[5] = x[5]; 1093 | t1[6] = x[6]; 1094 | t1[7] = x[7]; 1095 | 1096 | u32 t2[8]; 1097 | 1098 | t2[0] = y[0]; 1099 | t2[1] = y[1]; 1100 | t2[2] = y[2]; 1101 | t2[3] = y[3]; 1102 | t2[4] = y[4]; 1103 | t2[5] = y[5]; 1104 | t2[6] = y[6]; 1105 | t2[7] = y[7]; 1106 | 1107 | u32 t3[8]; 1108 | 1109 | t3[0] = z[0]; 1110 | t3[1] = z[1]; 1111 | t3[2] = z[2]; 1112 | t3[3] = z[3]; 1113 | t3[4] = z[4]; 1114 | t3[5] = z[5]; 1115 | t3[6] = z[6]; 1116 | t3[7] = z[7]; 1117 | 1118 | u32 t4[8]; 1119 | u32 t5[8]; 1120 | u32 t6[8]; 1121 | 1122 | mul_mod (t4, t1, t1); // t4 = x^2 1123 | 1124 | mul_mod (t5, t2, t2); // t5 = y^2 1125 | 1126 | mul_mod (t1, t1, t5); // t1 = x*y^2 1127 | 1128 | mul_mod (t5, t5, t5); // t5 = t5^2 = y^4 1129 | 1130 | // here the z^2 and z^4 is not needed for a = 0 1131 | 1132 | mul_mod (t3, t2, t3); // t3 = x * z 1133 | 1134 | add_mod (t2, t4, t4); // t2 = 2 * t4 = 2 * x^2 1135 | add_mod (t4, t4, t2); // t4 = 3 * t4 = 3 * x^2 1136 | 1137 | // a * z^4 = 0 * 1^4 = 0 1138 | 1139 | // don't discard the least significant bit it's important too! 
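The statements that follow halve t4 = 3·x² modulo P. Because P is odd, adding P to an odd value makes it even without leaving its residue class, so the carry-aware right shift is an exact division by two mod P — that is what the warning about the least significant bit refers to. A minimal host-side sketch with small numbers (my own illustration; halve_mod () is a hypothetical helper, not a kernel function):

```c
#include <stdint.h>
#include <stdio.h>

/* halve x modulo an odd prime p: if x is odd, x + p is even and still congruent to x (mod p) */
static uint32_t halve_mod (const uint32_t x, const uint32_t p)
{
  uint64_t t = x;                 // widen so the overflow of x + p is not lost

  if (t & 1) t += p;              // the kernel does the same with add (t4, t4, t) and keeps the carry in c

  return (uint32_t) (t >> 1);     // the shift then divides exactly
}

int main (void)
{
  // 7 / 2 mod 11: halve_mod (7, 11) = (7 + 11) / 2 = 9, and indeed 2 * 9 = 18 = 7 (mod 11)
  printf ("%u\n", halve_mod (7, 11));

  return 0;
}
```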
1140 | 1141 | u32 c = 0; 1142 | 1143 | if (t4[0] & 1) 1144 | { 1145 | u32 t[8]; 1146 | 1147 | t[0] = SECP256K1_P0; 1148 | t[1] = SECP256K1_P1; 1149 | t[2] = SECP256K1_P2; 1150 | t[3] = SECP256K1_P3; 1151 | t[4] = SECP256K1_P4; 1152 | t[5] = SECP256K1_P5; 1153 | t[6] = SECP256K1_P6; 1154 | t[7] = SECP256K1_P7; 1155 | 1156 | c = add (t4, t4, t); // t4 + SECP256K1_P 1157 | } 1158 | 1159 | // right shift (t4 / 2): 1160 | 1161 | t4[0] = t4[0] >> 1 | t4[1] << 31; 1162 | t4[1] = t4[1] >> 1 | t4[2] << 31; 1163 | t4[2] = t4[2] >> 1 | t4[3] << 31; 1164 | t4[3] = t4[3] >> 1 | t4[4] << 31; 1165 | t4[4] = t4[4] >> 1 | t4[5] << 31; 1166 | t4[5] = t4[5] >> 1 | t4[6] << 31; 1167 | t4[6] = t4[6] >> 1 | t4[7] << 31; 1168 | t4[7] = t4[7] >> 1 | c << 31; 1169 | 1170 | mul_mod (t6, t4, t4); // t6 = t4^2 = (3/2 * x^2)^2 1171 | 1172 | add_mod (t2, t1, t1); // t2 = 2 * t1 1173 | 1174 | sub_mod (t6, t6, t2); // t6 = t6 - t2 1175 | sub_mod (t1, t1, t6); // t1 = t1 - t6 1176 | 1177 | mul_mod (t4, t4, t1); // t4 = t4 * t1 1178 | 1179 | sub_mod (t1, t4, t5); // t1 = t4 - t5 1180 | 1181 | // => x = t6, y = t1, z = t3: 1182 | 1183 | x[0] = t6[0]; 1184 | x[1] = t6[1]; 1185 | x[2] = t6[2]; 1186 | x[3] = t6[3]; 1187 | x[4] = t6[4]; 1188 | x[5] = t6[5]; 1189 | x[6] = t6[6]; 1190 | x[7] = t6[7]; 1191 | 1192 | y[0] = t1[0]; 1193 | y[1] = t1[1]; 1194 | y[2] = t1[2]; 1195 | y[3] = t1[3]; 1196 | y[4] = t1[4]; 1197 | y[5] = t1[5]; 1198 | y[6] = t1[6]; 1199 | y[7] = t1[7]; 1200 | 1201 | z[0] = t3[0]; 1202 | z[1] = t3[1]; 1203 | z[2] = t3[2]; 1204 | z[3] = t3[3]; 1205 | z[4] = t3[4]; 1206 | z[5] = t3[5]; 1207 | z[6] = t3[6]; 1208 | z[7] = t3[7]; 1209 | } 1210 | 1211 | /* 1212 | * madd-2004-hmv: 1213 | * (from https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html) 1214 | * t1 = z1^2 1215 | * t2 = t1*z1 1216 | * t1 = t1*x2 1217 | * t2 = t2*y2 1218 | * t1 = t1-x1 1219 | * t2 = t2-y1 1220 | * z3 = z1*t1 1221 | * t3 = t1^2 1222 | * t4 = t3*t1 1223 | * t3 = t3*x1 1224 | * t1 = 2*t3 1225 | * x3 = t2^2 1226 | * x3 = x3-t1 1227 | * x3 = x3-t4 1228 | * t3 = t3-x3 1229 | * t3 = t3*t2 1230 | * t4 = t4*y1 1231 | * y3 = t3-t4 1232 | */ 1233 | 1234 | DECLSPEC void point_add (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS u32 *z1, PRIVATE_AS u32 *x2, PRIVATE_AS u32 *y2) // z2 = 1 1235 | { 1236 | // How often does this really happen? it should "almost" never happen (but would be safer) 1237 | 1238 | /* 1239 | if ((y2[0] | y2[1] | y2[2] | y2[3] | y2[4] | y2[5] | y2[6] | y2[7]) == 0) return; 1240 | 1241 | if ((y1[0] | y1[1] | y1[2] | y1[3] | y1[4] | y1[5] | y1[6] | y1[7]) == 0) 1242 | { 1243 | x1[0] = x2[0]; 1244 | x1[1] = x2[1]; 1245 | x1[2] = x2[2]; 1246 | x1[3] = x2[3]; 1247 | x1[4] = x2[4]; 1248 | x1[5] = x2[5]; 1249 | x1[6] = x2[6]; 1250 | x1[7] = x2[7]; 1251 | 1252 | y1[0] = y2[0]; 1253 | y1[1] = y2[1]; 1254 | y1[2] = y2[2]; 1255 | y1[3] = y2[3]; 1256 | y1[4] = y2[4]; 1257 | y1[5] = y2[5]; 1258 | y1[6] = y2[6]; 1259 | y1[7] = y2[7]; 1260 | 1261 | z1[0] = z2[0]; 1262 | z1[1] = z2[1]; 1263 | z1[2] = z2[2]; 1264 | z1[3] = z2[3]; 1265 | z1[4] = z2[4]; 1266 | z1[5] = z2[5]; 1267 | z1[6] = z2[6]; 1268 | z1[7] = z2[7]; 1269 | 1270 | return; 1271 | } 1272 | */ 1273 | 1274 | // if x1 == x2 and y2 == y2 and z2 == z2 we need to double instead? 
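point_double () above and point_add () below implement these formulas in Jacobian coordinates over the 256-bit field, which makes them hard to check by eye. As the preceding comment notes, adding a point to itself is not covered by this addition formula (the slope computation degenerates); with z2 = 1 that case is x1 == x2·z1² and y1 == y2·z1³ (mod p), and a caller that cannot rule it out would branch to point_double () instead. For a pen-and-paper cross-check, here is the plain affine group law on a toy analogue of secp256k1 — the same equation y² = x³ + 7, but over F_11. This is my own standalone sketch; the field size, the point (6, 5), and the helper names are choices made for this illustration only.

```c
#include <stdio.h>

#define P 11                                     // toy field; secp256k1 uses p = 2^256 - 2^32 - 977

static int inv_mod_p (const int a)               // a^(P-2) mod P by repeated multiplication (Fermat)
{
  int r = 1;
  for (int i = 0; i < P - 2; i++) r = (r * a) % P;
  return r;
}

static void pt_double (int *x, int *y)           // affine doubling, lambda = 3x^2 / 2y (assumes y != 0)
{
  const int l  = (3 * *x % P) * *x % P * inv_mod_p (2 * *y % P) % P;
  const int x3 = ((l * l - 2 * *x) % P + 2 * P) % P;
  const int y3 = ((l * (*x - x3) - *y) % P + 2 * P) % P;
  *x = x3; *y = y3;
}

static void pt_add (int *x1, int *y1, const int x2, const int y2)  // affine add, assumes x1 != x2
{
  const int l  = (y2 - *y1 + P) % P * inv_mod_p ((x2 - *x1 + P) % P) % P;
  const int x3 = ((l * l - *x1 - x2) % P + 2 * P) % P;
  const int y3 = ((l * (*x1 - x3) - *y1) % P + 2 * P) % P;
  *x1 = x3; *y1 = y3;
}

int main (void)
{
  int x = 6, y = 5;               // (6, 5) lies on y^2 = x^3 + 7 over F_11: 25 = 3 = 216 + 7 (mod 11)

  pt_double (&x, &y);             // 2P = (3, 1)
  printf ("2P = (%d, %d)\n", x, y);

  pt_add (&x, &y, 6, 5);          // 3P = 2P + P = (5, 0)
  printf ("3P = (%d, %d)\n", x, y);

  return 0;
}
```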
1275 | 1276 | // x1/y1/z1: 1277 | 1278 | u32 t1[8]; 1279 | 1280 | t1[0] = x1[0]; 1281 | t1[1] = x1[1]; 1282 | t1[2] = x1[2]; 1283 | t1[3] = x1[3]; 1284 | t1[4] = x1[4]; 1285 | t1[5] = x1[5]; 1286 | t1[6] = x1[6]; 1287 | t1[7] = x1[7]; 1288 | 1289 | u32 t2[8]; 1290 | 1291 | t2[0] = y1[0]; 1292 | t2[1] = y1[1]; 1293 | t2[2] = y1[2]; 1294 | t2[3] = y1[3]; 1295 | t2[4] = y1[4]; 1296 | t2[5] = y1[5]; 1297 | t2[6] = y1[6]; 1298 | t2[7] = y1[7]; 1299 | 1300 | u32 t3[8]; 1301 | 1302 | t3[0] = z1[0]; 1303 | t3[1] = z1[1]; 1304 | t3[2] = z1[2]; 1305 | t3[3] = z1[3]; 1306 | t3[4] = z1[4]; 1307 | t3[5] = z1[5]; 1308 | t3[6] = z1[6]; 1309 | t3[7] = z1[7]; 1310 | 1311 | // x2/y2: 1312 | 1313 | u32 t4[8]; 1314 | 1315 | t4[0] = x2[0]; 1316 | t4[1] = x2[1]; 1317 | t4[2] = x2[2]; 1318 | t4[3] = x2[3]; 1319 | t4[4] = x2[4]; 1320 | t4[5] = x2[5]; 1321 | t4[6] = x2[6]; 1322 | t4[7] = x2[7]; 1323 | 1324 | u32 t5[8]; 1325 | 1326 | t5[0] = y2[0]; 1327 | t5[1] = y2[1]; 1328 | t5[2] = y2[2]; 1329 | t5[3] = y2[3]; 1330 | t5[4] = y2[4]; 1331 | t5[5] = y2[5]; 1332 | t5[6] = y2[6]; 1333 | t5[7] = y2[7]; 1334 | 1335 | u32 t6[8]; 1336 | u32 t7[8]; 1337 | u32 t8[8]; 1338 | u32 t9[8]; 1339 | 1340 | mul_mod (t6, t3, t3); // t6 = t3^2 1341 | 1342 | mul_mod (t7, t6, t3); // t7 = t6*t3 1343 | mul_mod (t6, t6, t4); // t6 = t6*t4 1344 | mul_mod (t7, t7, t5); // t7 = t7*t5 1345 | 1346 | sub_mod (t6, t6, t1); // t6 = t6-t1 1347 | sub_mod (t7, t7, t2); // t7 = t7-t2 1348 | 1349 | mul_mod (t8, t3, t6); // t8 = t3*t6 1350 | mul_mod (t4, t6, t6); // t4 = t6^2 1351 | mul_mod (t9, t4, t6); // t9 = t4*t6 1352 | mul_mod (t4, t4, t1); // t4 = t4*t1 1353 | 1354 | // left shift (t4 * 2): 1355 | 1356 | t6[7] = t4[7] << 1 | t4[6] >> 31; 1357 | t6[6] = t4[6] << 1 | t4[5] >> 31; 1358 | t6[5] = t4[5] << 1 | t4[4] >> 31; 1359 | t6[4] = t4[4] << 1 | t4[3] >> 31; 1360 | t6[3] = t4[3] << 1 | t4[2] >> 31; 1361 | t6[2] = t4[2] << 1 | t4[1] >> 31; 1362 | t6[1] = t4[1] << 1 | t4[0] >> 31; 1363 | t6[0] = t4[0] << 1; 1364 | 1365 | // don't discard the most significant bit, it's important too! 
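The left shift above doubles t4 into t6, and the check that follows folds a bit pushed past position 255 back in: since p = 2^256 - 2^32 - 977 (as noted in mul_mod ()), the lost 2^256 is congruent to 2^32 + 977 = 0x1000003D1 (mod p), which is exactly the two-word constant a[1] = 1, a[0] = 0x3d1 added below. The identity can be checked limb by limb on the host. The sketch below is my own; the limb values written out are simply p in little-endian u32 limbs, i.e. the values the SECP256K1_P0..P7 macros are expected to hold.

```c
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  // p = 2^256 - 2^32 - 977 in little-endian u32 limbs
  const uint32_t p[8] = { 0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff,
                          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };

  // the correction constant used in the kernel: a[0] = 0x3d1, a[1] = 1, i.e. 2^32 + 977
  const uint32_t omega[8] = { 0x000003d1, 0x00000001, 0, 0, 0, 0, 0, 0 };

  uint32_t r[8];
  uint32_t carry = 0;

  for (int i = 0; i < 8; i++)
  {
    const uint64_t s = (uint64_t) p[i] + omega[i] + carry;   // schoolbook add with carry

    r[i]  = (uint32_t) s;
    carry = (uint32_t) (s >> 32);
  }

  // every limb comes out zero and the carry out is 1: p + (2^32 + 977) = 2^256,
  // so a bit shifted past position 255 can be folded back in by adding 2^32 + 977
  for (int i = 0; i < 8; i++) printf ("r[%d] = 0x%08x\n", i, r[i]);

  printf ("carry = %u\n", carry);

  return 0;
}
```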
1366 | 1367 | if (t4[7] & 0x80000000) 1368 | { 1369 | // use most significant bit and perform mod P, since we have: t4 * 2 % P 1370 | 1371 | u32 a[8] = { 0 }; 1372 | 1373 | a[1] = 1; 1374 | a[0] = 0x000003d1; // omega (see: mul_mod ()) 1375 | 1376 | add (t6, t6, a); 1377 | } 1378 | 1379 | mul_mod (t5, t7, t7); // t5 = t7*t7 1380 | 1381 | sub_mod (t5, t5, t6); // t5 = t5-t6 1382 | sub_mod (t5, t5, t9); // t5 = t5-t9 1383 | sub_mod (t4, t4, t5); // t4 = t4-t5 1384 | 1385 | mul_mod (t4, t4, t7); // t4 = t4*t7 1386 | mul_mod (t9, t9, t2); // t9 = t9*t2 1387 | 1388 | sub_mod (t9, t4, t9); // t9 = t4-t9 1389 | 1390 | x1[0] = t5[0]; 1391 | x1[1] = t5[1]; 1392 | x1[2] = t5[2]; 1393 | x1[3] = t5[3]; 1394 | x1[4] = t5[4]; 1395 | x1[5] = t5[5]; 1396 | x1[6] = t5[6]; 1397 | x1[7] = t5[7]; 1398 | 1399 | y1[0] = t9[0]; 1400 | y1[1] = t9[1]; 1401 | y1[2] = t9[2]; 1402 | y1[3] = t9[3]; 1403 | y1[4] = t9[4]; 1404 | y1[5] = t9[5]; 1405 | y1[6] = t9[6]; 1406 | y1[7] = t9[7]; 1407 | 1408 | z1[0] = t8[0]; 1409 | z1[1] = t8[1]; 1410 | z1[2] = t8[2]; 1411 | z1[3] = t8[3]; 1412 | z1[4] = t8[4]; 1413 | z1[5] = t8[5]; 1414 | z1[6] = t8[6]; 1415 | z1[7] = t8[7]; 1416 | } 1417 | 1418 | DECLSPEC void point_get_coords (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, PRIVATE_AS const u32 *y) 1419 | { 1420 | /* 1421 | pre-compute 1/-1, 3/-3, 5/-5, 7/-7 times P (x, y) 1422 | for wNAF with window size 4 (max/min: +/- 2^3-1): -7, -5, -3, -1, 1, 3, 5, 7 1423 | 1424 | +x1 ( 0) 1425 | +y1 ( 8) 1426 | -y1 (16) 1427 | 1428 | +x3 (24) 1429 | +y3 (32) 1430 | -y3 (40) 1431 | 1432 | +x5 (48) 1433 | +y5 (56) 1434 | -y5 (64) 1435 | 1436 | +x7 (72) 1437 | +y7 (80) 1438 | -y7 (88) 1439 | */ 1440 | 1441 | // note: we use jacobian forms with (x, y, z) for computation, but affine 1442 | // (or just converted to z = 1) for storage 1443 | 1444 | // 1: 1445 | 1446 | r->xy[ 0] = x[0]; 1447 | r->xy[ 1] = x[1]; 1448 | r->xy[ 2] = x[2]; 1449 | r->xy[ 3] = x[3]; 1450 | r->xy[ 4] = x[4]; 1451 | r->xy[ 5] = x[5]; 1452 | r->xy[ 6] = x[6]; 1453 | r->xy[ 7] = x[7]; 1454 | 1455 | r->xy[ 8] = y[0]; 1456 | r->xy[ 9] = y[1]; 1457 | r->xy[10] = y[2]; 1458 | r->xy[11] = y[3]; 1459 | r->xy[12] = y[4]; 1460 | r->xy[13] = y[5]; 1461 | r->xy[14] = y[6]; 1462 | r->xy[15] = y[7]; 1463 | 1464 | // -1: 1465 | 1466 | u32 p[8]; 1467 | 1468 | p[0] = SECP256K1_P0; 1469 | p[1] = SECP256K1_P1; 1470 | p[2] = SECP256K1_P2; 1471 | p[3] = SECP256K1_P3; 1472 | p[4] = SECP256K1_P4; 1473 | p[5] = SECP256K1_P5; 1474 | p[6] = SECP256K1_P6; 1475 | p[7] = SECP256K1_P7; 1476 | 1477 | u32 neg[8]; 1478 | 1479 | neg[0] = y[0]; 1480 | neg[1] = y[1]; 1481 | neg[2] = y[2]; 1482 | neg[3] = y[3]; 1483 | neg[4] = y[4]; 1484 | neg[5] = y[5]; 1485 | neg[6] = y[6]; 1486 | neg[7] = y[7]; 1487 | 1488 | sub_mod (neg, p, neg); // -y = p - y 1489 | 1490 | r->xy[16] = neg[0]; 1491 | r->xy[17] = neg[1]; 1492 | r->xy[18] = neg[2]; 1493 | r->xy[19] = neg[3]; 1494 | r->xy[20] = neg[4]; 1495 | r->xy[21] = neg[5]; 1496 | r->xy[22] = neg[6]; 1497 | r->xy[23] = neg[7]; 1498 | 1499 | 1500 | // copy of 1: 1501 | 1502 | u32 tx[8]; 1503 | 1504 | tx[0] = x[0]; 1505 | tx[1] = x[1]; 1506 | tx[2] = x[2]; 1507 | tx[3] = x[3]; 1508 | tx[4] = x[4]; 1509 | tx[5] = x[5]; 1510 | tx[6] = x[6]; 1511 | tx[7] = x[7]; 1512 | 1513 | u32 ty[8]; 1514 | 1515 | ty[0] = y[0]; 1516 | ty[1] = y[1]; 1517 | ty[2] = y[2]; 1518 | ty[3] = y[3]; 1519 | ty[4] = y[4]; 1520 | ty[5] = y[5]; 1521 | ty[6] = y[6]; 1522 | ty[7] = y[7]; 1523 | 1524 | u32 rx[8]; 1525 | 1526 | rx[0] = x[0]; 1527 | rx[1] = x[1]; 1528 | rx[2] = x[2]; 1529 | rx[3] = 
x[3]; 1530 | rx[4] = x[4]; 1531 | rx[5] = x[5]; 1532 | rx[6] = x[6]; 1533 | rx[7] = x[7]; 1534 | 1535 | u32 ry[8]; 1536 | 1537 | ry[0] = y[0]; 1538 | ry[1] = y[1]; 1539 | ry[2] = y[2]; 1540 | ry[3] = y[3]; 1541 | ry[4] = y[4]; 1542 | ry[5] = y[5]; 1543 | ry[6] = y[6]; 1544 | ry[7] = y[7]; 1545 | 1546 | u32 rz[8] = { 0 }; 1547 | 1548 | rz[0] = 1; 1549 | 1550 | 1551 | // 3: 1552 | 1553 | point_double (rx, ry, rz); // 2 1554 | point_add (rx, ry, rz, tx, ty); // 3 1555 | 1556 | // to affine: 1557 | 1558 | inv_mod (rz); 1559 | 1560 | mul_mod (neg, rz, rz); // neg is temporary variable (z^2) 1561 | mul_mod (rx, rx, neg); 1562 | 1563 | mul_mod (rz, neg, rz); 1564 | mul_mod (ry, ry, rz); 1565 | 1566 | r->xy[24] = rx[0]; 1567 | r->xy[25] = rx[1]; 1568 | r->xy[26] = rx[2]; 1569 | r->xy[27] = rx[3]; 1570 | r->xy[28] = rx[4]; 1571 | r->xy[29] = rx[5]; 1572 | r->xy[30] = rx[6]; 1573 | r->xy[31] = rx[7]; 1574 | 1575 | r->xy[32] = ry[0]; 1576 | r->xy[33] = ry[1]; 1577 | r->xy[34] = ry[2]; 1578 | r->xy[35] = ry[3]; 1579 | r->xy[36] = ry[4]; 1580 | r->xy[37] = ry[5]; 1581 | r->xy[38] = ry[6]; 1582 | r->xy[39] = ry[7]; 1583 | 1584 | // -3: 1585 | 1586 | neg[0] = ry[0]; 1587 | neg[1] = ry[1]; 1588 | neg[2] = ry[2]; 1589 | neg[3] = ry[3]; 1590 | neg[4] = ry[4]; 1591 | neg[5] = ry[5]; 1592 | neg[6] = ry[6]; 1593 | neg[7] = ry[7]; 1594 | 1595 | sub_mod (neg, p, neg); 1596 | 1597 | r->xy[40] = neg[0]; 1598 | r->xy[41] = neg[1]; 1599 | r->xy[42] = neg[2]; 1600 | r->xy[43] = neg[3]; 1601 | r->xy[44] = neg[4]; 1602 | r->xy[45] = neg[5]; 1603 | r->xy[46] = neg[6]; 1604 | r->xy[47] = neg[7]; 1605 | 1606 | 1607 | // 5: 1608 | 1609 | rz[0] = 1; // actually we could take advantage of rz being 1 too (alternative point_add ()), 1610 | rz[1] = 0; // but it is not important because this is performed only once per "hash" 1611 | rz[2] = 0; 1612 | rz[3] = 0; 1613 | rz[4] = 0; 1614 | rz[5] = 0; 1615 | rz[6] = 0; 1616 | rz[7] = 0; 1617 | 1618 | point_add (rx, ry, rz, tx, ty); // 4 1619 | point_add (rx, ry, rz, tx, ty); // 5 1620 | 1621 | // to affine: 1622 | 1623 | inv_mod (rz); 1624 | 1625 | mul_mod (neg, rz, rz); 1626 | mul_mod (rx, rx, neg); 1627 | 1628 | mul_mod (rz, neg, rz); 1629 | mul_mod (ry, ry, rz); 1630 | 1631 | r->xy[48] = rx[0]; 1632 | r->xy[49] = rx[1]; 1633 | r->xy[50] = rx[2]; 1634 | r->xy[51] = rx[3]; 1635 | r->xy[52] = rx[4]; 1636 | r->xy[53] = rx[5]; 1637 | r->xy[54] = rx[6]; 1638 | r->xy[55] = rx[7]; 1639 | 1640 | r->xy[56] = ry[0]; 1641 | r->xy[57] = ry[1]; 1642 | r->xy[58] = ry[2]; 1643 | r->xy[59] = ry[3]; 1644 | r->xy[60] = ry[4]; 1645 | r->xy[61] = ry[5]; 1646 | r->xy[62] = ry[6]; 1647 | r->xy[63] = ry[7]; 1648 | 1649 | // -5: 1650 | 1651 | neg[0] = ry[0]; 1652 | neg[1] = ry[1]; 1653 | neg[2] = ry[2]; 1654 | neg[3] = ry[3]; 1655 | neg[4] = ry[4]; 1656 | neg[5] = ry[5]; 1657 | neg[6] = ry[6]; 1658 | neg[7] = ry[7]; 1659 | 1660 | sub_mod (neg, p, neg); 1661 | 1662 | r->xy[64] = neg[0]; 1663 | r->xy[65] = neg[1]; 1664 | r->xy[66] = neg[2]; 1665 | r->xy[67] = neg[3]; 1666 | r->xy[68] = neg[4]; 1667 | r->xy[69] = neg[5]; 1668 | r->xy[70] = neg[6]; 1669 | r->xy[71] = neg[7]; 1670 | 1671 | 1672 | // 7: 1673 | 1674 | rz[0] = 1; 1675 | rz[1] = 0; 1676 | rz[2] = 0; 1677 | rz[3] = 0; 1678 | rz[4] = 0; 1679 | rz[5] = 0; 1680 | rz[6] = 0; 1681 | rz[7] = 0; 1682 | 1683 | point_add (rx, ry, rz, tx, ty); // 6 1684 | point_add (rx, ry, rz, tx, ty); // 7 1685 | 1686 | // to affine: 1687 | 1688 | inv_mod (rz); 1689 | 1690 | mul_mod (neg, rz, rz); 1691 | mul_mod (rx, rx, neg); 1692 | 1693 | mul_mod (rz, neg, rz); 1694 | 
mul_mod (ry, ry, rz); 1695 | 1696 | r->xy[72] = rx[0]; 1697 | r->xy[73] = rx[1]; 1698 | r->xy[74] = rx[2]; 1699 | r->xy[75] = rx[3]; 1700 | r->xy[76] = rx[4]; 1701 | r->xy[77] = rx[5]; 1702 | r->xy[78] = rx[6]; 1703 | r->xy[79] = rx[7]; 1704 | 1705 | r->xy[80] = ry[0]; 1706 | r->xy[81] = ry[1]; 1707 | r->xy[82] = ry[2]; 1708 | r->xy[83] = ry[3]; 1709 | r->xy[84] = ry[4]; 1710 | r->xy[85] = ry[5]; 1711 | r->xy[86] = ry[6]; 1712 | r->xy[87] = ry[7]; 1713 | 1714 | // -7: 1715 | 1716 | neg[0] = ry[0]; 1717 | neg[1] = ry[1]; 1718 | neg[2] = ry[2]; 1719 | neg[3] = ry[3]; 1720 | neg[4] = ry[4]; 1721 | neg[5] = ry[5]; 1722 | neg[6] = ry[6]; 1723 | neg[7] = ry[7]; 1724 | 1725 | sub_mod (neg, p, neg); 1726 | 1727 | r->xy[88] = neg[0]; 1728 | r->xy[89] = neg[1]; 1729 | r->xy[90] = neg[2]; 1730 | r->xy[91] = neg[3]; 1731 | r->xy[92] = neg[4]; 1732 | r->xy[93] = neg[5]; 1733 | r->xy[94] = neg[6]; 1734 | r->xy[95] = neg[7]; 1735 | } 1736 | 1737 | /* 1738 | * Convert the tweak/scalar k to w-NAF (window size is 4). 1739 | * @param naf out: w-NAF form of the tweak/scalar, a pointer to an u32 array with a size of 33. 1740 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1741 | * @return Returns the loop start index. 1742 | */ 1743 | DECLSPEC int convert_to_window_naf (PRIVATE_AS u32 *naf, PRIVATE_AS const u32 *k) 1744 | { 1745 | int loop_start = 0; 1746 | 1747 | u32 n[9]; 1748 | 1749 | n[0] = 0; // we need this extra slot sometimes for the subtraction to work 1750 | n[1] = k[7]; 1751 | n[2] = k[6]; 1752 | n[3] = k[5]; 1753 | n[4] = k[4]; 1754 | n[5] = k[3]; 1755 | n[6] = k[2]; 1756 | n[7] = k[1]; 1757 | n[8] = k[0]; 1758 | 1759 | for (int i = 0; i <= 256; i++) 1760 | { 1761 | if (n[8] & 1) 1762 | { 1763 | // for window size w = 4: 1764 | // => 2^(w-0) = 2^4 = 16 (0x10) 1765 | // => 2^(w-1) = 2^3 = 8 (0x08) 1766 | 1767 | int diff = n[8] & 0x0f; // n % 2^w == n & (2^w - 1) 1768 | 1769 | // convert diff to val according to this table: 1770 | // 1 -> +1 -> 1 1771 | // 3 -> +3 -> 3 1772 | // 5 -> +5 -> 5 1773 | // 7 -> +7 -> 7 1774 | // 9 -> -7 -> 8 1775 | // 11 -> -5 -> 6 1776 | // 13 -> -3 -> 4 1777 | // 15 -> -1 -> 2 1778 | 1779 | int val = diff; 1780 | 1781 | if (diff >= 0x08) 1782 | { 1783 | diff -= 0x10; 1784 | 1785 | val = 0x11 - val; 1786 | } 1787 | 1788 | naf[i >> 3] |= val << ((i & 7) << 2); 1789 | 1790 | u32 t = n[8]; // t is the (temporary) old/unmodified value 1791 | 1792 | n[8] -= diff; 1793 | 1794 | // we need to take care of the carry/borrow: 1795 | 1796 | u32 k = 8; 1797 | 1798 | if (diff > 0) 1799 | { 1800 | while (n[k] > t) // overflow propagation 1801 | { 1802 | if (k == 0) break; // needed ? 
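convert_to_window_naf () packs one signed 4-bit digit per processed bit position and propagates the borrow/carry across the nine limbs by hand. The same conversion is easier to see on an ordinary 32-bit scalar. The host-side sketch below is my own: it keeps the digits as plain signed ints instead of the packed 4-bit encoding, and finishes with a reconstruction check; the scalar 0xC3A7 is an arbitrary test value.

```c
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  const uint32_t k = 0xC3A7;     // arbitrary test scalar

  int      digits[40] = { 0 };   // signed w-NAF digits in { 0, +-1, +-3, +-5, +-7 }, one per bit position
  int      hi = 0;               // highest used position (the kernel's loop_start)
  uint64_t n  = k;

  for (int i = 0; n != 0; i++, n >>= 1)
  {
    if (n & 1)
    {
      int d = (int) (n & 0x0f);  // w = 4: look at the low 4 bits

      if (d >= 8) d -= 16;       // 9, 11, 13, 15 become -7, -5, -3, -1

      digits[i] = d;

      if (d > 0) n -= (uint64_t)   d;   // clears the low 4 bits ...
      else       n += (uint64_t) (-d);  // ... so the next 3 iterations see zeros

      hi = i;
    }
  }

  // reconstruct k = sum (digits[i] * 2^i), scanning from the top like point_mul_xy () does:

  int64_t check = 0;

  for (int i = hi; i >= 0; i--) check = 2 * check + digits[i];

  printf ("k = 0x%X  w-NAF check = 0x%llX\n", k, (unsigned long long) check);

  return 0;
}
```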
1803 | 1804 | k--; 1805 | 1806 | t = n[k]; 1807 | 1808 | n[k]--; 1809 | } 1810 | } 1811 | else // if (diff < 0) 1812 | { 1813 | while (t > n[k]) // overflow propagation 1814 | { 1815 | if (k == 0) break; 1816 | 1817 | k--; 1818 | 1819 | t = n[k]; 1820 | 1821 | n[k]++; 1822 | } 1823 | } 1824 | 1825 | // update start: 1826 | 1827 | loop_start = i; 1828 | } 1829 | 1830 | // n = n / 2: 1831 | 1832 | n[8] = n[8] >> 1 | n[7] << 31; 1833 | n[7] = n[7] >> 1 | n[6] << 31; 1834 | n[6] = n[6] >> 1 | n[5] << 31; 1835 | n[5] = n[5] >> 1 | n[4] << 31; 1836 | n[4] = n[4] >> 1 | n[3] << 31; 1837 | n[3] = n[3] >> 1 | n[2] << 31; 1838 | n[2] = n[2] >> 1 | n[1] << 31; 1839 | n[1] = n[1] >> 1 | n[0] << 31; 1840 | n[0] = n[0] >> 1; 1841 | } 1842 | 1843 | return loop_start; 1844 | } 1845 | 1846 | /* 1847 | * @param x1 out: x coordinate, a pointer to an u32 array with a size of 8. 1848 | * @param y1 out: y coordinate, a pointer to an u32 array with a size of 8. 1849 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1850 | * @param tmps in: a basepoint for the multiplication. 1851 | * @return Returns the x coordinate with a leading parity/sign (for odd/even y), it is named a compressed coordinate. 1852 | */ 1853 | DECLSPEC void point_mul_xy (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps) 1854 | { 1855 | u32 naf[SECP256K1_NAF_SIZE] = { 0 }; 1856 | 1857 | int loop_start = convert_to_window_naf (naf, k); 1858 | 1859 | // first set: 1860 | 1861 | const u32 multiplier = (naf[loop_start >> 3] >> ((loop_start & 7) << 2)) & 0x0f; // or use u8 ? 1862 | 1863 | const u32 odd = multiplier & 1; 1864 | 1865 | const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24; 1866 | const u32 y_pos = odd ? (x_pos + 8) : (x_pos + 16); 1867 | 1868 | 1869 | x1[0] = tmps->xy[x_pos + 0]; 1870 | x1[1] = tmps->xy[x_pos + 1]; 1871 | x1[2] = tmps->xy[x_pos + 2]; 1872 | x1[3] = tmps->xy[x_pos + 3]; 1873 | x1[4] = tmps->xy[x_pos + 4]; 1874 | x1[5] = tmps->xy[x_pos + 5]; 1875 | x1[6] = tmps->xy[x_pos + 6]; 1876 | x1[7] = tmps->xy[x_pos + 7]; 1877 | 1878 | y1[0] = tmps->xy[y_pos + 0]; 1879 | y1[1] = tmps->xy[y_pos + 1]; 1880 | y1[2] = tmps->xy[y_pos + 2]; 1881 | y1[3] = tmps->xy[y_pos + 3]; 1882 | y1[4] = tmps->xy[y_pos + 4]; 1883 | y1[5] = tmps->xy[y_pos + 5]; 1884 | y1[6] = tmps->xy[y_pos + 6]; 1885 | y1[7] = tmps->xy[y_pos + 7]; 1886 | 1887 | u32 z1[8] = { 0 }; 1888 | 1889 | z1[0] = 1; 1890 | 1891 | /* 1892 | * Start: 1893 | */ 1894 | 1895 | // main loop (left-to-right binary algorithm): 1896 | 1897 | for (int pos = loop_start - 1; pos >= 0; pos--) // -1 because we've set/add the point already 1898 | { 1899 | // always double: 1900 | 1901 | point_double (x1, y1, z1); 1902 | 1903 | // add only if needed: 1904 | 1905 | const u32 multiplier = (naf[pos >> 3] >> ((pos & 7) << 2)) & 0x0f; 1906 | 1907 | if (multiplier) 1908 | { 1909 | /* 1910 | m -> y | y = ((m - (m & 1)) / 2) * 24 1911 | ---------------------------------- 1912 | 1 -> 0 | 1/2 * 24 = 0 1913 | 2 -> 16 1914 | 3 -> 24 | 3/2 * 24 = 24 1915 | 4 -> 40 1916 | 5 -> 48 | 5/2 * 24 = 2*24 1917 | 6 -> 64 1918 | 7 -> 72 | 7/2 * 24 = 3*24 1919 | 8 -> 88 1920 | */ 1921 | 1922 | const u32 odd = multiplier & 1; 1923 | 1924 | const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24; 1925 | const u32 y_pos = odd ? 
(x_pos + 8) : (x_pos + 16); 1926 | 1927 | u32 x2[8]; 1928 | 1929 | x2[0] = tmps->xy[x_pos + 0]; 1930 | x2[1] = tmps->xy[x_pos + 1]; 1931 | x2[2] = tmps->xy[x_pos + 2]; 1932 | x2[3] = tmps->xy[x_pos + 3]; 1933 | x2[4] = tmps->xy[x_pos + 4]; 1934 | x2[5] = tmps->xy[x_pos + 5]; 1935 | x2[6] = tmps->xy[x_pos + 6]; 1936 | x2[7] = tmps->xy[x_pos + 7]; 1937 | 1938 | u32 y2[8]; 1939 | 1940 | y2[0] = tmps->xy[y_pos + 0]; 1941 | y2[1] = tmps->xy[y_pos + 1]; 1942 | y2[2] = tmps->xy[y_pos + 2]; 1943 | y2[3] = tmps->xy[y_pos + 3]; 1944 | y2[4] = tmps->xy[y_pos + 4]; 1945 | y2[5] = tmps->xy[y_pos + 5]; 1946 | y2[6] = tmps->xy[y_pos + 6]; 1947 | y2[7] = tmps->xy[y_pos + 7]; 1948 | 1949 | // (x1, y1, z1) + multiplier * (x, y, z) = (x1, y1, z1) + (x2, y2, z2) 1950 | 1951 | point_add (x1, y1, z1, x2, y2); 1952 | 1953 | // optimization (there can't be any adds after an add for w-1 times): 1954 | // (but it seems to be faster without this manipulation of "pos") 1955 | 1956 | //for (u32 i = 0; i < 3; i++) 1957 | //{ 1958 | // if (pos == 0) break; 1959 | // point_double (x1, y1, z1); 1960 | // pos--; 1961 | //} 1962 | } 1963 | } 1964 | 1965 | 1966 | /* 1967 | * Get the corresponding affine coordinates x/y: 1968 | * 1969 | * Note: 1970 | * x1_affine = x1_jacobian / z1^2 = x1_jacobian * z1_inv^2 1971 | * y1_affine = y1_jacobian / z1^2 = y1_jacobian * z1_inv^2 1972 | * 1973 | */ 1974 | 1975 | inv_mod (z1); 1976 | 1977 | u32 z2[8]; 1978 | 1979 | mul_mod (z2, z1, z1); // z1^2 1980 | mul_mod (x1, x1, z2); // x1_affine 1981 | 1982 | mul_mod (z1, z2, z1); // z1^3 1983 | mul_mod (y1, y1, z1); // y1_affine 1984 | 1985 | // return values are already in x1 and y1 1986 | } 1987 | 1988 | /* 1989 | * @param r out: x coordinate with leading parity/sign (for odd/even y), a pointer to an u32 array with a size of 9. 1990 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1991 | * @param tmps in: a basepoint for the multiplication. 1992 | * @return Returns the x coordinate with a leading parity/sign (for odd/even y), it is named a compressed coordinate. 1993 | */ 1994 | DECLSPEC void point_mul (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps) 1995 | { 1996 | u32 x[8]; 1997 | u32 y[8]; 1998 | 1999 | point_mul_xy (x, y, k, tmps); 2000 | 2001 | /* 2002 | * output: 2003 | */ 2004 | 2005 | // shift by 1 byte (8 bits) to make room and add the parity/sign (for odd/even y): 2006 | 2007 | r[8] = (x[0] << 24); 2008 | r[7] = (x[0] >> 8) | (x[1] << 24); 2009 | r[6] = (x[1] >> 8) | (x[2] << 24); 2010 | r[5] = (x[2] >> 8) | (x[3] << 24); 2011 | r[4] = (x[3] >> 8) | (x[4] << 24); 2012 | r[3] = (x[4] >> 8) | (x[5] << 24); 2013 | r[2] = (x[5] >> 8) | (x[6] << 24); 2014 | r[1] = (x[6] >> 8) | (x[7] << 24); 2015 | r[0] = (x[7] >> 8); 2016 | 2017 | const u32 type = 0x02 | (y[0] & 1); // (note: 0b10 | 0b01 = 0x03) 2018 | 2019 | r[0] = r[0] | type << 24; // 0x02 or 0x03 2020 | } 2021 | 2022 | /* 2023 | * Transform a x coordinate and separate parity to secp256k1_t. 2024 | * @param r out: x and y coordinates. 2025 | * @param x in: x coordinate which should be converted, a pointer to an u32 array with a size of 8. 2026 | * @param first_byte in: The parity of the y coordinate, a u32. 2027 | * @return Returns 0 if successfull, returns 1 if x is greater than the basepoint. 
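transform_public () below decompresses a public key: it checks that x is a valid field element, forms y² = x³ + 7, takes the square root with sqrt_mod () — which relies on p ≡ 3 (mod 4), so sqrt(a) = a^((p+1)/4) — and flips to p - y if the parity does not match the requested one. The same decompression on the toy curve y² = x³ + 7 over F_11 (11 ≡ 3 mod 4 as well) fits in a few lines; this is my own standalone sketch with hypothetical helper names.

```c
#include <stdio.h>

#define P 11                               // toy prime with P % 4 == 3, like secp256k1's p

static int pow_mod (const int b, const int e)   // b^e mod P, small and slow on purpose
{
  int r = 1;
  for (int i = 0; i < e; i++) r = (r * b) % P;
  return r;
}

int main (void)
{
  const int x      = 6;                    // compressed input: the x coordinate ...
  const int parity = 1;                    // ... and the requested parity of y (1 = odd, as with a 0x03 prefix)

  const int y2 = (pow_mod (x, 3) + 7) % P; // y^2 = x^3 + 7, as in transform_public ()

  int y = pow_mod (y2, (P + 1) / 4);       // square root for P % 4 == 3, the trick sqrt_mod () uses

  if ((y & 1) != parity) y = P - y;        // wrong parity: take the other root, y -> p - y

  printf ("x = %d, y = %d, y^2 = %d (expected %d)\n", x, y, (y * y) % P, y2);

  return 0;
}
```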
2028 | */ 2029 | DECLSPEC u32 transform_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, const u32 first_byte) 2030 | { 2031 | u32 p[8]; 2032 | 2033 | p[0] = SECP256K1_P0; 2034 | p[1] = SECP256K1_P1; 2035 | p[2] = SECP256K1_P2; 2036 | p[3] = SECP256K1_P3; 2037 | p[4] = SECP256K1_P4; 2038 | p[5] = SECP256K1_P5; 2039 | p[6] = SECP256K1_P6; 2040 | p[7] = SECP256K1_P7; 2041 | 2042 | // x must be smaller than p (because of y ^ 2 = x ^ 3 % p) 2043 | 2044 | for (int i = 7; i >= 0; i--) 2045 | { 2046 | if (x[i] < p[i]) break; 2047 | if (x[i] > p[i]) return 1; 2048 | } 2049 | 2050 | 2051 | // get y^2 = x^3 + 7: 2052 | 2053 | u32 b[8] = { 0 }; 2054 | 2055 | b[0] = SECP256K1_B; 2056 | 2057 | u32 y[8]; 2058 | 2059 | mul_mod (y, x, x); 2060 | mul_mod (y, y, x); 2061 | add_mod (y, y, b); 2062 | 2063 | // get y = sqrt (y^2): 2064 | 2065 | sqrt_mod (y); 2066 | 2067 | // check if it's of the correct parity that we want (odd/even): 2068 | 2069 | if ((first_byte & 1) != (y[0] & 1)) 2070 | { 2071 | // y2 = p - y1 (or y2 = y1 * -1) 2072 | 2073 | sub_mod (y, p, y); 2074 | } 2075 | 2076 | // get xy: 2077 | 2078 | point_get_coords (r, x, y); 2079 | 2080 | return 0; 2081 | } 2082 | 2083 | /* 2084 | * Parse a x coordinate with leading parity to secp256k1_t. 2085 | * @param r out: x and y coordinates. 2086 | * @param k in: x coordinate which should be converted with leading parity, a pointer to an u32 array with a size of 9. 2087 | * @return Returns 0 if successfull, returns 1 if x is greater than the basepoint or the parity has an unexpected value. 2088 | */ 2089 | DECLSPEC u32 parse_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *k) 2090 | { 2091 | // verify: 2092 | 2093 | const u32 first_byte = k[0] & 0xff; 2094 | 2095 | if ((first_byte != '\x02') && (first_byte != '\x03')) 2096 | { 2097 | return 1; 2098 | } 2099 | 2100 | // load k into x without the first byte: 2101 | 2102 | u32 x[8]; 2103 | 2104 | x[0] = (k[7] & 0xff00) << 16 | (k[7] & 0xff0000) | (k[7] & 0xff000000) >> 16 | (k[8] & 0xff); 2105 | x[1] = (k[6] & 0xff00) << 16 | (k[6] & 0xff0000) | (k[6] & 0xff000000) >> 16 | (k[7] & 0xff); 2106 | x[2] = (k[5] & 0xff00) << 16 | (k[5] & 0xff0000) | (k[5] & 0xff000000) >> 16 | (k[6] & 0xff); 2107 | x[3] = (k[4] & 0xff00) << 16 | (k[4] & 0xff0000) | (k[4] & 0xff000000) >> 16 | (k[5] & 0xff); 2108 | x[4] = (k[3] & 0xff00) << 16 | (k[3] & 0xff0000) | (k[3] & 0xff000000) >> 16 | (k[4] & 0xff); 2109 | x[5] = (k[2] & 0xff00) << 16 | (k[2] & 0xff0000) | (k[2] & 0xff000000) >> 16 | (k[3] & 0xff); 2110 | x[6] = (k[1] & 0xff00) << 16 | (k[1] & 0xff0000) | (k[1] & 0xff000000) >> 16 | (k[2] & 0xff); 2111 | x[7] = (k[0] & 0xff00) << 16 | (k[0] & 0xff0000) | (k[0] & 0xff000000) >> 16 | (k[1] & 0xff); 2112 | 2113 | return transform_public (r, x, first_byte); 2114 | } 2115 | 2116 | 2117 | /* 2118 | * Set precomputed values of the basepoint g to a secp256k1 structure. 2119 | * @param r out: x and y coordinates. 
pre-computed points: (x1,y1,-y1),(x3,y3,-y3),(x5,y5,-y5),(x7,y7,-y7) 2120 | */ 2121 | DECLSPEC void set_precomputed_basepoint_g (PRIVATE_AS secp256k1_t *r) 2122 | { 2123 | // x1 2124 | r->xy[ 0] = SECP256K1_G_PRE_COMPUTED_00; 2125 | r->xy[ 1] = SECP256K1_G_PRE_COMPUTED_01; 2126 | r->xy[ 2] = SECP256K1_G_PRE_COMPUTED_02; 2127 | r->xy[ 3] = SECP256K1_G_PRE_COMPUTED_03; 2128 | r->xy[ 4] = SECP256K1_G_PRE_COMPUTED_04; 2129 | r->xy[ 5] = SECP256K1_G_PRE_COMPUTED_05; 2130 | r->xy[ 6] = SECP256K1_G_PRE_COMPUTED_06; 2131 | r->xy[ 7] = SECP256K1_G_PRE_COMPUTED_07; 2132 | 2133 | // y1 2134 | r->xy[ 8] = SECP256K1_G_PRE_COMPUTED_08; 2135 | r->xy[ 9] = SECP256K1_G_PRE_COMPUTED_09; 2136 | r->xy[10] = SECP256K1_G_PRE_COMPUTED_10; 2137 | r->xy[11] = SECP256K1_G_PRE_COMPUTED_11; 2138 | r->xy[12] = SECP256K1_G_PRE_COMPUTED_12; 2139 | r->xy[13] = SECP256K1_G_PRE_COMPUTED_13; 2140 | r->xy[14] = SECP256K1_G_PRE_COMPUTED_14; 2141 | r->xy[15] = SECP256K1_G_PRE_COMPUTED_15; 2142 | 2143 | // -y1 2144 | r->xy[16] = SECP256K1_G_PRE_COMPUTED_16; 2145 | r->xy[17] = SECP256K1_G_PRE_COMPUTED_17; 2146 | r->xy[18] = SECP256K1_G_PRE_COMPUTED_18; 2147 | r->xy[19] = SECP256K1_G_PRE_COMPUTED_19; 2148 | r->xy[20] = SECP256K1_G_PRE_COMPUTED_20; 2149 | r->xy[21] = SECP256K1_G_PRE_COMPUTED_21; 2150 | r->xy[22] = SECP256K1_G_PRE_COMPUTED_22; 2151 | r->xy[23] = SECP256K1_G_PRE_COMPUTED_23; 2152 | 2153 | // x3 2154 | r->xy[24] = SECP256K1_G_PRE_COMPUTED_24; 2155 | r->xy[25] = SECP256K1_G_PRE_COMPUTED_25; 2156 | r->xy[26] = SECP256K1_G_PRE_COMPUTED_26; 2157 | r->xy[27] = SECP256K1_G_PRE_COMPUTED_27; 2158 | r->xy[28] = SECP256K1_G_PRE_COMPUTED_28; 2159 | r->xy[29] = SECP256K1_G_PRE_COMPUTED_29; 2160 | r->xy[30] = SECP256K1_G_PRE_COMPUTED_30; 2161 | r->xy[31] = SECP256K1_G_PRE_COMPUTED_31; 2162 | 2163 | // y3 2164 | r->xy[32] = SECP256K1_G_PRE_COMPUTED_32; 2165 | r->xy[33] = SECP256K1_G_PRE_COMPUTED_33; 2166 | r->xy[34] = SECP256K1_G_PRE_COMPUTED_34; 2167 | r->xy[35] = SECP256K1_G_PRE_COMPUTED_35; 2168 | r->xy[36] = SECP256K1_G_PRE_COMPUTED_36; 2169 | r->xy[37] = SECP256K1_G_PRE_COMPUTED_37; 2170 | r->xy[38] = SECP256K1_G_PRE_COMPUTED_38; 2171 | r->xy[39] = SECP256K1_G_PRE_COMPUTED_39; 2172 | 2173 | // -y3 2174 | r->xy[40] = SECP256K1_G_PRE_COMPUTED_40; 2175 | r->xy[41] = SECP256K1_G_PRE_COMPUTED_41; 2176 | r->xy[42] = SECP256K1_G_PRE_COMPUTED_42; 2177 | r->xy[43] = SECP256K1_G_PRE_COMPUTED_43; 2178 | r->xy[44] = SECP256K1_G_PRE_COMPUTED_44; 2179 | r->xy[45] = SECP256K1_G_PRE_COMPUTED_45; 2180 | r->xy[46] = SECP256K1_G_PRE_COMPUTED_46; 2181 | r->xy[47] = SECP256K1_G_PRE_COMPUTED_47; 2182 | 2183 | // x5 2184 | r->xy[48] = SECP256K1_G_PRE_COMPUTED_48; 2185 | r->xy[49] = SECP256K1_G_PRE_COMPUTED_49; 2186 | r->xy[50] = SECP256K1_G_PRE_COMPUTED_50; 2187 | r->xy[51] = SECP256K1_G_PRE_COMPUTED_51; 2188 | r->xy[52] = SECP256K1_G_PRE_COMPUTED_52; 2189 | r->xy[53] = SECP256K1_G_PRE_COMPUTED_53; 2190 | r->xy[54] = SECP256K1_G_PRE_COMPUTED_54; 2191 | r->xy[55] = SECP256K1_G_PRE_COMPUTED_55; 2192 | 2193 | // y5 2194 | r->xy[56] = SECP256K1_G_PRE_COMPUTED_56; 2195 | r->xy[57] = SECP256K1_G_PRE_COMPUTED_57; 2196 | r->xy[58] = SECP256K1_G_PRE_COMPUTED_58; 2197 | r->xy[59] = SECP256K1_G_PRE_COMPUTED_59; 2198 | r->xy[60] = SECP256K1_G_PRE_COMPUTED_60; 2199 | r->xy[61] = SECP256K1_G_PRE_COMPUTED_61; 2200 | r->xy[62] = SECP256K1_G_PRE_COMPUTED_62; 2201 | r->xy[63] = SECP256K1_G_PRE_COMPUTED_63; 2202 | 2203 | // -y5 2204 | r->xy[64] = SECP256K1_G_PRE_COMPUTED_64; 2205 | r->xy[65] = SECP256K1_G_PRE_COMPUTED_65; 2206 | r->xy[66] = 
SECP256K1_G_PRE_COMPUTED_66; 2207 | r->xy[67] = SECP256K1_G_PRE_COMPUTED_67; 2208 | r->xy[68] = SECP256K1_G_PRE_COMPUTED_68; 2209 | r->xy[69] = SECP256K1_G_PRE_COMPUTED_69; 2210 | r->xy[70] = SECP256K1_G_PRE_COMPUTED_70; 2211 | r->xy[71] = SECP256K1_G_PRE_COMPUTED_71; 2212 | 2213 | // x7 2214 | r->xy[72] = SECP256K1_G_PRE_COMPUTED_72; 2215 | r->xy[73] = SECP256K1_G_PRE_COMPUTED_73; 2216 | r->xy[74] = SECP256K1_G_PRE_COMPUTED_74; 2217 | r->xy[75] = SECP256K1_G_PRE_COMPUTED_75; 2218 | r->xy[76] = SECP256K1_G_PRE_COMPUTED_76; 2219 | r->xy[77] = SECP256K1_G_PRE_COMPUTED_77; 2220 | r->xy[78] = SECP256K1_G_PRE_COMPUTED_78; 2221 | r->xy[79] = SECP256K1_G_PRE_COMPUTED_79; 2222 | 2223 | // y7 2224 | r->xy[80] = SECP256K1_G_PRE_COMPUTED_80; 2225 | r->xy[81] = SECP256K1_G_PRE_COMPUTED_81; 2226 | r->xy[82] = SECP256K1_G_PRE_COMPUTED_82; 2227 | r->xy[83] = SECP256K1_G_PRE_COMPUTED_83; 2228 | r->xy[84] = SECP256K1_G_PRE_COMPUTED_84; 2229 | r->xy[85] = SECP256K1_G_PRE_COMPUTED_85; 2230 | r->xy[86] = SECP256K1_G_PRE_COMPUTED_86; 2231 | r->xy[87] = SECP256K1_G_PRE_COMPUTED_87; 2232 | 2233 | // -y7 2234 | r->xy[88] = SECP256K1_G_PRE_COMPUTED_88; 2235 | r->xy[89] = SECP256K1_G_PRE_COMPUTED_89; 2236 | r->xy[90] = SECP256K1_G_PRE_COMPUTED_90; 2237 | r->xy[91] = SECP256K1_G_PRE_COMPUTED_91; 2238 | r->xy[92] = SECP256K1_G_PRE_COMPUTED_92; 2239 | r->xy[93] = SECP256K1_G_PRE_COMPUTED_93; 2240 | r->xy[94] = SECP256K1_G_PRE_COMPUTED_94; 2241 | r->xy[95] = SECP256K1_G_PRE_COMPUTED_95; 2242 | } 2243 | -------------------------------------------------------------------------------- /secp256k1/inc_types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_TYPES_H 7 | #define INC_TYPES_H 8 | 9 | #if ATTACK_MODE == 9 10 | #define BITMAP_MASK kernel_param->bitmap_mask 11 | #define BITMAP_SHIFT1 kernel_param->bitmap_shift1 12 | #define BITMAP_SHIFT2 kernel_param->bitmap_shift2 13 | #define SALT_POS_HOST (kernel_param->pws_pos + gid) 14 | #define LOOP_POS kernel_param->loop_pos 15 | #define LOOP_CNT kernel_param->loop_cnt 16 | #define IL_CNT kernel_param->il_cnt 17 | #define DIGESTS_CNT 1 18 | #define DIGESTS_OFFSET_HOST (kernel_param->pws_pos + gid) 19 | #define COMBS_MODE kernel_param->combs_mode 20 | #define SALT_REPEAT kernel_param->salt_repeat 21 | #define PWS_POS kernel_param->pws_pos 22 | #define GID_CNT kernel_param->gid_max 23 | #else 24 | #define BITMAP_MASK kernel_param->bitmap_mask 25 | #define BITMAP_SHIFT1 kernel_param->bitmap_shift1 26 | #define BITMAP_SHIFT2 kernel_param->bitmap_shift2 27 | #define SALT_POS_HOST kernel_param->salt_pos_host 28 | #define LOOP_POS kernel_param->loop_pos 29 | #define LOOP_CNT kernel_param->loop_cnt 30 | #define IL_CNT kernel_param->il_cnt 31 | #define DIGESTS_CNT kernel_param->digests_cnt 32 | #define DIGESTS_OFFSET_HOST kernel_param->digests_offset_host 33 | #define COMBS_MODE kernel_param->combs_mode 34 | #define SALT_REPEAT kernel_param->salt_repeat 35 | #define PWS_POS kernel_param->pws_pos 36 | #define GID_CNT kernel_param->gid_max 37 | #endif 38 | 39 | #ifdef IS_CUDA 40 | // https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size 41 | typedef unsigned char uchar; 42 | typedef unsigned short ushort; 43 | typedef unsigned int uint; 44 | typedef unsigned long ulong; 45 | typedef unsigned long long ullong; 46 | #endif 47 | 48 | #ifdef IS_METAL 49 | typedef unsigned char uchar; 50 | typedef unsigned short ushort; 51 | typedef unsigned int uint; 52 
| typedef unsigned long ulong; 53 | #define ullong ulong 54 | #endif 55 | 56 | #ifdef IS_OPENCL 57 | typedef ulong ullong; 58 | typedef ulong2 ullong2; 59 | typedef ulong4 ullong4; 60 | typedef ulong8 ullong8; 61 | typedef ulong16 ullong16; 62 | #endif 63 | 64 | #ifdef KERNEL_STATIC 65 | typedef uchar u8; 66 | typedef ushort u16; 67 | typedef uint u32; 68 | #ifdef IS_METAL 69 | typedef ulong u64; 70 | #else 71 | typedef ullong u64; 72 | #endif 73 | #else 74 | typedef uint8_t u8; 75 | typedef uint16_t u16; 76 | typedef uint32_t u32; 77 | typedef uint64_t u64; 78 | #endif 79 | 80 | //testwise disabled 81 | //typedef u8 u8a __attribute__ ((aligned (8))); 82 | //typedef u16 u16a __attribute__ ((aligned (8))); 83 | //typedef u32 u32a __attribute__ ((aligned (8))); 84 | //typedef u64 u64a __attribute__ ((aligned (8))); 85 | 86 | typedef u8 u8a; 87 | typedef u16 u16a; 88 | typedef u32 u32a; 89 | typedef u64 u64a; 90 | 91 | #ifndef NEW_SIMD_CODE 92 | #undef VECT_SIZE 93 | #define VECT_SIZE 1 94 | #endif 95 | 96 | #define CONCAT(a, b) a##b 97 | #define VTYPE(type, width) CONCAT(type, width) 98 | 99 | // emulated is always VECT_SIZE = 1 100 | #if VECT_SIZE == 1 101 | typedef u8 u8x; 102 | typedef u16 u16x; 103 | typedef u32 u32x; 104 | typedef u64 u64x; 105 | 106 | #define make_u8x (u8) 107 | #define make_u16x (u16) 108 | #define make_u32x (u32) 109 | #define make_u64x (u64) 110 | 111 | #else 112 | 113 | #if defined IS_CUDA || defined IS_HIP 114 | 115 | #if VECT_SIZE == 2 116 | 117 | struct __device_builtin__ __builtin_align__(2) u8x 118 | { 119 | u8 s0; 120 | u8 s1; 121 | 122 | inline __device__ u8x (const u8 a, const u8 b) : s0(a), s1(b) { } 123 | inline __device__ u8x (const u8 a) : s0(a), s1(a) { } 124 | 125 | inline __device__ u8x (void) : s0(0), s1(0) { } 126 | inline __device__ ~u8x (void) { } 127 | }; 128 | 129 | struct __device_builtin__ __builtin_align__(4) u16x 130 | { 131 | u16 s0; 132 | u16 s1; 133 | 134 | inline __device__ u16x (const u16 a, const u16 b) : s0(a), s1(b) { } 135 | inline __device__ u16x (const u16 a) : s0(a), s1(a) { } 136 | 137 | inline __device__ u16x (void) : s0(0), s1(0) { } 138 | inline __device__ ~u16x (void) { } 139 | }; 140 | 141 | struct __device_builtin__ __builtin_align__(8) u32x 142 | { 143 | u32 s0; 144 | u32 s1; 145 | 146 | inline __device__ u32x (const u32 a, const u32 b) : s0(a), s1(b) { } 147 | inline __device__ u32x (const u32 a) : s0(a), s1(a) { } 148 | 149 | inline __device__ u32x (void) : s0(0), s1(0) { } 150 | inline __device__ ~u32x (void) { } 151 | }; 152 | 153 | struct __device_builtin__ __builtin_align__(16) u64x 154 | { 155 | u64 s0; 156 | u64 s1; 157 | 158 | inline __device__ u64x (const u64 a, const u64 b) : s0(a), s1(b) { } 159 | inline __device__ u64x (const u64 a) : s0(a), s1(a) { } 160 | 161 | inline __device__ u64x (void) : s0(0), s1(0) { } 162 | inline __device__ ~u64x (void) { } 163 | }; 164 | 165 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b)); } 166 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); } 167 | 168 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; } 169 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; } 170 | 171 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; } 172 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; } 173 | 174 | inline __device__ void 
operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; } 175 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; } 176 | 177 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; } 178 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; } 179 | 180 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; } 181 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; } 182 | 183 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; } 184 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; } 185 | 186 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; } 187 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; } 188 | 189 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; } 190 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; } 191 | 192 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) ); } 193 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1)); } 194 | 195 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) ); } 196 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1)); } 197 | 198 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) ); } 199 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); } 200 | 201 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) ); } 202 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1)); } 203 | 204 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) ); } 205 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1)); } 206 | 207 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) ); } 208 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1)); } 209 | 210 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) ); } 211 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1)); } 212 | 213 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) ); } 214 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1)); } 215 | 216 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) ); } 217 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1)); } 218 | 219 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1); } 220 | 221 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b)); } 222 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != 
b.s0) && (a.s1 != b.s1)); } 223 | 224 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; } 225 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; } 226 | 227 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; } 228 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; } 229 | 230 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; } 231 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; } 232 | 233 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; } 234 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; } 235 | 236 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; } 237 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; } 238 | 239 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; } 240 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; } 241 | 242 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; } 243 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; } 244 | 245 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; } 246 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; } 247 | 248 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) ); } 249 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1)); } 250 | 251 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) ); } 252 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1)); } 253 | 254 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) ); } 255 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); } 256 | 257 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) ); } 258 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1)); } 259 | 260 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) ); } 261 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1)); } 262 | 263 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) ); } 264 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1)); } 265 | 266 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) ); } 267 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1)); } 268 | 269 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) ); } 270 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1)); } 271 | 272 | inline __device__ u64x operator % (const u64x a, const u64 b) { return 
u64x ((a.s0 % b), (a.s1 % b) ); } 273 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1)); } 274 | 275 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1); } 276 | 277 | #endif 278 | 279 | #if VECT_SIZE == 4 280 | 281 | struct __device_builtin__ __builtin_align__(4) u8x 282 | { 283 | u8 s0; 284 | u8 s1; 285 | u8 s2; 286 | u8 s3; 287 | 288 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d) : s0(a), s1(b), s2(c), s3(d) { } 289 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a) { } 290 | 291 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0) { } 292 | inline __device__ ~u8x (void) { } 293 | }; 294 | 295 | struct __device_builtin__ __builtin_align__(8) u16x 296 | { 297 | u16 s0; 298 | u16 s1; 299 | u16 s2; 300 | u16 s3; 301 | 302 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d) : s0(a), s1(b), s2(c), s3(d) { } 303 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a) { } 304 | 305 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0) { } 306 | inline __device__ ~u16x (void) { } 307 | }; 308 | 309 | struct __device_builtin__ __builtin_align__(16) u32x 310 | { 311 | u32 s0; 312 | u32 s1; 313 | u32 s2; 314 | u32 s3; 315 | 316 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d) : s0(a), s1(b), s2(c), s3(d) { } 317 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a) { } 318 | 319 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0) { } 320 | inline __device__ ~u32x (void) { } 321 | }; 322 | 323 | struct __device_builtin__ __builtin_align__(32) u64x 324 | { 325 | u64 s0; 326 | u64 s1; 327 | u64 s2; 328 | u64 s3; 329 | 330 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d) : s0(a), s1(b), s2(c), s3(d) { } 331 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a) { } 332 | 333 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0) { } 334 | inline __device__ ~u64x (void) { } 335 | }; 336 | 337 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); } 338 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); } 339 | 340 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; } 341 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; } 342 | 343 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; } 344 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; } 345 | 346 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; } 347 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; } 348 | 349 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; } 350 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; } 351 | 352 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; } 353 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; 
a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; } 354 | 355 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; } 356 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; } 357 | 358 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; } 359 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; } 360 | 361 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; } 362 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; } 363 | 364 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); } 365 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); } 366 | 367 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); } 368 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); } 369 | 370 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); } 371 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); } 372 | 373 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); } 374 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); } 375 | 376 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); } 377 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); } 378 | 379 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); } 380 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); } 381 | 382 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); } 383 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); } 384 | 385 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); } 386 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); } 387 | 388 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); } 389 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); } 390 | 391 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3); } 392 | 393 | inline __device__ 
bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); } 394 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); } 395 | 396 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; } 397 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; } 398 | 399 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; } 400 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; } 401 | 402 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; } 403 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; } 404 | 405 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; } 406 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; } 407 | 408 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; } 409 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; } 410 | 411 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; } 412 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; } 413 | 414 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; } 415 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; } 416 | 417 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; } 418 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; } 419 | 420 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); } 421 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); } 422 | 423 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); } 424 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); } 425 | 426 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); } 427 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); } 428 | 429 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); } 430 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); } 431 | 432 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); } 
433 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); } 434 | 435 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); } 436 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); } 437 | 438 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); } 439 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); } 440 | 441 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); } 442 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); } 443 | 444 | inline __device__ u64x operator % (const u64x a, const u32 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); } 445 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); } 446 | 447 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3); } 448 | 449 | #endif 450 | 451 | #if VECT_SIZE == 8 452 | 453 | struct __device_builtin__ __builtin_align__(8) u8x 454 | { 455 | u8 s0; 456 | u8 s1; 457 | u8 s2; 458 | u8 s3; 459 | u8 s4; 460 | u8 s5; 461 | u8 s6; 462 | u8 s7; 463 | 464 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 465 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 466 | 467 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 468 | inline __device__ ~u8x (void) { } 469 | }; 470 | 471 | struct __device_builtin__ __builtin_align__(16) u16x 472 | { 473 | u16 s0; 474 | u16 s1; 475 | u16 s2; 476 | u16 s3; 477 | u16 s4; 478 | u16 s5; 479 | u16 s6; 480 | u16 s7; 481 | 482 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 483 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 484 | 485 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 486 | inline __device__ ~u16x (void) { } 487 | }; 488 | 489 | struct __device_builtin__ __builtin_align__(32) u32x 490 | { 491 | u32 s0; 492 | u32 s1; 493 | u32 s2; 494 | u32 s3; 495 | u32 s4; 496 | u32 s5; 497 | u32 s6; 498 | u32 s7; 499 | 500 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 501 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 502 | 503 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 504 | inline __device__ ~u32x (void) { } 505 | }; 506 | 507 | struct __device_builtin__ __builtin_align__(64) u64x 508 | { 509 | u64 s0; 510 | u64 s1; 511 | u64 s2; 512 | u64 s3; 513 | u64 s4; 514 | u64 s5; 515 | u64 s6; 516 | u64 s7; 517 
| 518 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 519 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 520 | 521 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 522 | inline __device__ ~u64x (void) { } 523 | }; 524 | 525 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); } 526 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); } 527 | 528 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; } 529 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; } 530 | 531 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; } 532 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; } 533 | 534 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; } 535 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; } 536 | 537 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; } 538 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; } 539 | 540 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; } 541 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; } 542 | 543 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; } 544 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; } 545 | 546 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; } 547 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; } 548 | 549 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; } 550 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= 
b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; } 551 | 552 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); } 553 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); } 554 | 555 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); } 556 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); } 557 | 558 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); } 559 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); } 560 | 561 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); } 562 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); } 563 | 564 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); } 565 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); } 566 | 567 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); } 568 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); } 569 | 570 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); } 571 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); } 572 | 573 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); } 574 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); } 575 | 576 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); } 577 | inline 
__device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); } 578 | 579 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); } 580 | 581 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); } 582 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); } 583 | 584 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; } 585 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; } 586 | 587 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; } 588 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; } 589 | 590 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; } 591 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; } 592 | 593 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; } 594 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; } 595 | 596 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; } 597 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; } 598 | 599 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; } 600 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; } 601 | 602 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; } 603 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; } 604 | 605 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; } 606 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; } 607 | 608 | 
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); } 609 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); } 610 | 611 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); } 612 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); } 613 | 614 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); } 615 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); } 616 | 617 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); } 618 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); } 619 | 620 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); } 621 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); } 622 | 623 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); } 624 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); } 625 | 626 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); } 627 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); } 628 | 629 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); } 630 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); } 631 | 632 | inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); } 633 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), 
(a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); } 634 | 635 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); } 636 | 637 | #endif 638 | 639 | #if VECT_SIZE == 16 640 | 641 | struct __device_builtin__ __builtin_align__(16) u8x 642 | { 643 | u8 s0; 644 | u8 s1; 645 | u8 s2; 646 | u8 s3; 647 | u8 s4; 648 | u8 s5; 649 | u8 s6; 650 | u8 s7; 651 | u8 s8; 652 | u8 s9; 653 | u8 sa; 654 | u8 sb; 655 | u8 sc; 656 | u8 sd; 657 | u8 se; 658 | u8 sf; 659 | 660 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h, const u8 i, const u8 j, const u8 k, const u8 l, const u8 m, const u8 n, const u8 o, const u8 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 661 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 662 | 663 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { } 664 | inline __device__ ~u8x (void) { } 665 | }; 666 | 667 | struct __device_builtin__ __builtin_align__(32) u16x 668 | { 669 | u16 s0; 670 | u16 s1; 671 | u16 s2; 672 | u16 s3; 673 | u16 s4; 674 | u16 s5; 675 | u16 s6; 676 | u16 s7; 677 | u16 s8; 678 | u16 s9; 679 | u16 sa; 680 | u16 sb; 681 | u16 sc; 682 | u16 sd; 683 | u16 se; 684 | u16 sf; 685 | 686 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h, const u16 i, const u16 j, const u16 k, const u16 l, const u16 m, const u16 n, const u16 o, const u16 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 687 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 688 | 689 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ } 690 | inline __device__ ~u16x (void) { } 691 | }; 692 | 693 | struct __device_builtin__ __builtin_align__(64) u32x 694 | { 695 | u32 s0; 696 | u32 s1; 697 | u32 s2; 698 | u32 s3; 699 | u32 s4; 700 | u32 s5; 701 | u32 s6; 702 | u32 s7; 703 | u32 s8; 704 | u32 s9; 705 | u32 sa; 706 | u32 sb; 707 | u32 sc; 708 | u32 sd; 709 | u32 se; 710 | u32 sf; 711 | 712 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h, const u32 i, const u32 j, const u32 k, const u32 l, const u32 m, const u32 n, const u32 o, const u32 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 713 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 714 | 715 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ } 716 | inline __device__ ~u32x (void) { } 717 | }; 718 | 719 | struct __device_builtin__ __builtin_align__(128) u64x 720 | { 721 | u64 s0; 722 | u64 s1; 723 | u64 s2; 724 | u64 s3; 725 | u64 s4; 726 | u64 s5; 727 | u64 s6; 728 | u64 s7; 729 | u64 s8; 730 | u64 s9; 731 | u64 sa; 732 | u64 sb; 733 | u64 sc; 734 | 
u64 sd; 735 | u64 se; 736 | u64 sf; 737 | 738 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h, const u64 i, const u64 j, const u64 k, const u64 l, const u64 m, const u64 n, const u64 o, const u64 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 739 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 740 | 741 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { } 742 | inline __device__ ~u64x (void) { } 743 | }; 744 | 745 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); } 746 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); } 747 | 748 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; } 749 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; } 750 | 751 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; } 752 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; } 753 | 754 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; } 755 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; } 756 | 757 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; } 758 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa 
+= b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; } 759 | 760 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; } 761 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; } 762 | 763 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; } 764 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; } 765 | 766 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; } 767 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; } 768 | 769 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; } 770 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; } 771 | 772 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); } 773 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); } 774 | 775 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); } 776 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> 
b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); } 777 | 778 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); } 779 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); } 780 | 781 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); } 782 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); } 783 | 784 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); } 785 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); } 786 | 787 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); } 788 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); } 789 | 790 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); } 791 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); } 792 | 793 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); } 794 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), 
(a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); } 795 | 796 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); } 797 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); } 798 | 799 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); } 800 | 801 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); } 802 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); } 803 | 804 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; } 805 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; } 806 | 807 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; } 808 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; } 809 | 810 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; } 811 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; } 812 | 813 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; 
a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; } 814 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; } 815 | 816 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; } 817 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; } 818 | 819 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; } 820 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; } 821 | 822 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; } 823 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; } 824 | 825 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; } 826 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; } 827 | 828 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); } 829 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); } 830 | 831 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); } 832 
| inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); } 833 | 834 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); } 835 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); } 836 | 837 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); } 838 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); } 839 | 840 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); } 841 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); } 842 | 843 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); } 844 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); } 845 | 846 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); } 847 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); } 848 | 849 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * 
b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); } 850 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); } 851 | 852 | inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); } 853 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); } 854 | 855 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); } 856 | 857 | #endif 858 | 859 | typedef __device_builtin__ struct u8x u8x; 860 | typedef __device_builtin__ struct u16x u16x; 861 | typedef __device_builtin__ struct u32x u32x; 862 | typedef __device_builtin__ struct u64x u64x; 863 | 864 | #define make_u8x u8x 865 | #define make_u16x u16x 866 | #define make_u32x u32x 867 | #define make_u64x u64x 868 | 869 | #else 870 | typedef VTYPE(uchar, VECT_SIZE) u8x; 871 | typedef VTYPE(ushort, VECT_SIZE) u16x; 872 | typedef VTYPE(uint, VECT_SIZE) u32x; 873 | typedef VTYPE(ullong, VECT_SIZE) u64x; 874 | 875 | #ifndef IS_METAL 876 | #define make_u8x (u8x) 877 | #define make_u16x (u16x) 878 | #define make_u32x (u32x) 879 | #define make_u64x (u64x) 880 | #else 881 | #define make_u8x u8x 882 | #define make_u16x u16x 883 | #define make_u32x u32x 884 | #define make_u64x u64x 885 | #endif 886 | 887 | #endif 888 | #endif 889 | 890 | // unions 891 | 892 | typedef union vconv32 893 | { 894 | u64 v32; 895 | 896 | struct 897 | { 898 | u16 a; 899 | u16 b; 900 | 901 | } v16; 902 | 903 | struct 904 | { 905 | u8 a; 906 | u8 b; 907 | u8 c; 908 | u8 d; 909 | 910 | } v8; 911 | 912 | } vconv32_t; 913 | 914 | typedef union vconv64 915 | { 916 | u64 v64; 917 | 918 | struct 919 | { 920 | u32 a; 921 | u32 b; 922 | 923 | } v32; 924 | 925 | struct 926 | { 927 | u16 a; 928 | u16 b; 929 | u16 c; 930 | u16 d; 931 | 932 | } v16; 933 | 934 | struct 935 | { 936 | u8 a; 937 | u8 b; 938 | u8 c; 939 | u8 d; 940 | u8 e; 941 | u8 f; 942 | u8 g; 943 | u8 h; 944 | 945 | } v8; 946 | 947 | } vconv64_t; 948 | 949 | /** 950 | * Author......: See docs/credits.txt 951 | * License.....: MIT 952 | */ 953 | 954 | typedef enum siphash_constants 955 | { 956 | SIPHASHM_0=0x736f6d6570736575UL, 957 | SIPHASHM_1=0x646f72616e646f6dUL, 958 | SIPHASHM_2=0x6c7967656e657261UL, 959 | SIPHASHM_3=0x7465646279746573UL 960 | 961 | } siphash_constants_t; 962 | 963 | typedef enum bcrypt_constants 964 | { 965 | BCRYPTM_0=0x4f727068U, 966 | BCRYPTM_1=0x65616e42U, 967 | BCRYPTM_2=0x65686f6cU, 968 | BCRYPTM_3=0x64657253U, 969 | BCRYPTM_4=0x63727944U, 970 | BCRYPTM_5=0x6f756274U 971 | 972 | } bcrypt_constants_t; 973 | 974 | typedef enum md4_constants 975 | { 976 | MD4M_A=0x67452301U, 977 | MD4M_B=0xefcdab89U, 978 | MD4M_C=0x98badcfeU, 979 | 
MD4M_D=0x10325476U, 980 | 981 | MD4S00=3, 982 | MD4S01=7, 983 | MD4S02=11, 984 | MD4S03=19, 985 | MD4S10=3, 986 | MD4S11=5, 987 | MD4S12=9, 988 | MD4S13=13, 989 | MD4S20=3, 990 | MD4S21=9, 991 | MD4S22=11, 992 | MD4S23=15, 993 | 994 | MD4C00=0x00000000U, 995 | MD4C01=0x5a827999U, 996 | MD4C02=0x6ed9eba1U 997 | 998 | } md4_constants_t; 999 | 1000 | typedef enum md5_constants 1001 | { 1002 | MD5M_A=0x67452301U, 1003 | MD5M_B=0xefcdab89U, 1004 | MD5M_C=0x98badcfeU, 1005 | MD5M_D=0x10325476U, 1006 | 1007 | MD5S00=7, 1008 | MD5S01=12, 1009 | MD5S02=17, 1010 | MD5S03=22, 1011 | MD5S10=5, 1012 | MD5S11=9, 1013 | MD5S12=14, 1014 | MD5S13=20, 1015 | MD5S20=4, 1016 | MD5S21=11, 1017 | MD5S22=16, 1018 | MD5S23=23, 1019 | MD5S30=6, 1020 | MD5S31=10, 1021 | MD5S32=15, 1022 | MD5S33=21, 1023 | 1024 | MD5C00=0xd76aa478U, 1025 | MD5C01=0xe8c7b756U, 1026 | MD5C02=0x242070dbU, 1027 | MD5C03=0xc1bdceeeU, 1028 | MD5C04=0xf57c0fafU, 1029 | MD5C05=0x4787c62aU, 1030 | MD5C06=0xa8304613U, 1031 | MD5C07=0xfd469501U, 1032 | MD5C08=0x698098d8U, 1033 | MD5C09=0x8b44f7afU, 1034 | MD5C0a=0xffff5bb1U, 1035 | MD5C0b=0x895cd7beU, 1036 | MD5C0c=0x6b901122U, 1037 | MD5C0d=0xfd987193U, 1038 | MD5C0e=0xa679438eU, 1039 | MD5C0f=0x49b40821U, 1040 | MD5C10=0xf61e2562U, 1041 | MD5C11=0xc040b340U, 1042 | MD5C12=0x265e5a51U, 1043 | MD5C13=0xe9b6c7aaU, 1044 | MD5C14=0xd62f105dU, 1045 | MD5C15=0x02441453U, 1046 | MD5C16=0xd8a1e681U, 1047 | MD5C17=0xe7d3fbc8U, 1048 | MD5C18=0x21e1cde6U, 1049 | MD5C19=0xc33707d6U, 1050 | MD5C1a=0xf4d50d87U, 1051 | MD5C1b=0x455a14edU, 1052 | MD5C1c=0xa9e3e905U, 1053 | MD5C1d=0xfcefa3f8U, 1054 | MD5C1e=0x676f02d9U, 1055 | MD5C1f=0x8d2a4c8aU, 1056 | MD5C20=0xfffa3942U, 1057 | MD5C21=0x8771f681U, 1058 | MD5C22=0x6d9d6122U, 1059 | MD5C23=0xfde5380cU, 1060 | MD5C24=0xa4beea44U, 1061 | MD5C25=0x4bdecfa9U, 1062 | MD5C26=0xf6bb4b60U, 1063 | MD5C27=0xbebfbc70U, 1064 | MD5C28=0x289b7ec6U, 1065 | MD5C29=0xeaa127faU, 1066 | MD5C2a=0xd4ef3085U, 1067 | MD5C2b=0x04881d05U, 1068 | MD5C2c=0xd9d4d039U, 1069 | MD5C2d=0xe6db99e5U, 1070 | MD5C2e=0x1fa27cf8U, 1071 | MD5C2f=0xc4ac5665U, 1072 | MD5C30=0xf4292244U, 1073 | MD5C31=0x432aff97U, 1074 | MD5C32=0xab9423a7U, 1075 | MD5C33=0xfc93a039U, 1076 | MD5C34=0x655b59c3U, 1077 | MD5C35=0x8f0ccc92U, 1078 | MD5C36=0xffeff47dU, 1079 | MD5C37=0x85845dd1U, 1080 | MD5C38=0x6fa87e4fU, 1081 | MD5C39=0xfe2ce6e0U, 1082 | MD5C3a=0xa3014314U, 1083 | MD5C3b=0x4e0811a1U, 1084 | MD5C3c=0xf7537e82U, 1085 | MD5C3d=0xbd3af235U, 1086 | MD5C3e=0x2ad7d2bbU, 1087 | MD5C3f=0xeb86d391U 1088 | 1089 | } md5_constants_t; 1090 | 1091 | typedef enum sha1_constants 1092 | { 1093 | SHA1M_A=0x67452301U, 1094 | SHA1M_B=0xefcdab89U, 1095 | SHA1M_C=0x98badcfeU, 1096 | SHA1M_D=0x10325476U, 1097 | SHA1M_E=0xc3d2e1f0U, 1098 | 1099 | SHA1C00=0x5a827999U, 1100 | SHA1C01=0x6ed9eba1U, 1101 | SHA1C02=0x8f1bbcdcU, 1102 | SHA1C03=0xca62c1d6U 1103 | 1104 | } sha1_constants_t; 1105 | 1106 | typedef enum sha2_32_constants 1107 | { 1108 | // SHA-224 Initial Hash Values 1109 | SHA224M_A=0xc1059ed8U, 1110 | SHA224M_B=0x367cd507U, 1111 | SHA224M_C=0x3070dd17U, 1112 | SHA224M_D=0xf70e5939U, 1113 | SHA224M_E=0xffc00b31U, 1114 | SHA224M_F=0x68581511U, 1115 | SHA224M_G=0x64f98fa7U, 1116 | SHA224M_H=0xbefa4fa4U, 1117 | 1118 | // SHA-224 Constants 1119 | SHA224C00=0x428a2f98U, 1120 | SHA224C01=0x71374491U, 1121 | SHA224C02=0xb5c0fbcfU, 1122 | SHA224C03=0xe9b5dba5U, 1123 | SHA224C04=0x3956c25bU, 1124 | SHA224C05=0x59f111f1U, 1125 | SHA224C06=0x923f82a4U, 1126 | SHA224C07=0xab1c5ed5U, 1127 | SHA224C08=0xd807aa98U, 1128 | 
SHA224C09=0x12835b01U, 1129 | SHA224C0a=0x243185beU, 1130 | SHA224C0b=0x550c7dc3U, 1131 | SHA224C0c=0x72be5d74U, 1132 | SHA224C0d=0x80deb1feU, 1133 | SHA224C0e=0x9bdc06a7U, 1134 | SHA224C0f=0xc19bf174U, 1135 | SHA224C10=0xe49b69c1U, 1136 | SHA224C11=0xefbe4786U, 1137 | SHA224C12=0x0fc19dc6U, 1138 | SHA224C13=0x240ca1ccU, 1139 | SHA224C14=0x2de92c6fU, 1140 | SHA224C15=0x4a7484aaU, 1141 | SHA224C16=0x5cb0a9dcU, 1142 | SHA224C17=0x76f988daU, 1143 | SHA224C18=0x983e5152U, 1144 | SHA224C19=0xa831c66dU, 1145 | SHA224C1a=0xb00327c8U, 1146 | SHA224C1b=0xbf597fc7U, 1147 | SHA224C1c=0xc6e00bf3U, 1148 | SHA224C1d=0xd5a79147U, 1149 | SHA224C1e=0x06ca6351U, 1150 | SHA224C1f=0x14292967U, 1151 | SHA224C20=0x27b70a85U, 1152 | SHA224C21=0x2e1b2138U, 1153 | SHA224C22=0x4d2c6dfcU, 1154 | SHA224C23=0x53380d13U, 1155 | SHA224C24=0x650a7354U, 1156 | SHA224C25=0x766a0abbU, 1157 | SHA224C26=0x81c2c92eU, 1158 | SHA224C27=0x92722c85U, 1159 | SHA224C28=0xa2bfe8a1U, 1160 | SHA224C29=0xa81a664bU, 1161 | SHA224C2a=0xc24b8b70U, 1162 | SHA224C2b=0xc76c51a3U, 1163 | SHA224C2c=0xd192e819U, 1164 | SHA224C2d=0xd6990624U, 1165 | SHA224C2e=0xf40e3585U, 1166 | SHA224C2f=0x106aa070U, 1167 | SHA224C30=0x19a4c116U, 1168 | SHA224C31=0x1e376c08U, 1169 | SHA224C32=0x2748774cU, 1170 | SHA224C33=0x34b0bcb5U, 1171 | SHA224C34=0x391c0cb3U, 1172 | SHA224C35=0x4ed8aa4aU, 1173 | SHA224C36=0x5b9cca4fU, 1174 | SHA224C37=0x682e6ff3U, 1175 | SHA224C38=0x748f82eeU, 1176 | SHA224C39=0x78a5636fU, 1177 | SHA224C3a=0x84c87814U, 1178 | SHA224C3b=0x8cc70208U, 1179 | SHA224C3c=0x90befffaU, 1180 | SHA224C3d=0xa4506cebU, 1181 | SHA224C3e=0xbef9a3f7U, 1182 | SHA224C3f=0xc67178f2U, 1183 | 1184 | // SHA-256 Initial Hash Values 1185 | SHA256M_A=0x6a09e667U, 1186 | SHA256M_B=0xbb67ae85U, 1187 | SHA256M_C=0x3c6ef372U, 1188 | SHA256M_D=0xa54ff53aU, 1189 | SHA256M_E=0x510e527fU, 1190 | SHA256M_F=0x9b05688cU, 1191 | SHA256M_G=0x1f83d9abU, 1192 | SHA256M_H=0x5be0cd19U, 1193 | 1194 | // SHA-256 Constants 1195 | SHA256C00=0x428a2f98U, 1196 | SHA256C01=0x71374491U, 1197 | SHA256C02=0xb5c0fbcfU, 1198 | SHA256C03=0xe9b5dba5U, 1199 | SHA256C04=0x3956c25bU, 1200 | SHA256C05=0x59f111f1U, 1201 | SHA256C06=0x923f82a4U, 1202 | SHA256C07=0xab1c5ed5U, 1203 | SHA256C08=0xd807aa98U, 1204 | SHA256C09=0x12835b01U, 1205 | SHA256C0a=0x243185beU, 1206 | SHA256C0b=0x550c7dc3U, 1207 | SHA256C0c=0x72be5d74U, 1208 | SHA256C0d=0x80deb1feU, 1209 | SHA256C0e=0x9bdc06a7U, 1210 | SHA256C0f=0xc19bf174U, 1211 | SHA256C10=0xe49b69c1U, 1212 | SHA256C11=0xefbe4786U, 1213 | SHA256C12=0x0fc19dc6U, 1214 | SHA256C13=0x240ca1ccU, 1215 | SHA256C14=0x2de92c6fU, 1216 | SHA256C15=0x4a7484aaU, 1217 | SHA256C16=0x5cb0a9dcU, 1218 | SHA256C17=0x76f988daU, 1219 | SHA256C18=0x983e5152U, 1220 | SHA256C19=0xa831c66dU, 1221 | SHA256C1a=0xb00327c8U, 1222 | SHA256C1b=0xbf597fc7U, 1223 | SHA256C1c=0xc6e00bf3U, 1224 | SHA256C1d=0xd5a79147U, 1225 | SHA256C1e=0x06ca6351U, 1226 | SHA256C1f=0x14292967U, 1227 | SHA256C20=0x27b70a85U, 1228 | SHA256C21=0x2e1b2138U, 1229 | SHA256C22=0x4d2c6dfcU, 1230 | SHA256C23=0x53380d13U, 1231 | SHA256C24=0x650a7354U, 1232 | SHA256C25=0x766a0abbU, 1233 | SHA256C26=0x81c2c92eU, 1234 | SHA256C27=0x92722c85U, 1235 | SHA256C28=0xa2bfe8a1U, 1236 | SHA256C29=0xa81a664bU, 1237 | SHA256C2a=0xc24b8b70U, 1238 | SHA256C2b=0xc76c51a3U, 1239 | SHA256C2c=0xd192e819U, 1240 | SHA256C2d=0xd6990624U, 1241 | SHA256C2e=0xf40e3585U, 1242 | SHA256C2f=0x106aa070U, 1243 | SHA256C30=0x19a4c116U, 1244 | SHA256C31=0x1e376c08U, 1245 | SHA256C32=0x2748774cU, 1246 | SHA256C33=0x34b0bcb5U, 1247 | SHA256C34=0x391c0cb3U, 
1248 | SHA256C35=0x4ed8aa4aU, 1249 | SHA256C36=0x5b9cca4fU, 1250 | SHA256C37=0x682e6ff3U, 1251 | SHA256C38=0x748f82eeU, 1252 | SHA256C39=0x78a5636fU, 1253 | SHA256C3a=0x84c87814U, 1254 | SHA256C3b=0x8cc70208U, 1255 | SHA256C3c=0x90befffaU, 1256 | SHA256C3d=0xa4506cebU, 1257 | SHA256C3e=0xbef9a3f7U, 1258 | SHA256C3f=0xc67178f2U, 1259 | 1260 | } sha2_32_constants_t; 1261 | 1262 | typedef enum sha2_64_constants 1263 | { 1264 | // SHA-384 Initial Hash Values 1265 | SHA384M_A=0xcbbb9d5dc1059ed8UL, 1266 | SHA384M_B=0x629a292a367cd507UL, 1267 | SHA384M_C=0x9159015a3070dd17UL, 1268 | SHA384M_D=0x152fecd8f70e5939UL, 1269 | SHA384M_E=0x67332667ffc00b31UL, 1270 | SHA384M_F=0x8eb44a8768581511UL, 1271 | SHA384M_G=0xdb0c2e0d64f98fa7UL, 1272 | SHA384M_H=0x47b5481dbefa4fa4UL, 1273 | 1274 | // SHA-512 Initial Hash Values 1275 | SHA512M_A=0x6a09e667f3bcc908UL, 1276 | SHA512M_B=0xbb67ae8584caa73bUL, 1277 | SHA512M_C=0x3c6ef372fe94f82bUL, 1278 | SHA512M_D=0xa54ff53a5f1d36f1UL, 1279 | SHA512M_E=0x510e527fade682d1UL, 1280 | SHA512M_F=0x9b05688c2b3e6c1fUL, 1281 | SHA512M_G=0x1f83d9abfb41bd6bUL, 1282 | SHA512M_H=0x5be0cd19137e2179UL, 1283 | 1284 | // SHA-384/512 Constants 1285 | SHA512C00=0x428a2f98d728ae22UL, 1286 | SHA512C01=0x7137449123ef65cdUL, 1287 | SHA512C02=0xb5c0fbcfec4d3b2fUL, 1288 | SHA512C03=0xe9b5dba58189dbbcUL, 1289 | SHA512C04=0x3956c25bf348b538UL, 1290 | SHA512C05=0x59f111f1b605d019UL, 1291 | SHA512C06=0x923f82a4af194f9bUL, 1292 | SHA512C07=0xab1c5ed5da6d8118UL, 1293 | SHA512C08=0xd807aa98a3030242UL, 1294 | SHA512C09=0x12835b0145706fbeUL, 1295 | SHA512C0a=0x243185be4ee4b28cUL, 1296 | SHA512C0b=0x550c7dc3d5ffb4e2UL, 1297 | SHA512C0c=0x72be5d74f27b896fUL, 1298 | SHA512C0d=0x80deb1fe3b1696b1UL, 1299 | SHA512C0e=0x9bdc06a725c71235UL, 1300 | SHA512C0f=0xc19bf174cf692694UL, 1301 | SHA512C10=0xe49b69c19ef14ad2UL, 1302 | SHA512C11=0xefbe4786384f25e3UL, 1303 | SHA512C12=0x0fc19dc68b8cd5b5UL, 1304 | SHA512C13=0x240ca1cc77ac9c65UL, 1305 | SHA512C14=0x2de92c6f592b0275UL, 1306 | SHA512C15=0x4a7484aa6ea6e483UL, 1307 | SHA512C16=0x5cb0a9dcbd41fbd4UL, 1308 | SHA512C17=0x76f988da831153b5UL, 1309 | SHA512C18=0x983e5152ee66dfabUL, 1310 | SHA512C19=0xa831c66d2db43210UL, 1311 | SHA512C1a=0xb00327c898fb213fUL, 1312 | SHA512C1b=0xbf597fc7beef0ee4UL, 1313 | SHA512C1c=0xc6e00bf33da88fc2UL, 1314 | SHA512C1d=0xd5a79147930aa725UL, 1315 | SHA512C1e=0x06ca6351e003826fUL, 1316 | SHA512C1f=0x142929670a0e6e70UL, 1317 | SHA512C20=0x27b70a8546d22ffcUL, 1318 | SHA512C21=0x2e1b21385c26c926UL, 1319 | SHA512C22=0x4d2c6dfc5ac42aedUL, 1320 | SHA512C23=0x53380d139d95b3dfUL, 1321 | SHA512C24=0x650a73548baf63deUL, 1322 | SHA512C25=0x766a0abb3c77b2a8UL, 1323 | SHA512C26=0x81c2c92e47edaee6UL, 1324 | SHA512C27=0x92722c851482353bUL, 1325 | SHA512C28=0xa2bfe8a14cf10364UL, 1326 | SHA512C29=0xa81a664bbc423001UL, 1327 | SHA512C2a=0xc24b8b70d0f89791UL, 1328 | SHA512C2b=0xc76c51a30654be30UL, 1329 | SHA512C2c=0xd192e819d6ef5218UL, 1330 | SHA512C2d=0xd69906245565a910UL, 1331 | SHA512C2e=0xf40e35855771202aUL, 1332 | SHA512C2f=0x106aa07032bbd1b8UL, 1333 | SHA512C30=0x19a4c116b8d2d0c8UL, 1334 | SHA512C31=0x1e376c085141ab53UL, 1335 | SHA512C32=0x2748774cdf8eeb99UL, 1336 | SHA512C33=0x34b0bcb5e19b48a8UL, 1337 | SHA512C34=0x391c0cb3c5c95a63UL, 1338 | SHA512C35=0x4ed8aa4ae3418acbUL, 1339 | SHA512C36=0x5b9cca4f7763e373UL, 1340 | SHA512C37=0x682e6ff3d6b2b8a3UL, 1341 | SHA512C38=0x748f82ee5defb2fcUL, 1342 | SHA512C39=0x78a5636f43172f60UL, 1343 | SHA512C3a=0x84c87814a1f0ab72UL, 1344 | SHA512C3b=0x8cc702081a6439ecUL, 1345 | SHA512C3c=0x90befffa23631e28UL, 1346 | 
SHA512C3d=0xa4506cebde82bde9UL, 1347 | SHA512C3e=0xbef9a3f7b2c67915UL, 1348 | SHA512C3f=0xc67178f2e372532bUL, 1349 | SHA512C40=0xca273eceea26619cUL, 1350 | SHA512C41=0xd186b8c721c0c207UL, 1351 | SHA512C42=0xeada7dd6cde0eb1eUL, 1352 | SHA512C43=0xf57d4f7fee6ed178UL, 1353 | SHA512C44=0x06f067aa72176fbaUL, 1354 | SHA512C45=0x0a637dc5a2c898a6UL, 1355 | SHA512C46=0x113f9804bef90daeUL, 1356 | SHA512C47=0x1b710b35131c471bUL, 1357 | SHA512C48=0x28db77f523047d84UL, 1358 | SHA512C49=0x32caab7b40c72493UL, 1359 | SHA512C4a=0x3c9ebe0a15c9bebcUL, 1360 | SHA512C4b=0x431d67c49c100d4cUL, 1361 | SHA512C4c=0x4cc5d4becb3e42b6UL, 1362 | SHA512C4d=0x597f299cfc657e2aUL, 1363 | SHA512C4e=0x5fcb6fab3ad6faecUL, 1364 | SHA512C4f=0x6c44198c4a475817UL 1365 | 1366 | } sha2_64_constants_t; 1367 | 1368 | typedef enum ripemd160_constants 1369 | { 1370 | RIPEMD160M_A=0x67452301U, 1371 | RIPEMD160M_B=0xefcdab89U, 1372 | RIPEMD160M_C=0x98badcfeU, 1373 | RIPEMD160M_D=0x10325476U, 1374 | RIPEMD160M_E=0xc3d2e1f0U, 1375 | 1376 | RIPEMD160C00=0x00000000U, 1377 | RIPEMD160C10=0x5a827999U, 1378 | RIPEMD160C20=0x6ed9eba1U, 1379 | RIPEMD160C30=0x8f1bbcdcU, 1380 | RIPEMD160C40=0xa953fd4eU, 1381 | RIPEMD160C50=0x50a28be6U, 1382 | RIPEMD160C60=0x5c4dd124U, 1383 | RIPEMD160C70=0x6d703ef3U, 1384 | RIPEMD160C80=0x7a6d76e9U, 1385 | RIPEMD160C90=0x00000000U, 1386 | 1387 | RIPEMD160S00=11, 1388 | RIPEMD160S01=14, 1389 | RIPEMD160S02=15, 1390 | RIPEMD160S03=12, 1391 | RIPEMD160S04=5, 1392 | RIPEMD160S05=8, 1393 | RIPEMD160S06=7, 1394 | RIPEMD160S07=9, 1395 | RIPEMD160S08=11, 1396 | RIPEMD160S09=13, 1397 | RIPEMD160S0A=14, 1398 | RIPEMD160S0B=15, 1399 | RIPEMD160S0C=6, 1400 | RIPEMD160S0D=7, 1401 | RIPEMD160S0E=9, 1402 | RIPEMD160S0F=8, 1403 | 1404 | RIPEMD160S10=7, 1405 | RIPEMD160S11=6, 1406 | RIPEMD160S12=8, 1407 | RIPEMD160S13=13, 1408 | RIPEMD160S14=11, 1409 | RIPEMD160S15=9, 1410 | RIPEMD160S16=7, 1411 | RIPEMD160S17=15, 1412 | RIPEMD160S18=7, 1413 | RIPEMD160S19=12, 1414 | RIPEMD160S1A=15, 1415 | RIPEMD160S1B=9, 1416 | RIPEMD160S1C=11, 1417 | RIPEMD160S1D=7, 1418 | RIPEMD160S1E=13, 1419 | RIPEMD160S1F=12, 1420 | 1421 | RIPEMD160S20=11, 1422 | RIPEMD160S21=13, 1423 | RIPEMD160S22=6, 1424 | RIPEMD160S23=7, 1425 | RIPEMD160S24=14, 1426 | RIPEMD160S25=9, 1427 | RIPEMD160S26=13, 1428 | RIPEMD160S27=15, 1429 | RIPEMD160S28=14, 1430 | RIPEMD160S29=8, 1431 | RIPEMD160S2A=13, 1432 | RIPEMD160S2B=6, 1433 | RIPEMD160S2C=5, 1434 | RIPEMD160S2D=12, 1435 | RIPEMD160S2E=7, 1436 | RIPEMD160S2F=5, 1437 | 1438 | RIPEMD160S30=11, 1439 | RIPEMD160S31=12, 1440 | RIPEMD160S32=14, 1441 | RIPEMD160S33=15, 1442 | RIPEMD160S34=14, 1443 | RIPEMD160S35=15, 1444 | RIPEMD160S36=9, 1445 | RIPEMD160S37=8, 1446 | RIPEMD160S38=9, 1447 | RIPEMD160S39=14, 1448 | RIPEMD160S3A=5, 1449 | RIPEMD160S3B=6, 1450 | RIPEMD160S3C=8, 1451 | RIPEMD160S3D=6, 1452 | RIPEMD160S3E=5, 1453 | RIPEMD160S3F=12, 1454 | 1455 | RIPEMD160S40=9, 1456 | RIPEMD160S41=15, 1457 | RIPEMD160S42=5, 1458 | RIPEMD160S43=11, 1459 | RIPEMD160S44=6, 1460 | RIPEMD160S45=8, 1461 | RIPEMD160S46=13, 1462 | RIPEMD160S47=12, 1463 | RIPEMD160S48=5, 1464 | RIPEMD160S49=12, 1465 | RIPEMD160S4A=13, 1466 | RIPEMD160S4B=14, 1467 | RIPEMD160S4C=11, 1468 | RIPEMD160S4D=8, 1469 | RIPEMD160S4E=5, 1470 | RIPEMD160S4F=6, 1471 | 1472 | RIPEMD160S50=8, 1473 | RIPEMD160S51=9, 1474 | RIPEMD160S52=9, 1475 | RIPEMD160S53=11, 1476 | RIPEMD160S54=13, 1477 | RIPEMD160S55=15, 1478 | RIPEMD160S56=15, 1479 | RIPEMD160S57=5, 1480 | RIPEMD160S58=7, 1481 | RIPEMD160S59=7, 1482 | RIPEMD160S5A=8, 1483 | RIPEMD160S5B=11, 1484 | RIPEMD160S5C=14, 
1485 | RIPEMD160S5D=14, 1486 | RIPEMD160S5E=12, 1487 | RIPEMD160S5F=6, 1488 | 1489 | RIPEMD160S60=9, 1490 | RIPEMD160S61=13, 1491 | RIPEMD160S62=15, 1492 | RIPEMD160S63=7, 1493 | RIPEMD160S64=12, 1494 | RIPEMD160S65=8, 1495 | RIPEMD160S66=9, 1496 | RIPEMD160S67=11, 1497 | RIPEMD160S68=7, 1498 | RIPEMD160S69=7, 1499 | RIPEMD160S6A=12, 1500 | RIPEMD160S6B=7, 1501 | RIPEMD160S6C=6, 1502 | RIPEMD160S6D=15, 1503 | RIPEMD160S6E=13, 1504 | RIPEMD160S6F=11, 1505 | 1506 | RIPEMD160S70=9, 1507 | RIPEMD160S71=7, 1508 | RIPEMD160S72=15, 1509 | RIPEMD160S73=11, 1510 | RIPEMD160S74=8, 1511 | RIPEMD160S75=6, 1512 | RIPEMD160S76=6, 1513 | RIPEMD160S77=14, 1514 | RIPEMD160S78=12, 1515 | RIPEMD160S79=13, 1516 | RIPEMD160S7A=5, 1517 | RIPEMD160S7B=14, 1518 | RIPEMD160S7C=13, 1519 | RIPEMD160S7D=13, 1520 | RIPEMD160S7E=7, 1521 | RIPEMD160S7F=5, 1522 | 1523 | RIPEMD160S80=15, 1524 | RIPEMD160S81=5, 1525 | RIPEMD160S82=8, 1526 | RIPEMD160S83=11, 1527 | RIPEMD160S84=14, 1528 | RIPEMD160S85=14, 1529 | RIPEMD160S86=6, 1530 | RIPEMD160S87=14, 1531 | RIPEMD160S88=6, 1532 | RIPEMD160S89=9, 1533 | RIPEMD160S8A=12, 1534 | RIPEMD160S8B=9, 1535 | RIPEMD160S8C=12, 1536 | RIPEMD160S8D=5, 1537 | RIPEMD160S8E=15, 1538 | RIPEMD160S8F=8, 1539 | 1540 | RIPEMD160S90=8, 1541 | RIPEMD160S91=5, 1542 | RIPEMD160S92=12, 1543 | RIPEMD160S93=9, 1544 | RIPEMD160S94=12, 1545 | RIPEMD160S95=5, 1546 | RIPEMD160S96=14, 1547 | RIPEMD160S97=6, 1548 | RIPEMD160S98=8, 1549 | RIPEMD160S99=13, 1550 | RIPEMD160S9A=6, 1551 | RIPEMD160S9B=5, 1552 | RIPEMD160S9C=15, 1553 | RIPEMD160S9D=13, 1554 | RIPEMD160S9E=11, 1555 | RIPEMD160S9F=11 1556 | 1557 | } ripemd160_constants_t; 1558 | 1559 | typedef enum keccak_constants 1560 | { 1561 | KECCAK_RNDC_00=0x0000000000000001UL, 1562 | KECCAK_RNDC_01=0x0000000000008082UL, 1563 | KECCAK_RNDC_02=0x800000000000808aUL, 1564 | KECCAK_RNDC_03=0x8000000080008000UL, 1565 | KECCAK_RNDC_04=0x000000000000808bUL, 1566 | KECCAK_RNDC_05=0x0000000080000001UL, 1567 | KECCAK_RNDC_06=0x8000000080008081UL, 1568 | KECCAK_RNDC_07=0x8000000000008009UL, 1569 | KECCAK_RNDC_08=0x000000000000008aUL, 1570 | KECCAK_RNDC_09=0x0000000000000088UL, 1571 | KECCAK_RNDC_10=0x0000000080008009UL, 1572 | KECCAK_RNDC_11=0x000000008000000aUL, 1573 | KECCAK_RNDC_12=0x000000008000808bUL, 1574 | KECCAK_RNDC_13=0x800000000000008bUL, 1575 | KECCAK_RNDC_14=0x8000000000008089UL, 1576 | KECCAK_RNDC_15=0x8000000000008003UL, 1577 | KECCAK_RNDC_16=0x8000000000008002UL, 1578 | KECCAK_RNDC_17=0x8000000000000080UL, 1579 | KECCAK_RNDC_18=0x000000000000800aUL, 1580 | KECCAK_RNDC_19=0x800000008000000aUL, 1581 | KECCAK_RNDC_20=0x8000000080008081UL, 1582 | KECCAK_RNDC_21=0x8000000000008080UL, 1583 | KECCAK_RNDC_22=0x0000000080000001UL, 1584 | KECCAK_RNDC_23=0x8000000080008008UL, 1585 | 1586 | KECCAK_PILN_00=10, 1587 | KECCAK_PILN_01=7, 1588 | KECCAK_PILN_02=11, 1589 | KECCAK_PILN_03=17, 1590 | KECCAK_PILN_04=18, 1591 | KECCAK_PILN_05=3, 1592 | KECCAK_PILN_06=5, 1593 | KECCAK_PILN_07=16, 1594 | KECCAK_PILN_08=8, 1595 | KECCAK_PILN_09=21, 1596 | KECCAK_PILN_10=24, 1597 | KECCAK_PILN_11=4, 1598 | KECCAK_PILN_12=15, 1599 | KECCAK_PILN_13=23, 1600 | KECCAK_PILN_14=19, 1601 | KECCAK_PILN_15=13, 1602 | KECCAK_PILN_16=12, 1603 | KECCAK_PILN_17=2, 1604 | KECCAK_PILN_18=20, 1605 | KECCAK_PILN_19=14, 1606 | KECCAK_PILN_20=22, 1607 | KECCAK_PILN_21=9, 1608 | KECCAK_PILN_22=6, 1609 | KECCAK_PILN_23=1, 1610 | 1611 | KECCAK_ROTC_00=1, 1612 | KECCAK_ROTC_01=3, 1613 | KECCAK_ROTC_02=6, 1614 | KECCAK_ROTC_03=10, 1615 | KECCAK_ROTC_04=15, 1616 | KECCAK_ROTC_05=21, 1617 | 
KECCAK_ROTC_06=28, 1618 | KECCAK_ROTC_07=36, 1619 | KECCAK_ROTC_08=45, 1620 | KECCAK_ROTC_09=55, 1621 | KECCAK_ROTC_10=2, 1622 | KECCAK_ROTC_11=14, 1623 | KECCAK_ROTC_12=27, 1624 | KECCAK_ROTC_13=41, 1625 | KECCAK_ROTC_14=56, 1626 | KECCAK_ROTC_15=8, 1627 | KECCAK_ROTC_16=25, 1628 | KECCAK_ROTC_17=43, 1629 | KECCAK_ROTC_18=62, 1630 | KECCAK_ROTC_19=18, 1631 | KECCAK_ROTC_20=39, 1632 | KECCAK_ROTC_21=61, 1633 | KECCAK_ROTC_22=20, 1634 | KECCAK_ROTC_23=44, 1635 | 1636 | } keccak_constants_t; 1637 | 1638 | typedef enum mysql323_constants 1639 | { 1640 | MYSQL323_A=0x50305735U, 1641 | MYSQL323_B=0x12345671U 1642 | 1643 | } mysql323_constants_t; 1644 | 1645 | typedef enum fortigate_constants 1646 | { 1647 | FORTIGATE_A=0x2eba88a3U, 1648 | FORTIGATE_B=0x4ab04c42U, 1649 | FORTIGATE_C=0xc1307953U, 1650 | FORTIGATE_D=0x3fcc0731U, 1651 | FORTIGATE_E=0x299032a1U, 1652 | FORTIGATE_F=0x705b81a9U 1653 | 1654 | } fortigate_constants_t; 1655 | 1656 | typedef enum blake2b_constants 1657 | { 1658 | BLAKE2B_IV_00=0x6a09e667f3bcc908UL, 1659 | BLAKE2B_IV_01=0xbb67ae8584caa73bUL, 1660 | BLAKE2B_IV_02=0x3c6ef372fe94f82bUL, 1661 | BLAKE2B_IV_03=0xa54ff53a5f1d36f1UL, 1662 | BLAKE2B_IV_04=0x510e527fade682d1UL, 1663 | BLAKE2B_IV_05=0x9b05688c2b3e6c1fUL, 1664 | BLAKE2B_IV_06=0x1f83d9abfb41bd6bUL, 1665 | BLAKE2B_IV_07=0x5be0cd19137e2179UL 1666 | 1667 | } blake2b_constants_t; 1668 | 1669 | typedef enum blake2s_constants 1670 | { 1671 | BLAKE2S_IV_00=0x6a09e667, 1672 | BLAKE2S_IV_01=0xbb67ae85, 1673 | BLAKE2S_IV_02=0x3c6ef372, 1674 | BLAKE2S_IV_03=0xa54ff53a, 1675 | BLAKE2S_IV_04=0x510e527f, 1676 | BLAKE2S_IV_05=0x9b05688c, 1677 | BLAKE2S_IV_06=0x1f83d9ab, 1678 | BLAKE2S_IV_07=0x5be0cd19 1679 | 1680 | } blake2s_constants_t; 1681 | 1682 | typedef enum sm3_constants 1683 | { 1684 | // SM3 Initial Hash Values 1685 | SM3_IV_A=0x7380166fUL, 1686 | SM3_IV_B=0x4914b2b9UL, 1687 | SM3_IV_C=0x172442d7UL, 1688 | SM3_IV_D=0xda8a0600UL, 1689 | SM3_IV_E=0xa96f30bcUL, 1690 | SM3_IV_F=0x163138aaUL, 1691 | SM3_IV_G=0xe38dee4dUL, 1692 | SM3_IV_H=0xb0fb0e4eUL, 1693 | 1694 | // SM3 Tj round constants 1695 | SM3_T00=0x79CC4519UL, 1696 | SM3_T01=0xF3988A32UL, 1697 | SM3_T02=0xE7311465UL, 1698 | SM3_T03=0xCE6228CBUL, 1699 | SM3_T04=0x9CC45197UL, 1700 | SM3_T05=0x3988A32FUL, 1701 | SM3_T06=0x7311465EUL, 1702 | SM3_T07=0xE6228CBCUL, 1703 | SM3_T08=0xCC451979UL, 1704 | SM3_T09=0x988A32F3UL, 1705 | SM3_T10=0x311465E7UL, 1706 | SM3_T11=0x6228CBCEUL, 1707 | SM3_T12=0xC451979CUL, 1708 | SM3_T13=0x88A32F39UL, 1709 | SM3_T14=0x11465E73UL, 1710 | SM3_T15=0x228CBCE6UL, 1711 | SM3_T16=0x9D8A7A87UL, 1712 | SM3_T17=0x3B14F50FUL, 1713 | SM3_T18=0x7629EA1EUL, 1714 | SM3_T19=0xEC53D43CUL, 1715 | SM3_T20=0xD8A7A879UL, 1716 | SM3_T21=0xB14F50F3UL, 1717 | SM3_T22=0x629EA1E7UL, 1718 | SM3_T23=0xC53D43CEUL, 1719 | SM3_T24=0x8A7A879DUL, 1720 | SM3_T25=0x14F50F3BUL, 1721 | SM3_T26=0x29EA1E76UL, 1722 | SM3_T27=0x53D43CECUL, 1723 | SM3_T28=0xA7A879D8UL, 1724 | SM3_T29=0x4F50F3B1UL, 1725 | SM3_T30=0x9EA1E762UL, 1726 | SM3_T31=0x3D43CEC5UL, 1727 | SM3_T32=0x7A879D8AUL, 1728 | SM3_T33=0xF50F3B14UL, 1729 | SM3_T34=0xEA1E7629UL, 1730 | SM3_T35=0xD43CEC53UL, 1731 | SM3_T36=0xA879D8A7UL, 1732 | SM3_T37=0x50F3B14FUL, 1733 | SM3_T38=0xA1E7629EUL, 1734 | SM3_T39=0x43CEC53DUL, 1735 | SM3_T40=0x879D8A7AUL, 1736 | SM3_T41=0x0F3B14F5UL, 1737 | SM3_T42=0x1E7629EAUL, 1738 | SM3_T43=0x3CEC53D4UL, 1739 | SM3_T44=0x79D8A7A8UL, 1740 | SM3_T45=0xF3B14F50UL, 1741 | SM3_T46=0xE7629EA1UL, 1742 | SM3_T47=0xCEC53D43UL, 1743 | SM3_T48=0x9D8A7A87UL, 1744 | SM3_T49=0x3B14F50FUL, 1745 | 
SM3_T50=0x7629EA1EUL, 1746 | SM3_T51=0xEC53D43CUL, 1747 | SM3_T52=0xD8A7A879UL, 1748 | SM3_T53=0xB14F50F3UL, 1749 | SM3_T54=0x629EA1E7UL, 1750 | SM3_T55=0xC53D43CEUL, 1751 | SM3_T56=0x8A7A879DUL, 1752 | SM3_T57=0x14F50F3BUL, 1753 | SM3_T58=0x29EA1E76UL, 1754 | SM3_T59=0x53D43CECUL, 1755 | SM3_T60=0xA7A879D8UL, 1756 | SM3_T61=0x4F50F3B1UL, 1757 | SM3_T62=0x9EA1E762UL, 1758 | SM3_T63=0x3D43CEC5UL 1759 | 1760 | } sm3_constants_t; 1761 | 1762 | typedef enum combinator_mode 1763 | { 1764 | COMBINATOR_MODE_BASE_LEFT = 10001, 1765 | COMBINATOR_MODE_BASE_RIGHT = 10002 1766 | 1767 | } combinator_mode_t; 1768 | 1769 | #ifdef KERNEL_STATIC 1770 | typedef struct digest 1771 | { 1772 | u32 digest_buf[DGST_ELEM]; 1773 | 1774 | } digest_t; 1775 | #endif 1776 | 1777 | typedef struct kernel_param 1778 | { 1779 | // We can only move attributes into this struct which do not use special declarations like __global 1780 | 1781 | u32 bitmap_mask; // 24 1782 | u32 bitmap_shift1; // 25 1783 | u32 bitmap_shift2; // 26 1784 | u32 salt_pos_host; // 27 1785 | u32 loop_pos; // 28 1786 | u32 loop_cnt; // 29 1787 | u32 il_cnt; // 30 1788 | u32 digests_cnt; // 31 1789 | u32 digests_offset_host; // 32 1790 | u32 combs_mode; // 33 1791 | u32 salt_repeat; // 34 1792 | u64 pws_pos; // 35 1793 | u64 gid_max; // 36 1794 | 1795 | } kernel_param_t; 1796 | 1797 | typedef struct salt 1798 | { 1799 | u32 salt_buf[64]; 1800 | u32 salt_buf_pc[64]; 1801 | 1802 | u32 salt_len; 1803 | u32 salt_len_pc; 1804 | u32 salt_iter; 1805 | u32 salt_iter2; 1806 | u32 salt_sign[2]; 1807 | u32 salt_repeats; 1808 | 1809 | u32 orig_pos; 1810 | 1811 | u32 digests_cnt; 1812 | u32 digests_done; 1813 | 1814 | u32 digests_offset; 1815 | 1816 | u32 scrypt_N; 1817 | u32 scrypt_r; 1818 | u32 scrypt_p; 1819 | 1820 | } salt_t; 1821 | 1822 | typedef struct 1823 | { 1824 | u32 key; 1825 | u64 val; 1826 | 1827 | } hcstat_table_t; 1828 | 1829 | typedef struct 1830 | { 1831 | u32 cs_buf[0x100]; 1832 | u32 cs_len; 1833 | 1834 | } cs_t; 1835 | 1836 | typedef struct 1837 | { 1838 | u32 cmds[32]; 1839 | 1840 | } kernel_rule_t; 1841 | 1842 | typedef struct pw 1843 | { 1844 | u32 i[64]; 1845 | 1846 | u32 pw_len; 1847 | 1848 | } pw_t; 1849 | 1850 | typedef struct pw_idx 1851 | { 1852 | u32 off; 1853 | u32 cnt; 1854 | u32 len; 1855 | 1856 | } pw_idx_t; 1857 | 1858 | typedef struct bf 1859 | { 1860 | u32 i; 1861 | 1862 | } bf_t; 1863 | 1864 | typedef struct bs_word 1865 | { 1866 | u32 b[32]; 1867 | 1868 | } bs_word_t; 1869 | 1870 | typedef struct plain 1871 | { 1872 | u64 gidvid; 1873 | u32 il_pos; 1874 | u32 salt_pos; 1875 | u32 digest_pos; 1876 | u32 hash_pos; 1877 | u32 extra1; 1878 | u32 extra2; 1879 | 1880 | } plain_t; 1881 | 1882 | typedef struct keyboard_layout_mapping 1883 | { 1884 | u32 src_char; 1885 | int src_len; 1886 | u32 dst_char; 1887 | int dst_len; 1888 | 1889 | } keyboard_layout_mapping_t; 1890 | 1891 | typedef struct hc_enc 1892 | { 1893 | int pos; // source offset 1894 | 1895 | u32 cbuf; // carry buffer 1896 | int clen; // carry length 1897 | 1898 | } hc_enc_t; 1899 | 1900 | #endif 1901 | --------------------------------------------------------------------------------
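The `keccak_constants_t` values dumped above are the standard Keccak-f[1600] round constants (`KECCAK_RNDC_*`), rotation offsets (`KECCAK_ROTC_*`), and pi-lane order (`KECCAK_PILN_*`). As a point of reference only, here is a minimal, self-contained C sketch of how such tables are typically consumed by the permutation that a Keccak-256 kernel builds on. This is not code from this repository; the names `keccakf`, `keccakf_rndc`, `keccakf_rotc`, `keccakf_piln` and `ROTL64` are illustrative.

```c
#include <stdint.h>

/* Rotate a 64-bit lane left by n bits (n is always 1..62 here). */
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

/* Same values as KECCAK_RNDC_00..23 in keccak_constants_t. */
static const uint64_t keccakf_rndc[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
  0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
  0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

/* Same values as KECCAK_ROTC_00..23 and KECCAK_PILN_00..23. */
static const int keccakf_rotc[24] = {
   1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
  27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44
};

static const int keccakf_piln[24] = {
  10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
  15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1
};

/* Full 24-round Keccak-f[1600] permutation over a 25-lane state. */
static void keccakf(uint64_t st[25])
{
  for (int r = 0; r < 24; r++)
  {
    uint64_t bc[5], t;

    /* Theta: column parities, then mix them back into every lane. */
    for (int i = 0; i < 5; i++)
      bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];

    for (int i = 0; i < 5; i++)
    {
      t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
      for (int j = 0; j < 25; j += 5) st[j + i] ^= t;
    }

    /* Rho and Pi: rotation offsets and lane order come straight from
       the KECCAK_ROTC_* / KECCAK_PILN_* tables. */
    t = st[1];
    for (int i = 0; i < 24; i++)
    {
      int j = keccakf_piln[i];
      bc[0]  = st[j];
      st[j]  = ROTL64(t, keccakf_rotc[i]);
      t      = bc[0];
    }

    /* Chi: non-linear row mixing. */
    for (int j = 0; j < 25; j += 5)
    {
      for (int i = 0; i < 5; i++) bc[i] = st[j + i];
      for (int i = 0; i < 5; i++) st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
    }

    /* Iota: fold in the round constant, i.e. KECCAK_RNDC_<r>. */
    st[0] ^= keccakf_rndc[r];
  }
}
```

The rho/pi step consumes one (ROTC, PILN) pair per lane and iota folds in one RNDC constant per round, which is why each of the three tables above has exactly 24 entries.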