├── keccak ├── keccak256.h └── keccak256.cl ├── settings.0.ini ├── requirements.txt ├── pathutils └── __init__.py ├── README.md ├── LICENSE ├── .vscode └── launch.json ├── kernels ├── gen_pub_key.cl └── gen_eth_addr.cl ├── .gitignore ├── secp256k1 ├── inc_vendor.h ├── inc_ecc_secp256k1.h ├── inc_ecc_secp256k1.cl └── inc_types.h ├── pygeneth.py └── pyvanityeth.py /keccak/keccak256.h: -------------------------------------------------------------------------------- 1 | #define KECCAK256_HASH_LEN 32 2 | #define KECCAK256_BLOCKSIZE (200-KECCAK256_HASH_LEN*2) 3 | #define KECCAK256_STATE_LEN 25 -------------------------------------------------------------------------------- /settings.0.ini: -------------------------------------------------------------------------------- 1 | [settings] 2 | CL_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64 3 | CUDA_DLL_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ecdsa==0.19.1 2 | Mako==1.3.10 3 | MarkupSafe==3.0.2 4 | numpy==2.3.3 5 | platformdirs==4.4.0 6 | pycryptodome==3.23.0 7 | pycuda==2025.1.2 8 | python-decouple==3.8 9 | pytools==2025.2.4 10 | siphash24==1.8 11 | six==1.17.0 12 | typing_extensions==4.15.0 13 | -------------------------------------------------------------------------------- /pathutils/__init__.py: -------------------------------------------------------------------------------- 1 | from decouple import config 2 | import os 3 | 4 | CL_PATH = config('CL_PATH', default='') 5 | if len(CL_PATH) > 0: 6 | os.environ['PATH'] += ';'+CL_PATH 7 | CUDA_DLL_PATH = config('CUDA_DLL_PATH', default='') 8 | if len(CUDA_DLL_PATH) > 0: 9 | os.add_dll_directory(CUDA_DLL_PATH) 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gen_eth 2 | 3 | ## Usage 4 | 5 | Generate vanity Ethereum address with prefix 0x1a2b 6 | 7 | ```bash 8 | python .\pyvanityeth.py --prefix 0x1a2b 9 | ``` 10 | 11 | To display a help message listing the available command line arguments, run: 12 | 13 | ```bash 14 | python .\pyvanityeth.py -h 15 | ``` 16 | 17 | ## Installation 18 | 19 | ```bash 20 | python -m pip install pycuda 21 | python -m pip install numpy 22 | python -m pip install python-decouple 23 | python -m pip install pycryptodome 24 | python -m pip install ecdsa 25 | ``` 26 | 27 | ## Configuration 28 | 29 | Configuration is done through a file called `settings.ini`. 
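The repository tracks a template named `settings.0.ini` (the real `settings.ini` is listed in `.gitignore`), so presumably you copy the template and adjust the paths for your machine. A minimal sketch, assuming that intent, on Windows:

```bash
copy settings.0.ini settings.ini
```
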
Example: 30 | 31 | ```ini 32 | [settings] 33 | CL_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64 34 | CUDA_DLL_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64 35 | ``` 36 | 37 | - CL_PATH - C++ compiler path 38 | - CUDA_DLL_PATH - CUDA Toolkit DLLs path -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vitaly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "pygeneth", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "pygeneth.py", 12 | "console": "integratedTerminal", 13 | "justMyCode": true 14 | }, 15 | { 16 | "name": "pyvanityeth", 17 | "type": "debugpy", 18 | "request": "launch", 19 | "program": "pyvanityeth.py", 20 | "console": "integratedTerminal", 21 | "justMyCode": true, 22 | "args": [ 23 | "--verbose", "--verify", 24 | "--output", "./output/l_1a2b.txt", 25 | "--prefix", "1a2b", 26 | "--blocks", "5000", 27 | "--blockSize", "256", 28 | "--blockIterations", "20" 29 | ] 30 | } 31 | ] 32 | } -------------------------------------------------------------------------------- /keccak/keccak256.cl: -------------------------------------------------------------------------------- 1 | #include "keccak256.h" 2 | 3 | #define NUM_ROUNDS 24 4 | 5 | DECLSPEC u64 rotl64(u64 x, int i) { 6 | return ((0U + x) << i) | (x >> ((64 - i) & 63)); 7 | } 8 | 9 | DECLSPEC void keccak256_absorb(PRIVATE_AS u64* state, PRIVATE_AS const int* rotation) { // u64 state[5 * 5] 10 | u8 r = 1; // LFSR 11 | for (int i = 0; i < NUM_ROUNDS; i++) { 12 | // Theta step 13 | u64 c[5] = {}; 14 | for (int x = 0; x < 5; x++) { 15 | for (int y = 0; y < 5; y++) 16 | c[x] ^= state[x + y + (y << 2)]; // x * 5 + y 17 | } 18 | for (int x = 0; x < 5; x++) { 19 | u64 d = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1); 20 | for (int y = 0; y < 5; y++) 21 | state[x + y + (y << 2)] ^= d; 22 | } 23 | // Rho and pi steps 24 | u64 b[5][5]; 25 | for (int x = 0; x < 5; x++) { 26 | for (int y = 0; y < 5; y++) 27 | b[y][(x * 2 + y * 3) % 5] = rotl64(state[x + y + (y << 2)], rotation[(x << 2) + x + y]); 28 | } 29 | // Chi step 30 | for (int x = 0; x < 5; x++) { 31 | for (int y = 0; y < 5; y++) 32 | state[x + y + (y << 2)] = b[x][y] ^ (~b[(x + 1) % 5][y] & b[(x + 2) % 5][y]); 33 | } 34 | // Iota step 35 | for (int j = 0; j < 7; j++) { 36 | state[0] ^= (u64)(r & 1) << ((1 << j) - 1); 37 | r = (u8)((r << 1) ^ ((r >> 7) * 0x171)); 38 | } 39 | } 40 | } 41 | 42 | DECLSPEC void keccak256_update_state(PRIVATE_AS u64* state, PRIVATE_AS const u8* msg, PRIVATE_AS const u32 len) // u64 state[5 * 5] 43 | { 44 | const int rotation[25] = { 45 | 0, 36, 3, 41, 18, 46 | 1, 44, 10, 45, 2, 47 | 62, 6, 43, 15, 61, 48 | 28, 55, 25, 21, 56, 49 | 27, 20, 39, 8, 14 50 | }; 51 | u32 blockOff = 0; 52 | for (u32 i = 0; i < len; i++) { 53 | u32 j = blockOff >> 3; 54 | u32 xj = j % 5; 55 | u32 yj = j / 5; 56 | state[xj + yj + (yj << 2)] ^= (u64)(msg[i]) << ((blockOff & 7) << 3); 57 | blockOff++; 58 | if (blockOff == KECCAK256_BLOCKSIZE) { 59 | keccak256_absorb(state, rotation); 60 | blockOff = 0; 61 | } 62 | } 63 | // Final block and padding 64 | { 65 | int i = blockOff >> 3; 66 | u32 xi = i % 5; 67 | u32 yi = i / 5; 68 | state[xi + yi + (yi << 2)] ^= UINT64_C(0x01) << ((blockOff & 7) << 3); 69 | blockOff = KECCAK256_BLOCKSIZE - 1; 70 | int j = blockOff >> 3; 71 | u32 xj = j % 5; 72 | u32 yj = j / 5; 73 | state[xj + yj + (yj << 2)] ^= UINT64_C(0x80) << ((blockOff & 7) << 3); 74 | keccak256_absorb(state, rotation); 75 | } 76 | } 77 | 78 | DECLSPEC void keccak256_get_hash(GLOBAL_AS u8* r, GLOBAL_AS const u8* msg, GLOBAL_AS const u32 len) 79 | { 80 | u64 state[25] = {}; 81 | keccak256_update_state(state, (u8*)msg, len); 82 | // Uint64 array to bytes in little endian 83 | for (int i = 0; i < KECCAK256_HASH_LEN; i++) { 84 | int j = i >> 3; 85 | u32 xj = j % 5; 86 | u32 yj = j / 
5; 87 | r[i] = (u8)(state[xj + yj + (yj << 2)] >> ((i & 7) << 3)); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /kernels/gen_pub_key.cl: -------------------------------------------------------------------------------- 1 | // @author: Vitaly | github.com/optinsoft 2 | 3 | // little endian to big endian 4 | DECLSPEC u32 l2be(u32 x) { 5 | return (x & 0xff) << 24 | (x & 0xff00) << 8 | (x & 0xff0000) >> 8 | (x & 0xff000000) >> 24; 6 | } 7 | 8 | __global__ void genPubKey( 9 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, 10 | GLOBAL_AS u32 *r4, GLOBAL_AS u32 *r5, GLOBAL_AS u32 *r6, GLOBAL_AS u32 *r7, 11 | GLOBAL_AS u32 *r8, GLOBAL_AS u32 *r9, GLOBAL_AS u32 *r10, GLOBAL_AS u32 *r11, 12 | GLOBAL_AS u32 *r12, GLOBAL_AS u32 *r13, GLOBAL_AS u32 *r14, GLOBAL_AS u32 *r15, 13 | GLOBAL_AS u32* h0, GLOBAL_AS u32* h1, GLOBAL_AS u32* h2, GLOBAL_AS u32* h3, 14 | GLOBAL_AS u32* h4, GLOBAL_AS u32* h5, GLOBAL_AS u32* h6, GLOBAL_AS u32* h7, 15 | GLOBAL_AS const u32 *k0, GLOBAL_AS const u32 *k1, GLOBAL_AS const u32 *k2, GLOBAL_AS const u32 *k3, 16 | GLOBAL_AS const u32 *k4, GLOBAL_AS const u32 *k5, GLOBAL_AS const u32 *k6, GLOBAL_AS const u32 *k7) 17 | { 18 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 19 | u32 k_local[PRIVATE_KEY_LENGTH]; 20 | secp256k1_t g_xy_local; 21 | u32 return_value; 22 | 23 | int i = threadIdx.x; 24 | 25 | g_local[0] = SECP256K1_G_STRING0; 26 | g_local[1] = SECP256K1_G_STRING1; 27 | g_local[2] = SECP256K1_G_STRING2; 28 | g_local[3] = SECP256K1_G_STRING3; 29 | g_local[4] = SECP256K1_G_STRING4; 30 | g_local[5] = SECP256K1_G_STRING5; 31 | g_local[6] = SECP256K1_G_STRING6; 32 | g_local[7] = SECP256K1_G_STRING7; 33 | g_local[8] = SECP256K1_G_STRING8; 34 | 35 | // global to local 36 | k_local[7] = k0[i]; 37 | k_local[6] = k1[i]; 38 | k_local[5] = k2[i]; 39 | k_local[4] = k3[i]; 40 | k_local[3] = k4[i]; 41 | k_local[2] = k5[i]; 42 | k_local[1] = k6[i]; 43 | k_local[0] = k7[i]; 44 | 45 | return_value = parse_public(&g_xy_local, g_local); 46 | if (return_value != 0) { 47 | return; 48 | } 49 | 50 | u32 x[8]; 51 | u32 y[8]; 52 | point_mul_xy (x, y, k_local, &g_xy_local); 53 | 54 | // local to global 55 | r7[i] = x[0]; 56 | r6[i] = x[1]; 57 | r5[i] = x[2]; 58 | r4[i] = x[3]; 59 | r3[i] = x[4]; 60 | r2[i] = x[5]; 61 | r1[i] = x[6]; 62 | r0[i] = x[7]; 63 | r15[i] = y[0]; 64 | r14[i] = y[1]; 65 | r13[i] = y[2]; 66 | r12[i] = y[3]; 67 | r11[i] = y[4]; 68 | r10[i] = y[5]; 69 | r9[i] = y[6]; 70 | r8[i] = y[7]; 71 | 72 | // keccak256 73 | u64 keccak_state[KECCAK256_STATE_LEN] = {}; 74 | u32 w[16]; 75 | 76 | w[7] = l2be(x[0]); 77 | w[6] = l2be(x[1]); 78 | w[5] = l2be(x[2]); 79 | w[4] = l2be(x[3]); 80 | w[3] = l2be(x[4]); 81 | w[2] = l2be(x[5]); 82 | w[1] = l2be(x[6]); 83 | w[0] = l2be(x[7]); 84 | w[15] = l2be(y[0]); 85 | w[14] = l2be(y[1]); 86 | w[13] = l2be(y[2]); 87 | w[12] = l2be(y[3]); 88 | w[11] = l2be(y[4]); 89 | w[10] = l2be(y[5]); 90 | w[9] = l2be(y[6]); 91 | w[8] = l2be(y[7]); 92 | 93 | keccak256_update_state(keccak_state, (u8*)w, 64); 94 | 95 | h0[i] = l2be((u32)keccak_state[0]); 96 | h1[i] = l2be((u32)(keccak_state[0] >> 32)); 97 | h2[i] = l2be((u32)keccak_state[1]); 98 | h3[i] = l2be((u32)(keccak_state[1] >> 32)); 99 | h4[i] = l2be((u32)keccak_state[2]); 100 | h5[i] = l2be((u32)(keccak_state[2] >> 32)); 101 | h6[i] = l2be((u32)keccak_state[3]); 102 | h7[i] = l2be((u32)(keccak_state[3] >> 32)); 103 | } -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | output/ 2 | temp/ 3 | settings.ini 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | kernel.cl 167 | -------------------------------------------------------------------------------- /secp256k1/inc_vendor.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_VENDOR_H 7 | #define INC_VENDOR_H 8 | 9 | #if defined HC_CPU_OPENCL_EMU_H 10 | #define IS_NATIVE 11 | #elif defined __CUDACC__ 12 | #define IS_CUDA 13 | #elif defined __HIPCC__ 14 | #define IS_HIP 15 | #elif defined __METAL__ || defined __METAL_MACOS__ 16 | #define IS_METAL 17 | #else 18 | #define IS_OPENCL 19 | #endif 20 | 21 | #if defined IS_METAL 22 | #include 23 | 24 | using namespace metal; 25 | #endif 26 | 27 | #if defined IS_NATIVE 28 | #define CONSTANT_VK 29 | #define CONSTANT_AS 30 | #define GLOBAL_AS 31 | #define LOCAL_VK 32 | #define LOCAL_AS 33 | #define PRIVATE_AS 34 | #define KERNEL_FQ 35 | #elif defined IS_CUDA 36 | #define CONSTANT_VK __constant__ 37 | #define CONSTANT_AS 38 | #define GLOBAL_AS 39 | #define LOCAL_VK __shared__ 40 | #define LOCAL_AS 41 | #define PRIVATE_AS 42 | #define KERNEL_FQ extern "C" __global__ 43 | #elif defined IS_HIP 44 | #define CONSTANT_VK __constant__ 45 | #define CONSTANT_AS 46 | #define GLOBAL_AS 47 | #define LOCAL_VK __shared__ 48 | #define LOCAL_AS 49 | #define PRIVATE_AS 50 | #define KERNEL_FQ extern "C" __global__ 51 | #elif defined IS_METAL 52 | #define CONSTANT_VK constant 53 | #define CONSTANT_AS constant 54 | #define GLOBAL_AS device 55 | #define LOCAL_VK threadgroup 56 | #define LOCAL_AS threadgroup 57 | #define PRIVATE_AS thread 58 | #define KERNEL_FQ kernel 59 | #elif defined IS_OPENCL 60 | #define CONSTANT_VK __constant 61 | #define CONSTANT_AS __constant 62 | #define GLOBAL_AS __global 63 | #define LOCAL_VK __local 64 | #define LOCAL_AS __local 65 | #define PRIVATE_AS 66 | #define KERNEL_FQ __kernel 67 | #endif 68 | 69 | #ifndef MAYBE_UNUSED 70 | #define MAYBE_UNUSED 71 | #endif 72 | 73 | /** 74 | * device type 75 | */ 76 | 77 | #define DEVICE_TYPE_CPU 2 78 | #define DEVICE_TYPE_GPU 4 79 | #define DEVICE_TYPE_ACCEL 8 80 | 81 | #if DEVICE_TYPE == DEVICE_TYPE_CPU 82 | #define IS_CPU 83 | #elif DEVICE_TYPE == DEVICE_TYPE_GPU 84 | #define IS_GPU 85 | #elif DEVICE_TYPE == DEVICE_TYPE_ACCEL 86 | #define IS_ACCEL 87 | #endif 88 | 89 | /** 90 | * vendor specific 91 | */ 92 | 93 | #if VENDOR_ID == (1 << 
0) 94 | #define IS_AMD 95 | #elif VENDOR_ID == (1 << 1) 96 | #define IS_APPLE 97 | #define IS_GENERIC 98 | #elif VENDOR_ID == (1 << 2) 99 | #define IS_INTEL_BEIGNET 100 | #define IS_GENERIC 101 | #elif VENDOR_ID == (1 << 3) 102 | #define IS_INTEL_SDK 103 | #define IS_GENERIC 104 | #elif VENDOR_ID == (1 << 4) 105 | #define IS_MESA 106 | #define IS_GENERIC 107 | #elif VENDOR_ID == (1 << 5) 108 | #define IS_NV 109 | #elif VENDOR_ID == (1 << 6) 110 | #define IS_POCL 111 | #define IS_GENERIC 112 | #elif VENDOR_ID == (1 << 8) 113 | #define IS_AMD_USE_HIP 114 | #else 115 | #define IS_GENERIC 116 | #endif 117 | 118 | #if defined IS_AMD && HAS_VPERM == 1 119 | #define IS_ROCM 120 | #endif 121 | 122 | #define LOCAL_MEM_TYPE_LOCAL 1 123 | #define LOCAL_MEM_TYPE_GLOBAL 2 124 | 125 | #if LOCAL_MEM_TYPE == LOCAL_MEM_TYPE_LOCAL 126 | #define REAL_SHM 127 | #endif 128 | 129 | // So far, only used by -m 22100 and only affects NVIDIA on OpenCL. CUDA seems to work fine. 130 | #ifdef FORCE_DISABLE_SHM 131 | #undef REAL_SHM 132 | #endif 133 | 134 | #ifdef REAL_SHM 135 | #define SHM_TYPE LOCAL_AS 136 | #else 137 | #define SHM_TYPE CONSTANT_AS 138 | #endif 139 | 140 | /** 141 | * function declarations can have a large influence depending on the opencl runtime 142 | * fast but pure kernels on rocm is a good example 143 | */ 144 | 145 | #ifdef NO_INLINE 146 | #define HC_INLINE 147 | #else 148 | #define HC_INLINE inline static 149 | #endif 150 | 151 | #if defined IS_AMD && defined IS_GPU 152 | #define DECLSPEC HC_INLINE 153 | #elif defined IS_HIP 154 | #define DECLSPEC __device__ HC_INLINE 155 | #else 156 | #define DECLSPEC __device__ 157 | #endif 158 | 159 | /** 160 | * AMD specific 161 | */ 162 | 163 | #ifdef IS_AMD 164 | #if defined(cl_amd_media_ops) 165 | #pragma OPENCL EXTENSION cl_amd_media_ops : enable 166 | #endif 167 | #if defined(cl_amd_media_ops2) 168 | #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable 169 | #endif 170 | #endif 171 | 172 | // Whitelist some OpenCL specific functions 173 | // This could create more stable kernels on systems with bad OpenCL drivers 174 | 175 | #ifdef IS_CUDA 176 | #define USE_BITSELECT 177 | #define USE_ROTATE 178 | #endif 179 | 180 | #ifdef IS_HIP 181 | #define USE_BITSELECT 182 | #define USE_ROTATE 183 | #endif 184 | 185 | #ifdef IS_ROCM 186 | #define USE_BITSELECT 187 | #define USE_ROTATE 188 | #endif 189 | 190 | #ifdef IS_INTEL_SDK 191 | #ifdef IS_CPU 192 | //#define USE_BITSELECT 193 | //#define USE_ROTATE 194 | #endif 195 | #endif 196 | 197 | #ifdef IS_OPENCL 198 | //#define USE_BITSELECT 199 | //#define USE_ROTATE 200 | //#define USE_SWIZZLE 201 | #endif 202 | 203 | #ifdef IS_METAL 204 | #define USE_ROTATE 205 | 206 | // Metal support max VECT_SIZE = 4 207 | #define s0 x 208 | #define s1 y 209 | #define s2 z 210 | #define s3 w 211 | #endif 212 | 213 | #endif // INC_VENDOR_H 214 | -------------------------------------------------------------------------------- /kernels/gen_eth_addr.cl: -------------------------------------------------------------------------------- 1 | // @author: Vitaly | github.com/optinsoft 2 | 3 | // little endian to big endian 4 | DECLSPEC u32 l2be(u32 x) { 5 | return (x & 0xff) << 24 | (x & 0xff00) << 8 | (x & 0xff0000) >> 8 | (x & 0xff000000) >> 24; 6 | } 7 | 8 | __global__ void genEthAddress( 9 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, GLOBAL_AS u32 *r4, 10 | GLOBAL_AS const u32 *k0, GLOBAL_AS const u32 *k1, GLOBAL_AS const u32 *k2, GLOBAL_AS const u32 *k3, 11 | GLOBAL_AS const u32 *k4, GLOBAL_AS const 
u32 *k5, GLOBAL_AS const u32 *k6, GLOBAL_AS const u32 *k7) 12 | { 13 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 14 | u32 k_local[PRIVATE_KEY_LENGTH]; 15 | secp256k1_t g_xy_local; 16 | u32 return_value; 17 | 18 | int i = threadIdx.x; 19 | 20 | g_local[0] = SECP256K1_G_STRING0; 21 | g_local[1] = SECP256K1_G_STRING1; 22 | g_local[2] = SECP256K1_G_STRING2; 23 | g_local[3] = SECP256K1_G_STRING3; 24 | g_local[4] = SECP256K1_G_STRING4; 25 | g_local[5] = SECP256K1_G_STRING5; 26 | g_local[6] = SECP256K1_G_STRING6; 27 | g_local[7] = SECP256K1_G_STRING7; 28 | g_local[8] = SECP256K1_G_STRING8; 29 | 30 | // global to local 31 | k_local[7] = k0[i]; 32 | k_local[6] = k1[i]; 33 | k_local[5] = k2[i]; 34 | k_local[4] = k3[i]; 35 | k_local[3] = k4[i]; 36 | k_local[2] = k5[i]; 37 | k_local[1] = k6[i]; 38 | k_local[0] = k7[i]; 39 | 40 | return_value = parse_public(&g_xy_local, g_local); 41 | if (return_value != 0) { 42 | return; 43 | } 44 | 45 | u32 x[8]; 46 | u32 y[8]; 47 | point_mul_xy (x, y, k_local, &g_xy_local); 48 | 49 | // keccak256 50 | u64 keccak_state[KECCAK256_STATE_LEN] = {}; 51 | u32 w[16]; 52 | 53 | w[7] = l2be(x[0]); 54 | w[6] = l2be(x[1]); 55 | w[5] = l2be(x[2]); 56 | w[4] = l2be(x[3]); 57 | w[3] = l2be(x[4]); 58 | w[2] = l2be(x[5]); 59 | w[1] = l2be(x[6]); 60 | w[0] = l2be(x[7]); 61 | w[15] = l2be(y[0]); 62 | w[14] = l2be(y[1]); 63 | w[13] = l2be(y[2]); 64 | w[12] = l2be(y[3]); 65 | w[11] = l2be(y[4]); 66 | w[10] = l2be(y[5]); 67 | w[9] = l2be(y[6]); 68 | w[8] = l2be(y[7]); 69 | 70 | keccak256_update_state(keccak_state, (u8*)w, 64); 71 | 72 | r0[i] = l2be((u32)(keccak_state[1] >> 32)); 73 | r1[i] = l2be((u32)keccak_state[2]); 74 | r2[i] = l2be((u32)(keccak_state[2] >> 32)); 75 | r3[i] = l2be((u32)keccak_state[3]); 76 | r4[i] = l2be((u32)(keccak_state[3] >> 32)); 77 | } 78 | 79 | __global__ void genEthAddressWithPrefix( 80 | GLOBAL_AS u32 *r0, GLOBAL_AS u32 *r1, GLOBAL_AS u32 *r2, GLOBAL_AS u32 *r3, GLOBAL_AS u32 *r4, GLOBAL_AS u32 *rp, 81 | GLOBAL_AS u32 *k0, GLOBAL_AS u32 *k1, GLOBAL_AS u32 *k2, GLOBAL_AS u32 *k3, 82 | GLOBAL_AS u32 *k4, GLOBAL_AS u32 *k5, GLOBAL_AS u32 *k6, GLOBAL_AS u32 *k7, 83 | GLOBAL_AS const u32 p[5], GLOBAL_AS const u32 plen, GLOBAL_AS const u32 n) 84 | { 85 | u32 g_local[PUBLIC_KEY_LENGTH_WITH_PARITY]; 86 | u32 k_local[PRIVATE_KEY_LENGTH]; 87 | secp256k1_t g_xy_local; 88 | u32 return_value; 89 | u32 p_local[5]; 90 | u32 m_local[5]; 91 | u32 r_local[5]; 92 | u32 rp_local = 0; 93 | 94 | int i = threadIdx.x; 95 | 96 | // global to local 97 | k_local[7] = k0[i]; 98 | k_local[6] = k1[i]; 99 | k_local[5] = k2[i]; 100 | k_local[4] = k3[i]; 101 | k_local[3] = k4[i]; 102 | k_local[2] = k5[i]; 103 | k_local[1] = k6[i]; 104 | k_local[0] = k7[i]; 105 | 106 | u32 l = plen; 107 | m_local[0] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 108 | l = (l >= 4) ? l-4 : 0; 109 | p_local[0] = p[0] & m_local[0]; 110 | m_local[1] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 111 | l = (l >= 4) ? l-4 : 0; 112 | p_local[1] = p[1] & m_local[1]; 113 | m_local[2] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 114 | l = (l >= 4) ? l-4 : 0; 115 | p_local[2] = p[2] & m_local[2]; 116 | m_local[3] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 117 | l = (l >= 4) ? l-4 : 0; 118 | p_local[3] = p[3] & m_local[3]; 119 | m_local[4] = (l >= 4) ? 0xffffffff : 0xffffffff << ((4-l) << 3); 120 | l = (l >= 4) ? l-4 : 0; 121 | p_local[4] = p[4] & m_local[4]; 122 | 123 | u32 n_local = n > 0 ? 
n : 1; 124 | 125 | u32 x[8]; 126 | u32 y[8]; 127 | u32 w[16]; 128 | u32 ni = 0; 129 | 130 | while (1) { 131 | 132 | g_local[0] = SECP256K1_G_STRING0; 133 | g_local[1] = SECP256K1_G_STRING1; 134 | g_local[2] = SECP256K1_G_STRING2; 135 | g_local[3] = SECP256K1_G_STRING3; 136 | g_local[4] = SECP256K1_G_STRING4; 137 | g_local[5] = SECP256K1_G_STRING5; 138 | g_local[6] = SECP256K1_G_STRING6; 139 | g_local[7] = SECP256K1_G_STRING7; 140 | g_local[8] = SECP256K1_G_STRING8; 141 | 142 | return_value = parse_public(&g_xy_local, g_local); 143 | if (return_value != 0) { 144 | return; 145 | } 146 | 147 | point_mul_xy (x, y, k_local, &g_xy_local); 148 | 149 | // keccak256 150 | u64 keccak_state[KECCAK256_STATE_LEN] = {0}; 151 | 152 | w[7] = l2be(x[0]); 153 | w[6] = l2be(x[1]); 154 | w[5] = l2be(x[2]); 155 | w[4] = l2be(x[3]); 156 | w[3] = l2be(x[4]); 157 | w[2] = l2be(x[5]); 158 | w[1] = l2be(x[6]); 159 | w[0] = l2be(x[7]); 160 | w[15] = l2be(y[0]); 161 | w[14] = l2be(y[1]); 162 | w[13] = l2be(y[2]); 163 | w[12] = l2be(y[3]); 164 | w[11] = l2be(y[4]); 165 | w[10] = l2be(y[5]); 166 | w[9] = l2be(y[6]); 167 | w[8] = l2be(y[7]); 168 | 169 | keccak256_update_state(keccak_state, (u8*)w, 64); 170 | 171 | ni++; 172 | 173 | r_local[0] = l2be((u32)(keccak_state[1] >> 32)); 174 | r_local[1] = l2be((u32)keccak_state[2]); 175 | r_local[2] = l2be((u32)(keccak_state[2] >> 32)); 176 | r_local[3] = l2be((u32)keccak_state[3]); 177 | r_local[4] = l2be((u32)(keccak_state[3] >> 32)); 178 | rp_local = (((r_local[0] & m_local[0]) == p_local[0]) && 179 | ((r_local[1] & m_local[1]) == p_local[1]) && 180 | ((r_local[2] & m_local[2]) == p_local[2]) && 181 | ((r_local[3] & m_local[3]) == p_local[3]) && 182 | ((r_local[4] & m_local[4]) == p_local[4])) 183 | ? ni : 0; 184 | 185 | if (ni >= n_local || rp_local) break; 186 | 187 | k_local[(ni & 7)] += 479001599; 188 | } 189 | 190 | //save results 191 | r0[i] = r_local[0]; 192 | r1[i] = r_local[1]; 193 | r2[i] = r_local[2]; 194 | r3[i] = r_local[3]; 195 | r4[i] = r_local[4]; 196 | rp[i] = rp_local; 197 | 198 | k0[i] = k_local[7]; 199 | k1[i] = k_local[6]; 200 | k2[i] = k_local[5]; 201 | k3[i] = k_local[4]; 202 | k4[i] = k_local[3]; 203 | k5[i] = k_local[2]; 204 | k6[i] = k_local[1]; 205 | k7[i] = k_local[0]; 206 | } -------------------------------------------------------------------------------- /pygeneth.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Vitaly | github.com/optinsoft 3 | """ 4 | import pathutils 5 | import pycuda.driver as cuda 6 | from pycuda.compiler import SourceModule 7 | import pycuda.gpuarray as gpuarray 8 | import pycuda.autoinit 9 | import numpy as np 10 | from decouple import config 11 | import os 12 | from functools import reduce 13 | import ecdsa 14 | from Crypto.Hash import keccak 15 | 16 | def randomUInt32() -> int: 17 | return int.from_bytes(np.random.bytes(4), byteorder='little', signed=False) 18 | 19 | ''' 20 | test private key: 0x68e23530deb6d5011ab56d8ad9f7b4a3b424f1112f08606357497495929f72dc 21 | test public key: 0x5d99d81d9e731e0d7eebd1c858b1155da7981b1f0a16d322a361f8b589ad2e3bde53dc614e3a84164dab3f5899abde3b09553dca10c9716fa623a5942b9ea420 22 | test keccak256: 0x4c84817f57c18372837905af33f4b63eb1c5a9966a31cebc302f563685695506 23 | test eth address: 0x33f4b63eb1c5a9966a31cebc302f563685695506 24 | ''' 25 | 26 | def testUInt32(idx: int) -> int: 27 | r = [0x68e23530, 0xdeb6d501, 0x1ab56d8a, 0xd9f7b4a3, 0xb424f111, 0x2f086063, 0x57497495, 0x929f72dc][idx] 28 | return r 29 | 30 | def 
randomUInt32Array(count: int) -> list[int]: 31 | return [randomUInt32() for i in range(count)] 32 | 33 | def randomWithTestUInt32Array(count: int, idx: int) -> list[int]: 34 | return [testUInt32(idx) if i == 0 else randomUInt32() for i in range(count)] 35 | 36 | def constUInt32Array(count: int, v: int) -> list[int]: 37 | return [v for i in range(count)] 38 | 39 | def public_key_to_address(public_key, i, print_keccak): 40 | keccak_hash = keccak.new(digest_bits=256) 41 | keccak_hash.update(public_key) 42 | keccak_digest = keccak_hash.digest() 43 | if print_keccak: 44 | print(f'keccak[{i}] (verification): 0x{keccak_digest.hex()}') 45 | address = '0x' + keccak_digest[-20:].hex() 46 | return address 47 | 48 | def key_to_hex(k: list[int]) -> str: 49 | return reduce(lambda s, t: str(s) + t.to_bytes(4, byteorder='big').hex(), k[1:], k[0].to_bytes(4, byteorder='big').hex()) 50 | 51 | def main_genPubKey(keyCount: int, verify: bool): 52 | kernel_code = ''' 53 | 54 | ''' 55 | def load_code(path: str) -> str: 56 | with open(path, 'r') as text_file: 57 | code_text = text_file.read() 58 | lines = code_text.splitlines() 59 | result = reduce(lambda t, l: 60 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 61 | lines, '') 62 | return result 63 | dirSecp256k1 = './secp256k1/' 64 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 65 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 66 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 67 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 68 | dirKeccak = './keccak/' 69 | kernel_code += load_code(dirKeccak + 'keccak256.h') 70 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 71 | dirKernels = './kernels/' 72 | kernel_code += load_code(dirKernels + 'gen_pub_key.cl') 73 | 74 | # with open('./kernel.cl', 'w') as f: 75 | # f.write(kernel_code) 76 | 77 | k = [np.array(randomUInt32Array(keyCount), dtype=np.uint32) for i in range(8)] 78 | xy = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(16)] 79 | h = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(8)] 80 | 81 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 82 | xy_gpu = [gpuarray.to_gpu(xy[i]) for i in range(16)] 83 | h_gpu = [gpuarray.to_gpu(h[i]) for i in range(8)] 84 | 85 | mod = SourceModule(kernel_code) 86 | genPubKey = mod.get_function('genPubKey') 87 | 88 | genPubKey(xy_gpu[0], xy_gpu[1], xy_gpu[2], xy_gpu[3], xy_gpu[4], xy_gpu[5], xy_gpu[6], xy_gpu[7], 89 | xy_gpu[8], xy_gpu[9], xy_gpu[10], xy_gpu[11], xy_gpu[12], xy_gpu[13], xy_gpu[14], xy_gpu[15], 90 | h_gpu[0], h_gpu[1], h_gpu[2], h_gpu[3], h_gpu[4], h_gpu[5], h_gpu[6], h_gpu[7], 91 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 92 | block=(keyCount, 1, 1)) 93 | 94 | for i in range(keyCount): 95 | # print(f'--- [{i}] ---') 96 | _k = [k_gpu[j][i].get().item() for j in range(8)] 97 | priv = key_to_hex(_k) 98 | print(f"priv[{i}]: 0x{priv}") 99 | xy = [xy_gpu[j][i].get().item() for j in range(16)] 100 | pub = key_to_hex(xy) 101 | print(f"pub[{i}]: 0x{pub}") 102 | _h = [h_gpu[j][i].get().item() for j in range(8)] 103 | keccak = key_to_hex(_h) 104 | print(f"keccak[{i}]: 0x{keccak}") 105 | if verify: 106 | pk_bytes = bytes.fromhex(priv) 107 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 108 | print(f"public Key[{i}] (verification): 0x{public_key.hex()}") 109 | address = public_key_to_address(public_key, i, True) 110 | # print(f"Address[{i}]: 
{address}") 111 | 112 | def main_genEthAddress(keyCount: int, verify: bool): 113 | kernel_code = ''' 114 | 115 | ''' 116 | def load_code(path: str) -> str: 117 | with open(path, 'r') as text_file: 118 | code_text = text_file.read() 119 | lines = code_text.splitlines() 120 | result = reduce(lambda t, l: 121 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 122 | lines, '') 123 | return result 124 | dirSecp256k1 = './secp256k1/' 125 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 126 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 127 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 128 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 129 | dirKeccak = './keccak/' 130 | kernel_code += load_code(dirKeccak + 'keccak256.h') 131 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 132 | dirKernels = './kernels/' 133 | kernel_code += load_code(dirKernels + 'gen_eth_addr.cl') 134 | 135 | # with open('./kernel.cl', 'w') as f: 136 | # f.write(kernel_code) 137 | 138 | k = [np.array(randomUInt32Array(keyCount), dtype=np.uint32) for i in range(8)] 139 | a = [np.array(constUInt32Array(keyCount, 0), dtype=np.uint32) for i in range(5)] 140 | 141 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 142 | a_gpu = [gpuarray.to_gpu(a[i]) for i in range(5)] 143 | 144 | mod = SourceModule(kernel_code) 145 | genEthAddress = mod.get_function('genEthAddress') 146 | 147 | genEthAddress( 148 | a_gpu[0], a_gpu[1], a_gpu[2], a_gpu[3], a_gpu[4], 149 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 150 | block=(keyCount, 1, 1)) 151 | 152 | for i in range(keyCount): 153 | # print(f'--- [{i}] ---') 154 | _k = [k_gpu[j][i].get().item() for j in range(8)] 155 | priv = key_to_hex(_k) 156 | if verify: 157 | print(f"priv[{i}]: 0x{priv}") 158 | _a = [a_gpu[j][i].get().item() for j in range(5)] 159 | eth_address = key_to_hex(_a) 160 | if verify: 161 | print(f"eth address[{i}]: 0x{eth_address}") 162 | pk_bytes = bytes.fromhex(priv) 163 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 164 | address = public_key_to_address(public_key, i, False) 165 | print(f"eth address[{i}] (verification): {address}") 166 | else: 167 | print(f"0x{priv},0x{eth_address}") 168 | 169 | if __name__ == "__main__": 170 | # main_genPubKey(keyCount=32, verify=True) 171 | main_genEthAddress(keyCount=32, verify=True) -------------------------------------------------------------------------------- /pyvanityeth.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Vitaly | github.com/optinsoft 3 | """ 4 | import pathutils 5 | import pycuda.driver as cuda 6 | from pycuda.compiler import SourceModule 7 | import pycuda.gpuarray as gpuarray 8 | import pycuda.autoinit 9 | import numpy as np 10 | from decouple import config 11 | import os 12 | from functools import reduce 13 | import ecdsa 14 | from Crypto.Hash import keccak 15 | import argparse 16 | import time 17 | 18 | def randomUInt32() -> int: 19 | return int.from_bytes(os.urandom(4), byteorder='little', signed=False) 20 | 21 | def randomUInt32Array(count: int) -> list[int]: 22 | return [randomUInt32() for i in range(count)] 23 | 24 | def constUInt32Array(count: int, v: int) -> list[int]: 25 | return [v for i in range(count)] 26 | 27 | def prefixUInt32(prefixBytes: bytes) -> int: 28 | pl = len(prefixBytes) 29 | p = [prefixBytes[i] if i < pl else 0 for i in range(4)] 30 | return int.from_bytes(p, 
byteorder='big', signed=False) 31 | 32 | def prefixUInt32Array(prefixBytes: bytes) -> list[int]: 33 | return [prefixUInt32(prefixBytes[i*4:i*4+4]) for i in range(5)] 34 | 35 | def public_key_to_address(public_key, i, print_keccak): 36 | keccak_hash = keccak.new(digest_bits=256) 37 | keccak_hash.update(public_key) 38 | keccak_digest = keccak_hash.digest() 39 | if print_keccak: 40 | print(f'Keccak[{i}] (verification): 0x{keccak_digest.hex()}') 41 | address = '0x' + keccak_digest[-20:].hex() 42 | return address 43 | 44 | def key_to_hex(k: list[int]) -> str: 45 | return reduce(lambda s, t: str(s) + t.to_bytes(4, byteorder='big').hex(), k[1:], k[0].to_bytes(4, byteorder='big').hex()) 46 | 47 | def main_vanityEthAddress(prefixBytes: bytes, keyBlockCount: int, maxBlocks: int, blockIterations: int, verify: bool, verbose: bool, outputFile: str) -> int: 48 | kernel_code = ''' 49 | 50 | ''' 51 | def load_code(path: str) -> str: 52 | with open(path, 'r') as text_file: 53 | code_text = text_file.read() 54 | lines = code_text.splitlines() 55 | result = reduce(lambda t, l: 56 | t + "\n" + l if len(l) > 0 and not l.startswith('#include ') else t, 57 | lines, '') 58 | return result 59 | dirSecp256k1 = './secp256k1/' 60 | kernel_code += load_code(dirSecp256k1 + 'inc_vendor.h') 61 | kernel_code += load_code(dirSecp256k1 + 'inc_types.h') 62 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.h') 63 | kernel_code += load_code(dirSecp256k1 + 'inc_ecc_secp256k1.cl') 64 | dirKeccak = './keccak/' 65 | kernel_code += load_code(dirKeccak + 'keccak256.h') 66 | kernel_code += load_code(dirKeccak + 'keccak256.cl') 67 | dirKernels = './kernels/' 68 | kernel_code += load_code(dirKernels + 'gen_eth_addr.cl') 69 | 70 | # with open('./kernel.cl', 'w') as f: 71 | # f.write(kernel_code) 72 | 73 | if verbose: 74 | print("Building kernel...") 75 | 76 | mod = SourceModule(kernel_code) 77 | genEthAddressWithPrefix = mod.get_function('genEthAddressWithPrefix') 78 | 79 | prefix = prefixBytes.hex() 80 | 81 | if verbose: 82 | print(f'Searching vanity address with prefix "{prefix}"...') 83 | 84 | start_time = time.time() 85 | 86 | a = [np.array(constUInt32Array(keyBlockCount, 0), dtype=np.uint32) for i in range(5)] 87 | a_gpu = [gpuarray.to_gpu(a[i]) for i in range(5)] 88 | ap_gpu = gpuarray.to_gpu(np.array(constUInt32Array(keyBlockCount, 0), dtype=np.uint32)) 89 | 90 | p = np.array(prefixUInt32Array(prefixBytes), dtype=np.uint32) 91 | p_gpu = gpuarray.to_gpu(p) 92 | p_len = np.int32(len(prefixBytes)) 93 | n_iterations = np.int32(blockIterations) 94 | 95 | for n in range(maxBlocks): 96 | k = [np.array(randomUInt32Array(keyBlockCount), dtype=np.uint32) for i in range(8)] 97 | k_gpu = [gpuarray.to_gpu(k[i]) for i in range(8)] 98 | 99 | genEthAddressWithPrefix( 100 | a_gpu[0], a_gpu[1], a_gpu[2], a_gpu[3], a_gpu[4], ap_gpu, 101 | k_gpu[0], k_gpu[1], k_gpu[2], k_gpu[3], k_gpu[4], k_gpu[5], k_gpu[6], k_gpu[7], 102 | p_gpu, p_len, n_iterations, 103 | block=(keyBlockCount, 1, 1)) 104 | 105 | for i in range(keyBlockCount): 106 | # print(f'--- [{i}] ---') 107 | _ap = ap_gpu[i].get().item() 108 | if _ap != 0: 109 | _a = [a_gpu[j][i].get().item() for j in range(5)] 110 | eth_address = '0x'+key_to_hex(_a) 111 | if eth_address.startswith('0x'+prefix): 112 | if verbose: 113 | end_time = time.time() # end time 114 | elapsed_time = end_time - start_time 115 | print(f"Vanity address found in block # {n+1} iteration # {_ap}, {elapsed_time:.2f} seconds") 116 | count = (n + 1) * keyBlockCount * (blockIterations if blockIterations > 0 else 1) 117 
| print(f"Generated {count} ethereum addresses, {count/elapsed_time:.2f} addresses/second") 118 | _k = [k_gpu[j][i].get().item() for j in range(8)] 119 | priv = key_to_hex(_k) 120 | if verify and verbose: 121 | print(f"private key[{i}]: 0x{priv}") 122 | print(f"eth address[{i}]: {eth_address}") 123 | if verify: 124 | pk_bytes = bytes.fromhex(priv) 125 | public_key = ecdsa.SigningKey.from_string(pk_bytes, curve=ecdsa.SECP256k1).verifying_key.to_string() 126 | address = public_key_to_address(public_key, i, False) 127 | if verbose: 128 | print(f"eth address[{i}] (verification): {address}") 129 | if address != eth_address: 130 | print(f"Verification failed: _as[{i}]={_ap}, eth_address[{i}]={eth_address}, verification={address}") 131 | else: 132 | print(f"0x{priv},{eth_address}") 133 | if outputFile: 134 | with open(outputFile, "a") as of: 135 | of.write(f"0x{priv},{eth_address}\n") 136 | else: 137 | print(f"0x{priv},{eth_address}") 138 | if outputFile: 139 | with open(outputFile, "a") as of: 140 | of.write(f"0x{priv},{eth_address}\n") 141 | return 1 142 | else: 143 | print(f"Unexpected result: _ap[{i}]={_ap}, eth_address[{i}]={eth_address}") 144 | if verbose: 145 | end_time = time.time() # end time 146 | elapsed_time = end_time - start_time 147 | print(f"Not found, {elapsed_time:.2f} seconds") 148 | count = maxBlocks * keyBlockCount * (blockIterations if blockIterations > 0 else 1) 149 | print(f"Generated {count} ethereum addresses, {count/elapsed_time:.2f} addresses/second") 150 | return 0 151 | 152 | def hexPrefix(s: str) -> bytes: 153 | if s.startswith('0x'): 154 | return bytes.fromhex(s[2:]) 155 | return bytes.fromhex(s) 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser(description="pyvanityeth.py") 159 | parser.add_argument('-v', '--verbose', action='store_true', help='verbose') 160 | parser.add_argument('--verify', action='store_true', help='verify found ethereum address') 161 | parser.add_argument("--prefix", required=True, type=hexPrefix, help="vanity ethereum address PREFIX (without leading 0x)") 162 | parser.add_argument("--blocks", required=False, type=int, default=1000, help="try find vanity ethereum address within BLOCKS blocks (default: 1000)") 163 | parser.add_argument("--blockSize", required=False, type=int, default=128, help="generate block of BLOCKSIZE ethereum addresses by using GPU (default: 128)") 164 | parser.add_argument("--blockIterations", required=False, type=int, default=1, help="attempts to find vanity ethereum address within each block") 165 | parser.add_argument("--output", required=False, type=str, default="", help="output found ethereum address to file") 166 | args = parser.parse_args() 167 | main_vanityEthAddress(args.prefix, args.blockSize, args.blocks, args.blockIterations, args.verify, args.verbose, args.output) 168 | -------------------------------------------------------------------------------- /secp256k1/inc_ecc_secp256k1.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_ECC_SECP256K1_H 7 | #define INC_ECC_SECP256K1_H 8 | 9 | // y^2 = x^3 + ax + b with a = 0 and b = 7 => y^2 = x^3 + 7: 10 | 11 | #define SECP256K1_B 7 12 | 13 | // finite field Fp 14 | // p = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F 15 | #define SECP256K1_P0 0xfffffc2f 16 | #define SECP256K1_P1 0xfffffffe 17 | #define SECP256K1_P2 0xffffffff 18 | #define SECP256K1_P3 0xffffffff 19 | #define 
SECP256K1_P4 0xffffffff 20 | #define SECP256K1_P5 0xffffffff 21 | #define SECP256K1_P6 0xffffffff 22 | #define SECP256K1_P7 0xffffffff 23 | 24 | // prime order N 25 | // n = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE BAAEDCE6 AF48A03B BFD25E8C D0364141 26 | #define SECP256K1_N0 0xd0364141 27 | #define SECP256K1_N1 0xbfd25e8c 28 | #define SECP256K1_N2 0xaf48a03b 29 | #define SECP256K1_N3 0xbaaedce6 30 | #define SECP256K1_N4 0xfffffffe 31 | #define SECP256K1_N5 0xffffffff 32 | #define SECP256K1_N6 0xffffffff 33 | #define SECP256K1_N7 0xffffffff 34 | 35 | // the base point G in compressed form for transform_public 36 | // G = 02 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 37 | #define SECP256K1_G_PARITY 0x00000002 38 | #define SECP256K1_G0 0x16f81798 39 | #define SECP256K1_G1 0x59f2815b 40 | #define SECP256K1_G2 0x2dce28d9 41 | #define SECP256K1_G3 0x029bfcdb 42 | #define SECP256K1_G4 0xce870b07 43 | #define SECP256K1_G5 0x55a06295 44 | #define SECP256K1_G6 0xf9dcbbac 45 | #define SECP256K1_G7 0x79be667e 46 | 47 | // the base point G in compressed form for parse_public 48 | // parity and reversed byte/char (8 bit) byte order 49 | // G = 02 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 50 | #define SECP256K1_G_STRING0 0x66be7902 51 | #define SECP256K1_G_STRING1 0xbbdcf97e 52 | #define SECP256K1_G_STRING2 0x62a055ac 53 | #define SECP256K1_G_STRING3 0x0b87ce95 54 | #define SECP256K1_G_STRING4 0xfc9b0207 55 | #define SECP256K1_G_STRING5 0x28ce2ddb 56 | #define SECP256K1_G_STRING6 0x81f259d9 57 | #define SECP256K1_G_STRING7 0x17f8165b 58 | #define SECP256K1_G_STRING8 0x00000098 59 | 60 | // pre computed values, can be verified using private keys for 61 | // x1 is the same as the basepoint g 62 | // x1 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU73sVHnoWn 63 | // x3 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU74sHUHy8S 64 | // x5 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU75s2EPgZf 65 | // x7 WIF: KwDiBf89QgGbjEhKnhXJuH7LrciVrZi3qYjgd9M7rFU76rnZwVdz 66 | 67 | // x1: 79BE667E F9DCBBAC 55A06295 CE870B07 029BFCDB 2DCE28D9 59F2815B 16F81798 68 | // x1: 79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798 69 | #define SECP256K1_G_PRE_COMPUTED_00 0x16f81798 70 | #define SECP256K1_G_PRE_COMPUTED_01 0x59f2815b 71 | #define SECP256K1_G_PRE_COMPUTED_02 0x2dce28d9 72 | #define SECP256K1_G_PRE_COMPUTED_03 0x029bfcdb 73 | #define SECP256K1_G_PRE_COMPUTED_04 0xce870b07 74 | #define SECP256K1_G_PRE_COMPUTED_05 0x55a06295 75 | #define SECP256K1_G_PRE_COMPUTED_06 0xf9dcbbac 76 | #define SECP256K1_G_PRE_COMPUTED_07 0x79be667e 77 | 78 | // y1: 483ADA77 26A3C465 5DA4FBFC 0E1108A8 FD17B448 A6855419 9C47D08F FB10D4B8 79 | // y1: 483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8 80 | #define SECP256K1_G_PRE_COMPUTED_08 0xfb10d4b8 81 | #define SECP256K1_G_PRE_COMPUTED_09 0x9c47d08f 82 | #define SECP256K1_G_PRE_COMPUTED_10 0xa6855419 83 | #define SECP256K1_G_PRE_COMPUTED_11 0xfd17b448 84 | #define SECP256K1_G_PRE_COMPUTED_12 0x0e1108a8 85 | #define SECP256K1_G_PRE_COMPUTED_13 0x5da4fbfc 86 | #define SECP256K1_G_PRE_COMPUTED_14 0x26a3c465 87 | #define SECP256K1_G_PRE_COMPUTED_15 0x483ada77 88 | 89 | // -y1: B7C52588 D95C3B9A A25B0403 F1EEF757 02E84BB7 597AABE6 63B82F6F 04EF2777 90 | // -y1: B7C52588D95C3B9AA25B0403F1EEF75702E84BB7597AABE663B82F6F04EF2777 91 | #define SECP256K1_G_PRE_COMPUTED_16 0x04ef2777 92 | #define SECP256K1_G_PRE_COMPUTED_17 0x63b82f6f 93 | #define SECP256K1_G_PRE_COMPUTED_18 0x597aabe6 94 | #define 
SECP256K1_G_PRE_COMPUTED_19 0x02e84bb7 95 | #define SECP256K1_G_PRE_COMPUTED_20 0xf1eef757 96 | #define SECP256K1_G_PRE_COMPUTED_21 0xa25b0403 97 | #define SECP256K1_G_PRE_COMPUTED_22 0xd95c3b9a 98 | #define SECP256K1_G_PRE_COMPUTED_23 0xb7c52588 99 | 100 | // x3: F9308A01 9258C310 49344F85 F89D5229 B531C845 836F99B0 8601F113 BCE036F9 101 | // x3: F9308A019258C31049344F85F89D5229B531C845836F99B08601F113BCE036F9 102 | #define SECP256K1_G_PRE_COMPUTED_24 0xbce036f9 103 | #define SECP256K1_G_PRE_COMPUTED_25 0x8601f113 104 | #define SECP256K1_G_PRE_COMPUTED_26 0x836f99b0 105 | #define SECP256K1_G_PRE_COMPUTED_27 0xb531c845 106 | #define SECP256K1_G_PRE_COMPUTED_28 0xf89d5229 107 | #define SECP256K1_G_PRE_COMPUTED_29 0x49344f85 108 | #define SECP256K1_G_PRE_COMPUTED_30 0x9258c310 109 | #define SECP256K1_G_PRE_COMPUTED_31 0xf9308a01 110 | 111 | // y3: 388F7B0F 632DE814 0FE337E6 2A37F356 6500A999 34C2231B 6CB9FD75 84B8E672 112 | // y3: 388F7B0F632DE8140FE337E62A37F3566500A99934C2231B6CB9FD7584B8E672 113 | #define SECP256K1_G_PRE_COMPUTED_32 0x84b8e672 114 | #define SECP256K1_G_PRE_COMPUTED_33 0x6cb9fd75 115 | #define SECP256K1_G_PRE_COMPUTED_34 0x34c2231b 116 | #define SECP256K1_G_PRE_COMPUTED_35 0x6500a999 117 | #define SECP256K1_G_PRE_COMPUTED_36 0x2a37f356 118 | #define SECP256K1_G_PRE_COMPUTED_37 0x0fe337e6 119 | #define SECP256K1_G_PRE_COMPUTED_38 0x632de814 120 | #define SECP256K1_G_PRE_COMPUTED_39 0x388f7b0f 121 | 122 | // -y3: C77084F0 9CD217EB F01CC819 D5C80CA9 9AFF5666 CB3DDCE4 93460289 7B4715BD 123 | // -y3: C77084F09CD217EBF01CC819D5C80CA99AFF5666CB3DDCE4934602897B4715BD 124 | #define SECP256K1_G_PRE_COMPUTED_40 0x7b4715bd 125 | #define SECP256K1_G_PRE_COMPUTED_41 0x93460289 126 | #define SECP256K1_G_PRE_COMPUTED_42 0xcb3ddce4 127 | #define SECP256K1_G_PRE_COMPUTED_43 0x9aff5666 128 | #define SECP256K1_G_PRE_COMPUTED_44 0xd5c80ca9 129 | #define SECP256K1_G_PRE_COMPUTED_45 0xf01cc819 130 | #define SECP256K1_G_PRE_COMPUTED_46 0x9cd217eb 131 | #define SECP256K1_G_PRE_COMPUTED_47 0xc77084f0 132 | 133 | // x5: 2F8BDE4D 1A072093 55B4A725 0A5C5128 E88B84BD DC619AB7 CBA8D569 B240EFE4 134 | // x5: 2F8BDE4D1A07209355B4A7250A5C5128E88B84BDDC619AB7CBA8D569B240EFE4 135 | #define SECP256K1_G_PRE_COMPUTED_48 0xb240efe4 136 | #define SECP256K1_G_PRE_COMPUTED_49 0xcba8d569 137 | #define SECP256K1_G_PRE_COMPUTED_50 0xdc619ab7 138 | #define SECP256K1_G_PRE_COMPUTED_51 0xe88b84bd 139 | #define SECP256K1_G_PRE_COMPUTED_52 0x0a5c5128 140 | #define SECP256K1_G_PRE_COMPUTED_53 0x55b4a725 141 | #define SECP256K1_G_PRE_COMPUTED_54 0x1a072093 142 | #define SECP256K1_G_PRE_COMPUTED_55 0x2f8bde4d 143 | 144 | // y5: D8AC2226 36E5E3D6 D4DBA9DD A6C9C426 F788271B AB0D6840 DCA87D3A A6AC62D6 145 | // y5: D8AC222636E5E3D6D4DBA9DDA6C9C426F788271BAB0D6840DCA87D3AA6AC62D6 146 | #define SECP256K1_G_PRE_COMPUTED_56 0xa6ac62d6 147 | #define SECP256K1_G_PRE_COMPUTED_57 0xdca87d3a 148 | #define SECP256K1_G_PRE_COMPUTED_58 0xab0d6840 149 | #define SECP256K1_G_PRE_COMPUTED_59 0xf788271b 150 | #define SECP256K1_G_PRE_COMPUTED_60 0xa6c9c426 151 | #define SECP256K1_G_PRE_COMPUTED_61 0xd4dba9dd 152 | #define SECP256K1_G_PRE_COMPUTED_62 0x36e5e3d6 153 | #define SECP256K1_G_PRE_COMPUTED_63 0xd8ac2226 154 | 155 | // -y5: 2753DDD9 C91A1C29 2B245622 59363BD9 0877D8E4 54F297BF 235782C4 59539959 156 | // -y5: 2753DDD9C91A1C292B24562259363BD90877D8E454F297BF235782C459539959 157 | #define SECP256K1_G_PRE_COMPUTED_64 0x59539959 158 | #define SECP256K1_G_PRE_COMPUTED_65 0x235782c4 159 | #define SECP256K1_G_PRE_COMPUTED_66 0x54f297bf 160 | 
#define SECP256K1_G_PRE_COMPUTED_67 0x0877d8e4 161 | #define SECP256K1_G_PRE_COMPUTED_68 0x59363bd9 162 | #define SECP256K1_G_PRE_COMPUTED_69 0x2b245622 163 | #define SECP256K1_G_PRE_COMPUTED_70 0xc91a1c29 164 | #define SECP256K1_G_PRE_COMPUTED_71 0x2753ddd9 165 | 166 | // x7: 5CBDF064 6E5DB4EA A398F365 F2EA7A0E 3D419B7E 0330E39C E92BDDED CAC4F9BC 167 | // x7: 5CBDF0646E5DB4EAA398F365F2EA7A0E3D419B7E0330E39CE92BDDEDCAC4F9BC 168 | #define SECP256K1_G_PRE_COMPUTED_72 0xcac4f9bc 169 | #define SECP256K1_G_PRE_COMPUTED_73 0xe92bdded 170 | #define SECP256K1_G_PRE_COMPUTED_74 0x0330e39c 171 | #define SECP256K1_G_PRE_COMPUTED_75 0x3d419b7e 172 | #define SECP256K1_G_PRE_COMPUTED_76 0xf2ea7a0e 173 | #define SECP256K1_G_PRE_COMPUTED_77 0xa398f365 174 | #define SECP256K1_G_PRE_COMPUTED_78 0x6e5db4ea 175 | #define SECP256K1_G_PRE_COMPUTED_79 0x5cbdf064 176 | 177 | // y7: 6AEBCA40 BA255960 A3178D6D 861A54DB A813D0B8 13FDE7B5 A5082628 087264DA 178 | // y7: 6AEBCA40BA255960A3178D6D861A54DBA813D0B813FDE7B5A5082628087264DA 179 | #define SECP256K1_G_PRE_COMPUTED_80 0x087264da 180 | #define SECP256K1_G_PRE_COMPUTED_81 0xa5082628 181 | #define SECP256K1_G_PRE_COMPUTED_82 0x13fde7b5 182 | #define SECP256K1_G_PRE_COMPUTED_83 0xa813d0b8 183 | #define SECP256K1_G_PRE_COMPUTED_84 0x861a54db 184 | #define SECP256K1_G_PRE_COMPUTED_85 0xa3178d6d 185 | #define SECP256K1_G_PRE_COMPUTED_86 0xba255960 186 | #define SECP256K1_G_PRE_COMPUTED_87 0x6aebca40 187 | 188 | // -y7: 951435BF 45DAA69F 5CE87292 79E5AB24 57EC2F47 EC02184A 5AF7D9D6 F78D9755 189 | // -y7: 951435BF45DAA69F5CE8729279E5AB2457EC2F47EC02184A5AF7D9D6F78D9755 190 | #define SECP256K1_G_PRE_COMPUTED_88 0xf78d9755 191 | #define SECP256K1_G_PRE_COMPUTED_89 0x5af7d9d6 192 | #define SECP256K1_G_PRE_COMPUTED_90 0xec02184a 193 | #define SECP256K1_G_PRE_COMPUTED_91 0x57ec2f47 194 | #define SECP256K1_G_PRE_COMPUTED_92 0x79e5ab24 195 | #define SECP256K1_G_PRE_COMPUTED_93 0x5ce87292 196 | #define SECP256K1_G_PRE_COMPUTED_94 0x45daa69f 197 | #define SECP256K1_G_PRE_COMPUTED_95 0x951435bf 198 | 199 | #define SECP256K1_PRE_COMPUTED_XY_SIZE 96 200 | #define SECP256K1_NAF_SIZE 33 // 32+1, we need one extra slot 201 | 202 | #define PUBLIC_KEY_LENGTH_WITHOUT_PARITY 8 203 | #define PUBLIC_KEY_LENGTH_X_Y_WITHOUT_PARITY 16 204 | // 8+1 to make room for the parity 205 | #define PUBLIC_KEY_LENGTH_WITH_PARITY 9 206 | 207 | // (32*8 == 256) 208 | #define PRIVATE_KEY_LENGTH 8 209 | 210 | // change the type of input/tmps in your kernel (e.g. 
PRIVATE_AS / CONSTANT_AS): 211 | #ifndef SECP256K1_TMPS_TYPE 212 | #define SECP256K1_TMPS_TYPE GLOBAL_AS 213 | #endif 214 | 215 | typedef struct secp256k1 216 | { 217 | u32 xy[SECP256K1_PRE_COMPUTED_XY_SIZE]; // pre-computed points: (x1,y1,-y1),(x3,y3,-y3),(x5,y5,-y5),(x7,y7,-y7) 218 | 219 | } secp256k1_t; 220 | 221 | 222 | DECLSPEC u32 transform_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, const u32 first_byte); 223 | DECLSPEC u32 parse_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *k); 224 | 225 | DECLSPEC void point_mul_xy (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps); 226 | DECLSPEC void point_mul (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps); 227 | 228 | DECLSPEC void set_precomputed_basepoint_g (PRIVATE_AS secp256k1_t *r); 229 | 230 | #endif // INC_ECC_SECP256K1_H 231 | -------------------------------------------------------------------------------- /secp256k1/inc_ecc_secp256k1.cl: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | * 5 | * Furthermore, since elliptic curve operations are highly researched and optimized, 6 | * we've consulted a lot of online resources to implement this, including several papers and 7 | * example code. 8 | * 9 | * Credits where credits are due: there are a lot of nice projects that explain and/or optimize 10 | * elliptic curve operations (especially elliptic curve multiplications by a scalar). 11 | * 12 | * We want to shout out following projects, which were quite helpful when implementing this: 13 | * - secp256k1 by Pieter Wuille (https://github.com/bitcoin-core/secp256k1/, MIT) 14 | * - secp256k1-cl by hhanh00 (https://github.com/hhanh00/secp256k1-cl/, MIT) 15 | * - ec_pure_c by masterzorag (https://github.com/masterzorag/ec_pure_c/) 16 | * - ecc-gmp by leivaburto (https://github.com/leivaburto/ecc-gmp) 17 | * - micro-ecc by Ken MacKay (https://github.com/kmackay/micro-ecc/, BSD) 18 | * - curve_example by willem (https://gist.github.com/nlitsme/c9031c7b9bf6bb009e5a) 19 | * - py_ecc by Vitalik Buterin (https://github.com/ethereum/py_ecc/, MIT) 20 | * 21 | * 22 | * Some BigNum operations are implemented similar to micro-ecc which is licensed under these terms: 23 | * Copyright 2014 Ken MacKay, 2-Clause BSD License 24 | * 25 | * Redistribution and use in source and binary forms, with or without modification, are permitted 26 | * provided that the following conditions are met: 27 | * 28 | * 1. Redistributions of source code must retain the above copyright notice, this list of 29 | * conditions and the following disclaimer. 30 | * 31 | * 2. Redistributions in binary form must reproduce the above copyright notice, this list of 32 | * conditions and the following disclaimer in the documentation and/or other materials 33 | * provided with the distribution. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 36 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 37 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 38 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 39 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 40 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 41 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 42 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 43 | * POSSIBILITY OF SUCH DAMAGE. 44 | */ 45 | 46 | /* 47 | * ATTENTION: this code is NOT meant to be used in security critical environments that are at risk 48 | * of side-channel or timing attacks etc, it's only purpose is to make it work fast for GPGPU 49 | * (OpenCL/CUDA). Some attack vectors like side-channel and timing-attacks might be possible, 50 | * because of some optimizations used within this code (non-constant time etc). 51 | */ 52 | 53 | /* 54 | * Implementation considerations: 55 | * point double and point add are implemented similar to algorithms mentioned in this 2011 paper: 56 | * http://eprint.iacr.org/2011/338.pdf 57 | * (Fast and Regular Algorithms for Scalar Multiplication over Elliptic Curves by Matthieu Rivain) 58 | * 59 | * In theory we could use the Jacobian Co-Z enhancement to get rid of the larger buffer caused by 60 | * the z coordinates (and in this way reduce register pressure etc). 61 | * For the Co-Z improvement there are a lot of fast algorithms, but we might still be faster 62 | * with this implementation (b/c we allow non-constant time) without the Brier/Joye Montgomery-like 63 | * ladder. Of course, this claim would need to be verified and tested to see which one is faster 64 | * for our specific scenario at the end. 65 | * 66 | * We accomplish a "little" speedup by using scalars converted to w-NAF (non-adjacent form): 67 | * The general idea of w-NAF is to pre-compute some zi coefficients like below to reduce the 68 | * costly point additions by using a non-binary ("signed") number system (values other than just 69 | * 0 and 1, but ranging from -2^(w-1)-1 to 2^(w-1)-1). This works best with the left-to-right 70 | * binary algorithm such that we just add zi * P when adding point P (we pre-compute all the 71 | * possible zi * P values because the x/y coordinates are known before the kernel starts): 72 | * 73 | * // Example with window size w = 2 (i.e. 
mod 4 => & 3): 74 | * // 173 => 1 0 -1 0 -1 0 -1 0 1 = 2^8 - 2^6 - 2^4 - 2^2 + 1 75 | * int e = 0b10101101; // 173 76 | * int z[8 + 1] = { 0 }; // our zi/di, we need one extra slot to make the subtraction work 77 | * 78 | * int i = 0; 79 | * 80 | * while (e) 81 | * { 82 | * if (e & 1) 83 | * { 84 | * // for window size w = 3 it would be: 85 | * // => 2^(w-0) = 2^3 = 8 86 | * // => 2^(w-1) = 2^2 = 4 87 | * 88 | * int bit; // = 2 - (e & 3) for w = 2 89 | * 90 | * if ((e & 3) >= 2) // e % 4 == e & 3, use (e & 7) >= 4 for w = 3 91 | * bit = (e & 3) - 4; // (e & 7) - 8 for w = 3 92 | * else 93 | * bit = e & 3; // e & 7 for w = 3 94 | * 95 | * z[i] = bit; 96 | * e -= bit; 97 | * } 98 | * 99 | * e >>= 1; // e / 2 100 | * i++; 101 | * } 102 | */ 103 | 104 | #include "inc_ecc_secp256k1.h" 105 | 106 | DECLSPEC u32 sub (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 107 | { 108 | u32 c = 0; // carry/borrow 109 | 110 | #if defined IS_NV && HAS_SUB == 1 && HAS_SUBC == 1 111 | asm volatile 112 | ( 113 | "sub.cc.u32 %0, %9, %17;" 114 | "subc.cc.u32 %1, %10, %18;" 115 | "subc.cc.u32 %2, %11, %19;" 116 | "subc.cc.u32 %3, %12, %20;" 117 | "subc.cc.u32 %4, %13, %21;" 118 | "subc.cc.u32 %5, %14, %22;" 119 | "subc.cc.u32 %6, %15, %23;" 120 | "subc.cc.u32 %7, %16, %24;" 121 | "subc.u32 %8, 0, 0;" 122 | : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), 123 | "=r"(c) 124 | : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), 125 | "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) 126 | ); 127 | // HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm 128 | //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 129 | #elif 0 130 | __asm__ __volatile__ 131 | ( 132 | "V_SUB_U32 %0, %9, %17;" 133 | "V_SUBB_U32 %1, %10, %18;" 134 | "V_SUBB_U32 %2, %11, %19;" 135 | "V_SUBB_U32 %3, %12, %20;" 136 | "V_SUBB_U32 %4, %13, %21;" 137 | "V_SUBB_U32 %5, %14, %22;" 138 | "V_SUBB_U32 %6, %15, %23;" 139 | "V_SUBB_U32 %7, %16, %24;" 140 | "V_SUBB_U32 %8, 0, 0;" 141 | : "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]), 142 | "=v"(c) 143 | : "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]), 144 | "v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7]) 145 | ); 146 | #else 147 | for (u32 i = 0; i < 8; i++) 148 | { 149 | const u32 diff = a[i] - b[i] - c; 150 | 151 | if (diff != a[i]) c = (diff > a[i]); 152 | 153 | r[i] = diff; 154 | } 155 | #endif 156 | 157 | return c; 158 | } 159 | 160 | DECLSPEC u32 add (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 161 | { 162 | u32 c = 0; // carry/borrow 163 | 164 | #if defined IS_NV && HAS_ADD == 1 && HAS_ADDC == 1 165 | asm volatile 166 | ( 167 | "add.cc.u32 %0, %9, %17;" 168 | "addc.cc.u32 %1, %10, %18;" 169 | "addc.cc.u32 %2, %11, %19;" 170 | "addc.cc.u32 %3, %12, %20;" 171 | "addc.cc.u32 %4, %13, %21;" 172 | "addc.cc.u32 %5, %14, %22;" 173 | "addc.cc.u32 %6, %15, %23;" 174 | "addc.cc.u32 %7, %16, %24;" 175 | "addc.u32 %8, 0, 0;" 176 | : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), 177 | "=r"(c) 178 | : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), 179 | "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) 180 | ); 181 | // HIP doesnt 
support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm 182 | //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 183 | #elif 0 184 | __asm__ __volatile__ 185 | ( 186 | "V_ADD_U32 %0, %9, %17;" 187 | "V_ADDC_U32 %1, %10, %18;" 188 | "V_ADDC_U32 %2, %11, %19;" 189 | "V_ADDC_U32 %3, %12, %20;" 190 | "V_ADDC_U32 %4, %13, %21;" 191 | "V_ADDC_U32 %5, %14, %22;" 192 | "V_ADDC_U32 %6, %15, %23;" 193 | "V_ADDC_U32 %7, %16, %24;" 194 | "V_ADDC_U32 %8, 0, 0;" 195 | : "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]), 196 | "=v"(c) 197 | : "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]), 198 | "v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7]) 199 | ); 200 | #else 201 | for (u32 i = 0; i < 8; i++) 202 | { 203 | const u32 t = a[i] + b[i] + c; 204 | 205 | if (t != a[i]) c = (t < a[i]); 206 | 207 | r[i] = t; 208 | } 209 | #endif 210 | 211 | return c; 212 | } 213 | 214 | DECLSPEC void sub_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 215 | { 216 | const u32 c = sub (r, a, b); // carry 217 | 218 | if (c) 219 | { 220 | u32 t[8]; 221 | 222 | t[0] = SECP256K1_P0; 223 | t[1] = SECP256K1_P1; 224 | t[2] = SECP256K1_P2; 225 | t[3] = SECP256K1_P3; 226 | t[4] = SECP256K1_P4; 227 | t[5] = SECP256K1_P5; 228 | t[6] = SECP256K1_P6; 229 | t[7] = SECP256K1_P7; 230 | 231 | add (r, r, t); 232 | } 233 | } 234 | 235 | DECLSPEC void add_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) 236 | { 237 | const u32 c = add (r, a, b); // carry 238 | 239 | /* 240 | * Modulo operation: 241 | */ 242 | 243 | // note: we could have an early exit in case of c == 1 => sub () 244 | 245 | u32 t[8]; 246 | 247 | t[0] = SECP256K1_P0; 248 | t[1] = SECP256K1_P1; 249 | t[2] = SECP256K1_P2; 250 | t[3] = SECP256K1_P3; 251 | t[4] = SECP256K1_P4; 252 | t[5] = SECP256K1_P5; 253 | t[6] = SECP256K1_P6; 254 | t[7] = SECP256K1_P7; 255 | 256 | // check if modulo operation is needed 257 | 258 | u32 mod = 1; 259 | 260 | if (c == 0) 261 | { 262 | for (int i = 7; i >= 0; i--) 263 | { 264 | if (r[i] < t[i]) 265 | { 266 | mod = 0; 267 | 268 | break; // or return ! 
(check if faster) 269 | } 270 | 271 | if (r[i] > t[i]) break; 272 | } 273 | } 274 | 275 | if (mod == 1) 276 | { 277 | sub (r, r, t); 278 | } 279 | } 280 | 281 | DECLSPEC void mod_512 (PRIVATE_AS u32 *n) 282 | { 283 | // we need to perform a modulo operation with 512-bit % 256-bit (bignum modulo): 284 | // the modulus is the secp256k1 group order 285 | 286 | // ATTENTION: for this function the byte-order is reversed (most significant bytes 287 | // at the left) 288 | 289 | /* 290 | the general modulo by shift and substract code (a = a % b): 291 | 292 | x = b; 293 | 294 | t = a >> 1; 295 | 296 | while (x <= t) x <<= 1; 297 | 298 | while (a >= b) 299 | { 300 | if (a >= x) a -= x; 301 | 302 | x >>= 1; 303 | } 304 | 305 | return a; // remainder 306 | */ 307 | 308 | u32 a[16]; 309 | 310 | a[ 0] = n[ 0]; 311 | a[ 1] = n[ 1]; 312 | a[ 2] = n[ 2]; 313 | a[ 3] = n[ 3]; 314 | a[ 4] = n[ 4]; 315 | a[ 5] = n[ 5]; 316 | a[ 6] = n[ 6]; 317 | a[ 7] = n[ 7]; 318 | a[ 8] = n[ 8]; 319 | a[ 9] = n[ 9]; 320 | a[10] = n[10]; 321 | a[11] = n[11]; 322 | a[12] = n[12]; 323 | a[13] = n[13]; 324 | a[14] = n[14]; 325 | a[15] = n[15]; 326 | 327 | u32 b[16]; 328 | 329 | b[ 0] = 0x00000000; 330 | b[ 1] = 0x00000000; 331 | b[ 2] = 0x00000000; 332 | b[ 3] = 0x00000000; 333 | b[ 4] = 0x00000000; 334 | b[ 5] = 0x00000000; 335 | b[ 6] = 0x00000000; 336 | b[ 7] = 0x00000000; 337 | b[ 8] = SECP256K1_N7; 338 | b[ 9] = SECP256K1_N6; 339 | b[10] = SECP256K1_N5; 340 | b[11] = SECP256K1_N4; 341 | b[12] = SECP256K1_N3; 342 | b[13] = SECP256K1_N2; 343 | b[14] = SECP256K1_N1; 344 | b[15] = SECP256K1_N0; 345 | 346 | /* 347 | * Start: 348 | */ 349 | 350 | // x = b (but with a fast "shift" trick to avoid the while loop) 351 | 352 | u32 x[16]; 353 | 354 | x[ 0] = b[ 8]; // this is a trick: we just put the group order's most significant bit all the 355 | x[ 1] = b[ 9]; // way to the top to avoid doing the initial: while (x <= t) x <<= 1 356 | x[ 2] = b[10]; 357 | x[ 3] = b[11]; 358 | x[ 4] = b[12]; 359 | x[ 5] = b[13]; 360 | x[ 6] = b[14]; 361 | x[ 7] = b[15]; 362 | x[ 8] = 0x00000000; 363 | x[ 9] = 0x00000000; 364 | x[10] = 0x00000000; 365 | x[11] = 0x00000000; 366 | x[12] = 0x00000000; 367 | x[13] = 0x00000000; 368 | x[14] = 0x00000000; 369 | x[15] = 0x00000000; 370 | 371 | // a >= b 372 | 373 | while (a[0] >= b[0]) 374 | { 375 | u32 l00 = a[ 0] < b[ 0]; 376 | u32 l01 = a[ 1] < b[ 1]; 377 | u32 l02 = a[ 2] < b[ 2]; 378 | u32 l03 = a[ 3] < b[ 3]; 379 | u32 l04 = a[ 4] < b[ 4]; 380 | u32 l05 = a[ 5] < b[ 5]; 381 | u32 l06 = a[ 6] < b[ 6]; 382 | u32 l07 = a[ 7] < b[ 7]; 383 | u32 l08 = a[ 8] < b[ 8]; 384 | u32 l09 = a[ 9] < b[ 9]; 385 | u32 l10 = a[10] < b[10]; 386 | u32 l11 = a[11] < b[11]; 387 | u32 l12 = a[12] < b[12]; 388 | u32 l13 = a[13] < b[13]; 389 | u32 l14 = a[14] < b[14]; 390 | u32 l15 = a[15] < b[15]; 391 | 392 | u32 e00 = a[ 0] == b[ 0]; 393 | u32 e01 = a[ 1] == b[ 1]; 394 | u32 e02 = a[ 2] == b[ 2]; 395 | u32 e03 = a[ 3] == b[ 3]; 396 | u32 e04 = a[ 4] == b[ 4]; 397 | u32 e05 = a[ 5] == b[ 5]; 398 | u32 e06 = a[ 6] == b[ 6]; 399 | u32 e07 = a[ 7] == b[ 7]; 400 | u32 e08 = a[ 8] == b[ 8]; 401 | u32 e09 = a[ 9] == b[ 9]; 402 | u32 e10 = a[10] == b[10]; 403 | u32 e11 = a[11] == b[11]; 404 | u32 e12 = a[12] == b[12]; 405 | u32 e13 = a[13] == b[13]; 406 | u32 e14 = a[14] == b[14]; 407 | 408 | if (l00) break; 409 | if (l01 && e00) break; 410 | if (l02 && e00 && e01) break; 411 | if (l03 && e00 && e01 && e02) break; 412 | if (l04 && e00 && e01 && e02 && e03) break; 413 | if (l05 && e00 && e01 && e02 && e03 && e04) break; 414 | if 
(l06 && e00 && e01 && e02 && e03 && e04 && e05) break; 415 | if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) break; 416 | if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) break; 417 | if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) break; 418 | if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) break; 419 | if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) break; 420 | if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) break; 421 | if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) break; 422 | if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) break; 423 | if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) break; 424 | 425 | // r = x (copy it to have the original values for the subtraction) 426 | 427 | u32 r[16]; 428 | 429 | r[ 0] = x[ 0]; 430 | r[ 1] = x[ 1]; 431 | r[ 2] = x[ 2]; 432 | r[ 3] = x[ 3]; 433 | r[ 4] = x[ 4]; 434 | r[ 5] = x[ 5]; 435 | r[ 6] = x[ 6]; 436 | r[ 7] = x[ 7]; 437 | r[ 8] = x[ 8]; 438 | r[ 9] = x[ 9]; 439 | r[10] = x[10]; 440 | r[11] = x[11]; 441 | r[12] = x[12]; 442 | r[13] = x[13]; 443 | r[14] = x[14]; 444 | r[15] = x[15]; 445 | 446 | // x <<= 1 447 | 448 | x[15] = x[15] >> 1 | x[14] << 31; 449 | x[14] = x[14] >> 1 | x[13] << 31; 450 | x[13] = x[13] >> 1 | x[12] << 31; 451 | x[12] = x[12] >> 1 | x[11] << 31; 452 | x[11] = x[11] >> 1 | x[10] << 31; 453 | x[10] = x[10] >> 1 | x[ 9] << 31; 454 | x[ 9] = x[ 9] >> 1 | x[ 8] << 31; 455 | x[ 8] = x[ 8] >> 1 | x[ 7] << 31; 456 | x[ 7] = x[ 7] >> 1 | x[ 6] << 31; 457 | x[ 6] = x[ 6] >> 1 | x[ 5] << 31; 458 | x[ 5] = x[ 5] >> 1 | x[ 4] << 31; 459 | x[ 4] = x[ 4] >> 1 | x[ 3] << 31; 460 | x[ 3] = x[ 3] >> 1 | x[ 2] << 31; 461 | x[ 2] = x[ 2] >> 1 | x[ 1] << 31; 462 | x[ 1] = x[ 1] >> 1 | x[ 0] << 31; 463 | x[ 0] = x[ 0] >> 1; 464 | 465 | // if (a >= r) a -= r; 466 | 467 | l00 = a[ 0] < r[ 0]; 468 | l01 = a[ 1] < r[ 1]; 469 | l02 = a[ 2] < r[ 2]; 470 | l03 = a[ 3] < r[ 3]; 471 | l04 = a[ 4] < r[ 4]; 472 | l05 = a[ 5] < r[ 5]; 473 | l06 = a[ 6] < r[ 6]; 474 | l07 = a[ 7] < r[ 7]; 475 | l08 = a[ 8] < r[ 8]; 476 | l09 = a[ 9] < r[ 9]; 477 | l10 = a[10] < r[10]; 478 | l11 = a[11] < r[11]; 479 | l12 = a[12] < r[12]; 480 | l13 = a[13] < r[13]; 481 | l14 = a[14] < r[14]; 482 | l15 = a[15] < r[15]; 483 | 484 | e00 = a[ 0] == r[ 0]; 485 | e01 = a[ 1] == r[ 1]; 486 | e02 = a[ 2] == r[ 2]; 487 | e03 = a[ 3] == r[ 3]; 488 | e04 = a[ 4] == r[ 4]; 489 | e05 = a[ 5] == r[ 5]; 490 | e06 = a[ 6] == r[ 6]; 491 | e07 = a[ 7] == r[ 7]; 492 | e08 = a[ 8] == r[ 8]; 493 | e09 = a[ 9] == r[ 9]; 494 | e10 = a[10] == r[10]; 495 | e11 = a[11] == r[11]; 496 | e12 = a[12] == r[12]; 497 | e13 = a[13] == r[13]; 498 | e14 = a[14] == r[14]; 499 | 500 | if (l00) continue; 501 | if (l01 && e00) continue; 502 | if (l02 && e00 && e01) continue; 503 | if (l03 && e00 && e01 && e02) continue; 504 | if (l04 && e00 && e01 && e02 && e03) continue; 505 | if (l05 && e00 && e01 && e02 && e03 && e04) continue; 506 | if (l06 && e00 && e01 && e02 && e03 && e04 && e05) continue; 507 | if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) continue; 508 | if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) continue; 509 | if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) continue; 510 | if (l10 && e00 && e01 && e02 && 
e03 && e04 && e05 && e06 && e07 && e08 && e09) continue; 511 | if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) continue; 512 | if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) continue; 513 | if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) continue; 514 | if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) continue; 515 | if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) continue; 516 | 517 | // substract (a -= r): 518 | 519 | if ((r[ 0] | r[ 1] | r[ 2] | r[ 3] | r[ 4] | r[ 5] | r[ 6] | r[ 7] | 520 | r[ 8] | r[ 9] | r[10] | r[11] | r[12] | r[13] | r[14] | r[15]) == 0) break; 521 | 522 | r[ 0] = a[ 0] - r[ 0]; 523 | r[ 1] = a[ 1] - r[ 1]; 524 | r[ 2] = a[ 2] - r[ 2]; 525 | r[ 3] = a[ 3] - r[ 3]; 526 | r[ 4] = a[ 4] - r[ 4]; 527 | r[ 5] = a[ 5] - r[ 5]; 528 | r[ 6] = a[ 6] - r[ 6]; 529 | r[ 7] = a[ 7] - r[ 7]; 530 | r[ 8] = a[ 8] - r[ 8]; 531 | r[ 9] = a[ 9] - r[ 9]; 532 | r[10] = a[10] - r[10]; 533 | r[11] = a[11] - r[11]; 534 | r[12] = a[12] - r[12]; 535 | r[13] = a[13] - r[13]; 536 | r[14] = a[14] - r[14]; 537 | r[15] = a[15] - r[15]; 538 | 539 | // take care of the "borrow" (we can't do it the other way around 15...1 because r[x] is changed!) 540 | 541 | if (r[ 1] > a[ 1]) r[ 0]--; 542 | if (r[ 2] > a[ 2]) r[ 1]--; 543 | if (r[ 3] > a[ 3]) r[ 2]--; 544 | if (r[ 4] > a[ 4]) r[ 3]--; 545 | if (r[ 5] > a[ 5]) r[ 4]--; 546 | if (r[ 6] > a[ 6]) r[ 5]--; 547 | if (r[ 7] > a[ 7]) r[ 6]--; 548 | if (r[ 8] > a[ 8]) r[ 7]--; 549 | if (r[ 9] > a[ 9]) r[ 8]--; 550 | if (r[10] > a[10]) r[ 9]--; 551 | if (r[11] > a[11]) r[10]--; 552 | if (r[12] > a[12]) r[11]--; 553 | if (r[13] > a[13]) r[12]--; 554 | if (r[14] > a[14]) r[13]--; 555 | if (r[15] > a[15]) r[14]--; 556 | 557 | a[ 0] = r[ 0]; 558 | a[ 1] = r[ 1]; 559 | a[ 2] = r[ 2]; 560 | a[ 3] = r[ 3]; 561 | a[ 4] = r[ 4]; 562 | a[ 5] = r[ 5]; 563 | a[ 6] = r[ 6]; 564 | a[ 7] = r[ 7]; 565 | a[ 8] = r[ 8]; 566 | a[ 9] = r[ 9]; 567 | a[10] = r[10]; 568 | a[11] = r[11]; 569 | a[12] = r[12]; 570 | a[13] = r[13]; 571 | a[14] = r[14]; 572 | a[15] = r[15]; 573 | } 574 | 575 | n[ 0] = a[ 0]; 576 | n[ 1] = a[ 1]; 577 | n[ 2] = a[ 2]; 578 | n[ 3] = a[ 3]; 579 | n[ 4] = a[ 4]; 580 | n[ 5] = a[ 5]; 581 | n[ 6] = a[ 6]; 582 | n[ 7] = a[ 7]; 583 | n[ 8] = a[ 8]; 584 | n[ 9] = a[ 9]; 585 | n[10] = a[10]; 586 | n[11] = a[11]; 587 | n[12] = a[12]; 588 | n[13] = a[13]; 589 | n[14] = a[14]; 590 | n[15] = a[15]; 591 | } 592 | 593 | DECLSPEC void mul_mod (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *a, PRIVATE_AS const u32 *b) // TODO get rid of u64 ? 
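The mod_512 () routine above unrolls the shift-and-subtract reduction sketched in its leading comment across sixteen u32 limbs, which makes the control flow hard to follow. As a point of reference, here is the same loop structure on ordinary 64-bit integers — a host-side sketch of my own, not part of the kernel; mod_shift_subtract () is a hypothetical helper and the b == 0 guard is an addition for this sketch only.

```c
#include <stdint.h>
#include <stdio.h>

/* generic a = a % b by shifting the divisor up and subtracting it back down,
 * exactly as described in the comment at the top of mod_512 () */
static uint64_t mod_shift_subtract (uint64_t a, const uint64_t b)
{
  if (b == 0) return a;       // guard for this sketch only; the kernel never divides by zero

  uint64_t x = b;

  const uint64_t t = a >> 1;

  while (x <= t) x <<= 1;     // align the divisor just below a (mod_512 () avoids this loop with its copy trick)

  while (a >= b)
  {
    if (a >= x) a -= x;       // subtract the current shifted divisor whenever it still fits

    x >>= 1;
  }

  return a;                   // remainder
}

int main (void)
{
  // 50000 % 977 = 173 (977 = 0x3d1, the same constant that shows up again in mul_mod ())
  printf ("%llu\n", (unsigned long long) mod_shift_subtract (50000, 977));

  return 0;
}
```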
594 | { 595 | u32 t[16] = { 0 }; // we need up to double the space (2 * 8) 596 | 597 | /* 598 | * First start with the basic a * b multiplication: 599 | */ 600 | 601 | u32 t0 = 0; 602 | u32 t1 = 0; 603 | u32 c = 0; 604 | 605 | for (u32 i = 0; i < 8; i++) 606 | { 607 | for (u32 j = 0; j <= i; j++) 608 | { 609 | u64 p = ((u64) a[j]) * b[i - j]; 610 | 611 | u64 d = ((u64) t1) << 32 | t0; 612 | 613 | d += p; 614 | 615 | t0 = (u32) d; 616 | t1 = d >> 32; 617 | 618 | c += d < p; // carry 619 | } 620 | 621 | t[i] = t0; 622 | 623 | t0 = t1; 624 | t1 = c; 625 | 626 | c = 0; 627 | } 628 | 629 | for (u32 i = 8; i < 15; i++) 630 | { 631 | for (u32 j = i - 7; j < 8; j++) 632 | { 633 | u64 p = ((u64) a[j]) * b[i - j]; 634 | 635 | u64 d = ((u64) t1) << 32 | t0; 636 | 637 | d += p; 638 | 639 | t0 = (u32) d; 640 | t1 = d >> 32; 641 | 642 | c += d < p; 643 | } 644 | 645 | t[i] = t0; 646 | 647 | t0 = t1; 648 | t1 = c; 649 | 650 | c = 0; 651 | } 652 | 653 | t[15] = t0; 654 | 655 | 656 | 657 | /* 658 | * Now do the modulo operation: 659 | * (r = t % p) 660 | * 661 | * http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf (p.354 or p.9 in that document) 662 | */ 663 | 664 | u32 tmp[16] = { 0 }; 665 | 666 | // c = 0; 667 | 668 | // Note: SECP256K1_P = 2^256 - 2^32 - 977 (0x03d1 = 977) 669 | // multiply t[8]...t[15] by omega: 670 | 671 | for (u32 i = 0, j = 8; i < 8; i++, j++) 672 | { 673 | u64 p = ((u64) 0x03d1) * t[j] + c; 674 | 675 | tmp[i] = (u32) p; 676 | 677 | c = p >> 32; 678 | } 679 | 680 | tmp[8] = c; 681 | 682 | c = add (tmp + 1, tmp + 1, t + 8); // modifies tmp[1]...tmp[8] 683 | 684 | tmp[9] = c; 685 | 686 | 687 | // r = t + tmp 688 | 689 | c = add (r, t, tmp); 690 | 691 | // multiply t[0]...t[7] by omega: 692 | 693 | u32 c2 = 0; 694 | 695 | // memset (t, 0, sizeof (t)); 696 | 697 | for (u32 i = 0, j = 8; i < 8; i++, j++) 698 | { 699 | u64 p = ((u64) 0x3d1) * tmp[j] + c2; 700 | 701 | t[i] = (u32) p; 702 | 703 | c2 = p >> 32; 704 | } 705 | 706 | t[8] = c2; 707 | 708 | c2 = add (t + 1, t + 1, tmp + 8); // modifies t[1]...t[8] 709 | 710 | t[9] = c2; 711 | 712 | 713 | // r = r + t 714 | 715 | c2 = add (r, r, t); 716 | 717 | c += c2; 718 | 719 | t[0] = SECP256K1_P0; 720 | t[1] = SECP256K1_P1; 721 | t[2] = SECP256K1_P2; 722 | t[3] = SECP256K1_P3; 723 | t[4] = SECP256K1_P4; 724 | t[5] = SECP256K1_P5; 725 | t[6] = SECP256K1_P6; 726 | t[7] = SECP256K1_P7; 727 | 728 | for (u32 i = c; i > 0; i--) 729 | { 730 | sub (r, r, t); 731 | } 732 | 733 | for (int i = 7; i >= 0; i--) 734 | { 735 | if (r[i] < t[i]) break; 736 | 737 | if (r[i] > t[i]) 738 | { 739 | sub (r, r, t); 740 | 741 | break; 742 | } 743 | } 744 | } 745 | 746 | DECLSPEC void sqrt_mod (PRIVATE_AS u32 *r) 747 | { 748 | // Fermat's Little Theorem 749 | // secp256k1: y^2 = x^3 + 7 % p 750 | // y ^ (p - 1) = 1 751 | // y ^ (p - 1) = (y^2) ^ ((p - 1) / 2) = 1 => y^2 = (y^2) ^ (((p - 1) / 2) + 1) 752 | // => y = (y^2) ^ ((((p - 1) / 2) + 1) / 2) 753 | // y = (y^2) ^ (((p - 1 + 2) / 2) / 2) = (y^2) ^ ((p + 1) / 4) 754 | 755 | // y1 = (x^3 + 7) ^ ((p + 1) / 4) 756 | // y2 = p - y1 (or y2 = y1 * -1 % p) 757 | 758 | u32 s[8]; 759 | 760 | s[0] = SECP256K1_P0 + 1; // because of (p + 1) / 4 or use add (s, s, 1) 761 | s[1] = SECP256K1_P1; 762 | s[2] = SECP256K1_P2; 763 | s[3] = SECP256K1_P3; 764 | s[4] = SECP256K1_P4; 765 | s[5] = SECP256K1_P5; 766 | s[6] = SECP256K1_P6; 767 | s[7] = SECP256K1_P7; 768 | 769 | u32 t[8] = { 0 }; 770 | 771 | t[0] = 1; 772 | 773 | for (u32 i = 255; i > 1; i--) // we just skip the last 2 multiplications (=> exp / 4) 774 | { 775 | mul_mod (t, 
t, t); // r * r 776 | 777 | u32 idx = i >> 5; 778 | u32 mask = 1 << (i & 0x1f); 779 | 780 | if (s[idx] & mask) 781 | { 782 | mul_mod (t, t, r); // t * r 783 | } 784 | } 785 | 786 | r[0] = t[0]; 787 | r[1] = t[1]; 788 | r[2] = t[2]; 789 | r[3] = t[3]; 790 | r[4] = t[4]; 791 | r[5] = t[5]; 792 | r[6] = t[6]; 793 | r[7] = t[7]; 794 | } 795 | 796 | // (inverse (a, p) * a) % p == 1 (or think of a * a^-1 = a / a = 1) 797 | 798 | DECLSPEC void inv_mod (PRIVATE_AS u32 *a) 799 | { 800 | // How often does this really happen? it should "almost" never happen (but would be safer) 801 | // if ((a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0) return; 802 | 803 | u32 t0[8]; 804 | 805 | t0[0] = a[0]; 806 | t0[1] = a[1]; 807 | t0[2] = a[2]; 808 | t0[3] = a[3]; 809 | t0[4] = a[4]; 810 | t0[5] = a[5]; 811 | t0[6] = a[6]; 812 | t0[7] = a[7]; 813 | 814 | u32 p[8]; 815 | 816 | p[0] = SECP256K1_P0; 817 | p[1] = SECP256K1_P1; 818 | p[2] = SECP256K1_P2; 819 | p[3] = SECP256K1_P3; 820 | p[4] = SECP256K1_P4; 821 | p[5] = SECP256K1_P5; 822 | p[6] = SECP256K1_P6; 823 | p[7] = SECP256K1_P7; 824 | 825 | u32 t1[8]; 826 | 827 | t1[0] = SECP256K1_P0; 828 | t1[1] = SECP256K1_P1; 829 | t1[2] = SECP256K1_P2; 830 | t1[3] = SECP256K1_P3; 831 | t1[4] = SECP256K1_P4; 832 | t1[5] = SECP256K1_P5; 833 | t1[6] = SECP256K1_P6; 834 | t1[7] = SECP256K1_P7; 835 | 836 | u32 t2[8] = { 0 }; 837 | 838 | t2[0] = 0x00000001; 839 | 840 | u32 t3[8] = { 0 }; 841 | 842 | u32 b = (t0[0] != t1[0]) 843 | | (t0[1] != t1[1]) 844 | | (t0[2] != t1[2]) 845 | | (t0[3] != t1[3]) 846 | | (t0[4] != t1[4]) 847 | | (t0[5] != t1[5]) 848 | | (t0[6] != t1[6]) 849 | | (t0[7] != t1[7]); 850 | 851 | while (b) 852 | { 853 | if ((t0[0] & 1) == 0) // even 854 | { 855 | t0[0] = t0[0] >> 1 | t0[1] << 31; 856 | t0[1] = t0[1] >> 1 | t0[2] << 31; 857 | t0[2] = t0[2] >> 1 | t0[3] << 31; 858 | t0[3] = t0[3] >> 1 | t0[4] << 31; 859 | t0[4] = t0[4] >> 1 | t0[5] << 31; 860 | t0[5] = t0[5] >> 1 | t0[6] << 31; 861 | t0[6] = t0[6] >> 1 | t0[7] << 31; 862 | t0[7] = t0[7] >> 1; 863 | 864 | u32 c = 0; 865 | 866 | if (t2[0] & 1) c = add (t2, t2, p); 867 | 868 | t2[0] = t2[0] >> 1 | t2[1] << 31; 869 | t2[1] = t2[1] >> 1 | t2[2] << 31; 870 | t2[2] = t2[2] >> 1 | t2[3] << 31; 871 | t2[3] = t2[3] >> 1 | t2[4] << 31; 872 | t2[4] = t2[4] >> 1 | t2[5] << 31; 873 | t2[5] = t2[5] >> 1 | t2[6] << 31; 874 | t2[6] = t2[6] >> 1 | t2[7] << 31; 875 | t2[7] = t2[7] >> 1 | c << 31; 876 | } 877 | else if ((t1[0] & 1) == 0) 878 | { 879 | t1[0] = t1[0] >> 1 | t1[1] << 31; 880 | t1[1] = t1[1] >> 1 | t1[2] << 31; 881 | t1[2] = t1[2] >> 1 | t1[3] << 31; 882 | t1[3] = t1[3] >> 1 | t1[4] << 31; 883 | t1[4] = t1[4] >> 1 | t1[5] << 31; 884 | t1[5] = t1[5] >> 1 | t1[6] << 31; 885 | t1[6] = t1[6] >> 1 | t1[7] << 31; 886 | t1[7] = t1[7] >> 1; 887 | 888 | u32 c = 0; 889 | 890 | if (t3[0] & 1) c = add (t3, t3, p); 891 | 892 | t3[0] = t3[0] >> 1 | t3[1] << 31; 893 | t3[1] = t3[1] >> 1 | t3[2] << 31; 894 | t3[2] = t3[2] >> 1 | t3[3] << 31; 895 | t3[3] = t3[3] >> 1 | t3[4] << 31; 896 | t3[4] = t3[4] >> 1 | t3[5] << 31; 897 | t3[5] = t3[5] >> 1 | t3[6] << 31; 898 | t3[6] = t3[6] >> 1 | t3[7] << 31; 899 | t3[7] = t3[7] >> 1 | c << 31; 900 | } 901 | else 902 | { 903 | u32 gt = 0; 904 | 905 | for (int i = 7; i >= 0; i--) 906 | { 907 | if (t0[i] > t1[i]) 908 | { 909 | gt = 1; 910 | 911 | break; 912 | } 913 | 914 | if (t0[i] < t1[i]) break; 915 | } 916 | 917 | if (gt) 918 | { 919 | sub (t0, t0, t1); 920 | 921 | t0[0] = t0[0] >> 1 | t0[1] << 31; 922 | t0[1] = t0[1] >> 1 | t0[2] << 31; 923 | t0[2] = t0[2] >> 1 | 
t0[3] << 31; 924 | t0[3] = t0[3] >> 1 | t0[4] << 31; 925 | t0[4] = t0[4] >> 1 | t0[5] << 31; 926 | t0[5] = t0[5] >> 1 | t0[6] << 31; 927 | t0[6] = t0[6] >> 1 | t0[7] << 31; 928 | t0[7] = t0[7] >> 1; 929 | 930 | u32 lt = 0; 931 | 932 | for (int i = 7; i >= 0; i--) 933 | { 934 | if (t2[i] < t3[i]) 935 | { 936 | lt = 1; 937 | 938 | break; 939 | } 940 | 941 | if (t2[i] > t3[i]) break; 942 | } 943 | 944 | if (lt) add (t2, t2, p); 945 | 946 | sub (t2, t2, t3); 947 | 948 | u32 c = 0; 949 | 950 | if (t2[0] & 1) c = add (t2, t2, p); 951 | 952 | t2[0] = t2[0] >> 1 | t2[1] << 31; 953 | t2[1] = t2[1] >> 1 | t2[2] << 31; 954 | t2[2] = t2[2] >> 1 | t2[3] << 31; 955 | t2[3] = t2[3] >> 1 | t2[4] << 31; 956 | t2[4] = t2[4] >> 1 | t2[5] << 31; 957 | t2[5] = t2[5] >> 1 | t2[6] << 31; 958 | t2[6] = t2[6] >> 1 | t2[7] << 31; 959 | t2[7] = t2[7] >> 1 | c << 31; 960 | } 961 | else 962 | { 963 | sub (t1, t1, t0); 964 | 965 | t1[0] = t1[0] >> 1 | t1[1] << 31; 966 | t1[1] = t1[1] >> 1 | t1[2] << 31; 967 | t1[2] = t1[2] >> 1 | t1[3] << 31; 968 | t1[3] = t1[3] >> 1 | t1[4] << 31; 969 | t1[4] = t1[4] >> 1 | t1[5] << 31; 970 | t1[5] = t1[5] >> 1 | t1[6] << 31; 971 | t1[6] = t1[6] >> 1 | t1[7] << 31; 972 | t1[7] = t1[7] >> 1; 973 | 974 | u32 lt = 0; 975 | 976 | for (int i = 7; i >= 0; i--) 977 | { 978 | if (t3[i] < t2[i]) 979 | { 980 | lt = 1; 981 | 982 | break; 983 | } 984 | 985 | if (t3[i] > t2[i]) break; 986 | } 987 | 988 | if (lt) add (t3, t3, p); 989 | 990 | sub (t3, t3, t2); 991 | 992 | u32 c = 0; 993 | 994 | if (t3[0] & 1) c = add (t3, t3, p); 995 | 996 | t3[0] = t3[0] >> 1 | t3[1] << 31; 997 | t3[1] = t3[1] >> 1 | t3[2] << 31; 998 | t3[2] = t3[2] >> 1 | t3[3] << 31; 999 | t3[3] = t3[3] >> 1 | t3[4] << 31; 1000 | t3[4] = t3[4] >> 1 | t3[5] << 31; 1001 | t3[5] = t3[5] >> 1 | t3[6] << 31; 1002 | t3[6] = t3[6] >> 1 | t3[7] << 31; 1003 | t3[7] = t3[7] >> 1 | c << 31; 1004 | } 1005 | } 1006 | 1007 | // update b: 1008 | 1009 | b = (t0[0] != t1[0]) 1010 | | (t0[1] != t1[1]) 1011 | | (t0[2] != t1[2]) 1012 | | (t0[3] != t1[3]) 1013 | | (t0[4] != t1[4]) 1014 | | (t0[5] != t1[5]) 1015 | | (t0[6] != t1[6]) 1016 | | (t0[7] != t1[7]); 1017 | } 1018 | 1019 | // set result: 1020 | 1021 | a[0] = t2[0]; 1022 | a[1] = t2[1]; 1023 | a[2] = t2[2]; 1024 | a[3] = t2[3]; 1025 | a[4] = t2[4]; 1026 | a[5] = t2[5]; 1027 | a[6] = t2[6]; 1028 | a[7] = t2[7]; 1029 | } 1030 | 1031 | /* 1032 | // everything from the formulas below of course MOD the prime: 1033 | 1034 | // we use this formula: 1035 | 1036 | X = (3/2 * x^2)^2 - 2 * x * y^2 1037 | Y = (3/2 * x^2) * (x * y^2 - X) - y^4 1038 | Z = y * z 1039 | 1040 | this is identical to the more frequently used form: 1041 | 1042 | X = (3 * x^2)^2 - 8 * x * y^2 1043 | Y = 3 * x^2 * (4 * x * y^2 - X) - 8 * y^4 1044 | Z = 2 * y * z 1045 | */ 1046 | 1047 | DECLSPEC void point_double (PRIVATE_AS u32 *x, PRIVATE_AS u32 *y, PRIVATE_AS u32 *z) 1048 | { 1049 | // How often does this really happen? 
it should "almost" never happen (but would be safer) 1050 | 1051 | /* 1052 | if ((y[0] | y[1] | y[2] | y[3] | y[4] | y[5] | y[6] | y[7]) == 0) 1053 | { 1054 | x[0] = 0; 1055 | x[1] = 0; 1056 | x[2] = 0; 1057 | x[3] = 0; 1058 | x[4] = 0; 1059 | x[5] = 0; 1060 | x[6] = 0; 1061 | x[7] = 0; 1062 | 1063 | y[0] = 0; 1064 | y[1] = 0; 1065 | y[2] = 0; 1066 | y[3] = 0; 1067 | y[4] = 0; 1068 | y[5] = 0; 1069 | y[6] = 0; 1070 | y[7] = 0; 1071 | 1072 | z[0] = 0; 1073 | z[1] = 0; 1074 | z[2] = 0; 1075 | z[3] = 0; 1076 | z[4] = 0; 1077 | z[5] = 0; 1078 | z[6] = 0; 1079 | z[7] = 0; 1080 | 1081 | return; 1082 | } 1083 | */ 1084 | 1085 | u32 t1[8]; 1086 | 1087 | t1[0] = x[0]; 1088 | t1[1] = x[1]; 1089 | t1[2] = x[2]; 1090 | t1[3] = x[3]; 1091 | t1[4] = x[4]; 1092 | t1[5] = x[5]; 1093 | t1[6] = x[6]; 1094 | t1[7] = x[7]; 1095 | 1096 | u32 t2[8]; 1097 | 1098 | t2[0] = y[0]; 1099 | t2[1] = y[1]; 1100 | t2[2] = y[2]; 1101 | t2[3] = y[3]; 1102 | t2[4] = y[4]; 1103 | t2[5] = y[5]; 1104 | t2[6] = y[6]; 1105 | t2[7] = y[7]; 1106 | 1107 | u32 t3[8]; 1108 | 1109 | t3[0] = z[0]; 1110 | t3[1] = z[1]; 1111 | t3[2] = z[2]; 1112 | t3[3] = z[3]; 1113 | t3[4] = z[4]; 1114 | t3[5] = z[5]; 1115 | t3[6] = z[6]; 1116 | t3[7] = z[7]; 1117 | 1118 | u32 t4[8]; 1119 | u32 t5[8]; 1120 | u32 t6[8]; 1121 | 1122 | mul_mod (t4, t1, t1); // t4 = x^2 1123 | 1124 | mul_mod (t5, t2, t2); // t5 = y^2 1125 | 1126 | mul_mod (t1, t1, t5); // t1 = x*y^2 1127 | 1128 | mul_mod (t5, t5, t5); // t5 = t5^2 = y^4 1129 | 1130 | // here the z^2 and z^4 is not needed for a = 0 1131 | 1132 | mul_mod (t3, t2, t3); // t3 = x * z 1133 | 1134 | add_mod (t2, t4, t4); // t2 = 2 * t4 = 2 * x^2 1135 | add_mod (t4, t4, t2); // t4 = 3 * t4 = 3 * x^2 1136 | 1137 | // a * z^4 = 0 * 1^4 = 0 1138 | 1139 | // don't discard the least significant bit it's important too! 
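The statements that follow halve t4 = 3·x² modulo P. Because P is odd, adding P to an odd value makes it even without leaving its residue class, so the carry-aware right shift is an exact division by two mod P — that is what the warning about the least significant bit refers to. A minimal host-side sketch with small numbers (my own illustration; halve_mod () is a hypothetical helper, not a kernel function):

```c
#include <stdint.h>
#include <stdio.h>

/* halve x modulo an odd prime p: if x is odd, x + p is even and still congruent to x (mod p) */
static uint32_t halve_mod (const uint32_t x, const uint32_t p)
{
  uint64_t t = x;                 // widen so the overflow of x + p is not lost

  if (t & 1) t += p;              // the kernel does the same with add (t4, t4, t) and keeps the carry in c

  return (uint32_t) (t >> 1);     // the shift then divides exactly
}

int main (void)
{
  // 7 / 2 mod 11: halve_mod (7, 11) = (7 + 11) / 2 = 9, and indeed 2 * 9 = 18 = 7 (mod 11)
  printf ("%u\n", halve_mod (7, 11));

  return 0;
}
```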
1140 | 1141 | u32 c = 0; 1142 | 1143 | if (t4[0] & 1) 1144 | { 1145 | u32 t[8]; 1146 | 1147 | t[0] = SECP256K1_P0; 1148 | t[1] = SECP256K1_P1; 1149 | t[2] = SECP256K1_P2; 1150 | t[3] = SECP256K1_P3; 1151 | t[4] = SECP256K1_P4; 1152 | t[5] = SECP256K1_P5; 1153 | t[6] = SECP256K1_P6; 1154 | t[7] = SECP256K1_P7; 1155 | 1156 | c = add (t4, t4, t); // t4 + SECP256K1_P 1157 | } 1158 | 1159 | // right shift (t4 / 2): 1160 | 1161 | t4[0] = t4[0] >> 1 | t4[1] << 31; 1162 | t4[1] = t4[1] >> 1 | t4[2] << 31; 1163 | t4[2] = t4[2] >> 1 | t4[3] << 31; 1164 | t4[3] = t4[3] >> 1 | t4[4] << 31; 1165 | t4[4] = t4[4] >> 1 | t4[5] << 31; 1166 | t4[5] = t4[5] >> 1 | t4[6] << 31; 1167 | t4[6] = t4[6] >> 1 | t4[7] << 31; 1168 | t4[7] = t4[7] >> 1 | c << 31; 1169 | 1170 | mul_mod (t6, t4, t4); // t6 = t4^2 = (3/2 * x^2)^2 1171 | 1172 | add_mod (t2, t1, t1); // t2 = 2 * t1 1173 | 1174 | sub_mod (t6, t6, t2); // t6 = t6 - t2 1175 | sub_mod (t1, t1, t6); // t1 = t1 - t6 1176 | 1177 | mul_mod (t4, t4, t1); // t4 = t4 * t1 1178 | 1179 | sub_mod (t1, t4, t5); // t1 = t4 - t5 1180 | 1181 | // => x = t6, y = t1, z = t3: 1182 | 1183 | x[0] = t6[0]; 1184 | x[1] = t6[1]; 1185 | x[2] = t6[2]; 1186 | x[3] = t6[3]; 1187 | x[4] = t6[4]; 1188 | x[5] = t6[5]; 1189 | x[6] = t6[6]; 1190 | x[7] = t6[7]; 1191 | 1192 | y[0] = t1[0]; 1193 | y[1] = t1[1]; 1194 | y[2] = t1[2]; 1195 | y[3] = t1[3]; 1196 | y[4] = t1[4]; 1197 | y[5] = t1[5]; 1198 | y[6] = t1[6]; 1199 | y[7] = t1[7]; 1200 | 1201 | z[0] = t3[0]; 1202 | z[1] = t3[1]; 1203 | z[2] = t3[2]; 1204 | z[3] = t3[3]; 1205 | z[4] = t3[4]; 1206 | z[5] = t3[5]; 1207 | z[6] = t3[6]; 1208 | z[7] = t3[7]; 1209 | } 1210 | 1211 | /* 1212 | * madd-2004-hmv: 1213 | * (from https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html) 1214 | * t1 = z1^2 1215 | * t2 = t1*z1 1216 | * t1 = t1*x2 1217 | * t2 = t2*y2 1218 | * t1 = t1-x1 1219 | * t2 = t2-y1 1220 | * z3 = z1*t1 1221 | * t3 = t1^2 1222 | * t4 = t3*t1 1223 | * t3 = t3*x1 1224 | * t1 = 2*t3 1225 | * x3 = t2^2 1226 | * x3 = x3-t1 1227 | * x3 = x3-t4 1228 | * t3 = t3-x3 1229 | * t3 = t3*t2 1230 | * t4 = t4*y1 1231 | * y3 = t3-t4 1232 | */ 1233 | 1234 | DECLSPEC void point_add (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS u32 *z1, PRIVATE_AS u32 *x2, PRIVATE_AS u32 *y2) // z2 = 1 1235 | { 1236 | // How often does this really happen? it should "almost" never happen (but would be safer) 1237 | 1238 | /* 1239 | if ((y2[0] | y2[1] | y2[2] | y2[3] | y2[4] | y2[5] | y2[6] | y2[7]) == 0) return; 1240 | 1241 | if ((y1[0] | y1[1] | y1[2] | y1[3] | y1[4] | y1[5] | y1[6] | y1[7]) == 0) 1242 | { 1243 | x1[0] = x2[0]; 1244 | x1[1] = x2[1]; 1245 | x1[2] = x2[2]; 1246 | x1[3] = x2[3]; 1247 | x1[4] = x2[4]; 1248 | x1[5] = x2[5]; 1249 | x1[6] = x2[6]; 1250 | x1[7] = x2[7]; 1251 | 1252 | y1[0] = y2[0]; 1253 | y1[1] = y2[1]; 1254 | y1[2] = y2[2]; 1255 | y1[3] = y2[3]; 1256 | y1[4] = y2[4]; 1257 | y1[5] = y2[5]; 1258 | y1[6] = y2[6]; 1259 | y1[7] = y2[7]; 1260 | 1261 | z1[0] = z2[0]; 1262 | z1[1] = z2[1]; 1263 | z1[2] = z2[2]; 1264 | z1[3] = z2[3]; 1265 | z1[4] = z2[4]; 1266 | z1[5] = z2[5]; 1267 | z1[6] = z2[6]; 1268 | z1[7] = z2[7]; 1269 | 1270 | return; 1271 | } 1272 | */ 1273 | 1274 | // if x1 == x2 and y2 == y2 and z2 == z2 we need to double instead? 
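point_double () above and point_add () below implement these formulas in Jacobian coordinates over the 256-bit field, which makes them hard to check by eye. As the preceding comment notes, adding a point to itself is not covered by this addition formula (the slope computation degenerates); with z2 = 1 that case is x1 == x2·z1² and y1 == y2·z1³ (mod p), and a caller that cannot rule it out would branch to point_double () instead. For a pen-and-paper cross-check, here is the plain affine group law on a toy analogue of secp256k1 — the same equation y² = x³ + 7, but over F_11. This is my own standalone sketch; the field size, the point (6, 5), and the helper names are choices made for this illustration only.

```c
#include <stdio.h>

#define P 11                                     // toy field; secp256k1 uses p = 2^256 - 2^32 - 977

static int inv_mod_p (const int a)               // a^(P-2) mod P by repeated multiplication (Fermat)
{
  int r = 1;
  for (int i = 0; i < P - 2; i++) r = (r * a) % P;
  return r;
}

static void pt_double (int *x, int *y)           // affine doubling, lambda = 3x^2 / 2y (assumes y != 0)
{
  const int l  = (3 * *x % P) * *x % P * inv_mod_p (2 * *y % P) % P;
  const int x3 = ((l * l - 2 * *x) % P + 2 * P) % P;
  const int y3 = ((l * (*x - x3) - *y) % P + 2 * P) % P;
  *x = x3; *y = y3;
}

static void pt_add (int *x1, int *y1, const int x2, const int y2)  // affine add, assumes x1 != x2
{
  const int l  = (y2 - *y1 + P) % P * inv_mod_p ((x2 - *x1 + P) % P) % P;
  const int x3 = ((l * l - *x1 - x2) % P + 2 * P) % P;
  const int y3 = ((l * (*x1 - x3) - *y1) % P + 2 * P) % P;
  *x1 = x3; *y1 = y3;
}

int main (void)
{
  int x = 6, y = 5;               // (6, 5) lies on y^2 = x^3 + 7 over F_11: 25 = 3 = 216 + 7 (mod 11)

  pt_double (&x, &y);             // 2P = (3, 1)
  printf ("2P = (%d, %d)\n", x, y);

  pt_add (&x, &y, 6, 5);          // 3P = 2P + P = (5, 0)
  printf ("3P = (%d, %d)\n", x, y);

  return 0;
}
```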
1275 | 1276 | // x1/y1/z1: 1277 | 1278 | u32 t1[8]; 1279 | 1280 | t1[0] = x1[0]; 1281 | t1[1] = x1[1]; 1282 | t1[2] = x1[2]; 1283 | t1[3] = x1[3]; 1284 | t1[4] = x1[4]; 1285 | t1[5] = x1[5]; 1286 | t1[6] = x1[6]; 1287 | t1[7] = x1[7]; 1288 | 1289 | u32 t2[8]; 1290 | 1291 | t2[0] = y1[0]; 1292 | t2[1] = y1[1]; 1293 | t2[2] = y1[2]; 1294 | t2[3] = y1[3]; 1295 | t2[4] = y1[4]; 1296 | t2[5] = y1[5]; 1297 | t2[6] = y1[6]; 1298 | t2[7] = y1[7]; 1299 | 1300 | u32 t3[8]; 1301 | 1302 | t3[0] = z1[0]; 1303 | t3[1] = z1[1]; 1304 | t3[2] = z1[2]; 1305 | t3[3] = z1[3]; 1306 | t3[4] = z1[4]; 1307 | t3[5] = z1[5]; 1308 | t3[6] = z1[6]; 1309 | t3[7] = z1[7]; 1310 | 1311 | // x2/y2: 1312 | 1313 | u32 t4[8]; 1314 | 1315 | t4[0] = x2[0]; 1316 | t4[1] = x2[1]; 1317 | t4[2] = x2[2]; 1318 | t4[3] = x2[3]; 1319 | t4[4] = x2[4]; 1320 | t4[5] = x2[5]; 1321 | t4[6] = x2[6]; 1322 | t4[7] = x2[7]; 1323 | 1324 | u32 t5[8]; 1325 | 1326 | t5[0] = y2[0]; 1327 | t5[1] = y2[1]; 1328 | t5[2] = y2[2]; 1329 | t5[3] = y2[3]; 1330 | t5[4] = y2[4]; 1331 | t5[5] = y2[5]; 1332 | t5[6] = y2[6]; 1333 | t5[7] = y2[7]; 1334 | 1335 | u32 t6[8]; 1336 | u32 t7[8]; 1337 | u32 t8[8]; 1338 | u32 t9[8]; 1339 | 1340 | mul_mod (t6, t3, t3); // t6 = t3^2 1341 | 1342 | mul_mod (t7, t6, t3); // t7 = t6*t3 1343 | mul_mod (t6, t6, t4); // t6 = t6*t4 1344 | mul_mod (t7, t7, t5); // t7 = t7*t5 1345 | 1346 | sub_mod (t6, t6, t1); // t6 = t6-t1 1347 | sub_mod (t7, t7, t2); // t7 = t7-t2 1348 | 1349 | mul_mod (t8, t3, t6); // t8 = t3*t6 1350 | mul_mod (t4, t6, t6); // t4 = t6^2 1351 | mul_mod (t9, t4, t6); // t9 = t4*t6 1352 | mul_mod (t4, t4, t1); // t4 = t4*t1 1353 | 1354 | // left shift (t4 * 2): 1355 | 1356 | t6[7] = t4[7] << 1 | t4[6] >> 31; 1357 | t6[6] = t4[6] << 1 | t4[5] >> 31; 1358 | t6[5] = t4[5] << 1 | t4[4] >> 31; 1359 | t6[4] = t4[4] << 1 | t4[3] >> 31; 1360 | t6[3] = t4[3] << 1 | t4[2] >> 31; 1361 | t6[2] = t4[2] << 1 | t4[1] >> 31; 1362 | t6[1] = t4[1] << 1 | t4[0] >> 31; 1363 | t6[0] = t4[0] << 1; 1364 | 1365 | // don't discard the most significant bit, it's important too! 
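The left shift above doubles t4 into t6, and the check that follows folds a bit pushed past position 255 back in: since p = 2^256 - 2^32 - 977 (as noted in mul_mod ()), the lost 2^256 is congruent to 2^32 + 977 = 0x1000003D1 (mod p), which is exactly the two-word constant a[1] = 1, a[0] = 0x3d1 added below. The identity can be checked limb by limb on the host. The sketch below is my own; the limb values written out are simply p in little-endian u32 limbs, i.e. the values the SECP256K1_P0..P7 macros are expected to hold.

```c
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  // p = 2^256 - 2^32 - 977 in little-endian u32 limbs
  const uint32_t p[8] = { 0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff,
                          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };

  // the correction constant used in the kernel: a[0] = 0x3d1, a[1] = 1, i.e. 2^32 + 977
  const uint32_t omega[8] = { 0x000003d1, 0x00000001, 0, 0, 0, 0, 0, 0 };

  uint32_t r[8];
  uint32_t carry = 0;

  for (int i = 0; i < 8; i++)
  {
    const uint64_t s = (uint64_t) p[i] + omega[i] + carry;   // schoolbook add with carry

    r[i]  = (uint32_t) s;
    carry = (uint32_t) (s >> 32);
  }

  // every limb comes out zero and the carry out is 1: p + (2^32 + 977) = 2^256,
  // so a bit shifted past position 255 can be folded back in by adding 2^32 + 977
  for (int i = 0; i < 8; i++) printf ("r[%d] = 0x%08x\n", i, r[i]);

  printf ("carry = %u\n", carry);

  return 0;
}
```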
1366 | 1367 | if (t4[7] & 0x80000000) 1368 | { 1369 | // use most significant bit and perform mod P, since we have: t4 * 2 % P 1370 | 1371 | u32 a[8] = { 0 }; 1372 | 1373 | a[1] = 1; 1374 | a[0] = 0x000003d1; // omega (see: mul_mod ()) 1375 | 1376 | add (t6, t6, a); 1377 | } 1378 | 1379 | mul_mod (t5, t7, t7); // t5 = t7*t7 1380 | 1381 | sub_mod (t5, t5, t6); // t5 = t5-t6 1382 | sub_mod (t5, t5, t9); // t5 = t5-t9 1383 | sub_mod (t4, t4, t5); // t4 = t4-t5 1384 | 1385 | mul_mod (t4, t4, t7); // t4 = t4*t7 1386 | mul_mod (t9, t9, t2); // t9 = t9*t2 1387 | 1388 | sub_mod (t9, t4, t9); // t9 = t4-t9 1389 | 1390 | x1[0] = t5[0]; 1391 | x1[1] = t5[1]; 1392 | x1[2] = t5[2]; 1393 | x1[3] = t5[3]; 1394 | x1[4] = t5[4]; 1395 | x1[5] = t5[5]; 1396 | x1[6] = t5[6]; 1397 | x1[7] = t5[7]; 1398 | 1399 | y1[0] = t9[0]; 1400 | y1[1] = t9[1]; 1401 | y1[2] = t9[2]; 1402 | y1[3] = t9[3]; 1403 | y1[4] = t9[4]; 1404 | y1[5] = t9[5]; 1405 | y1[6] = t9[6]; 1406 | y1[7] = t9[7]; 1407 | 1408 | z1[0] = t8[0]; 1409 | z1[1] = t8[1]; 1410 | z1[2] = t8[2]; 1411 | z1[3] = t8[3]; 1412 | z1[4] = t8[4]; 1413 | z1[5] = t8[5]; 1414 | z1[6] = t8[6]; 1415 | z1[7] = t8[7]; 1416 | } 1417 | 1418 | DECLSPEC void point_get_coords (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, PRIVATE_AS const u32 *y) 1419 | { 1420 | /* 1421 | pre-compute 1/-1, 3/-3, 5/-5, 7/-7 times P (x, y) 1422 | for wNAF with window size 4 (max/min: +/- 2^3-1): -7, -5, -3, -1, 1, 3, 5, 7 1423 | 1424 | +x1 ( 0) 1425 | +y1 ( 8) 1426 | -y1 (16) 1427 | 1428 | +x3 (24) 1429 | +y3 (32) 1430 | -y3 (40) 1431 | 1432 | +x5 (48) 1433 | +y5 (56) 1434 | -y5 (64) 1435 | 1436 | +x7 (72) 1437 | +y7 (80) 1438 | -y7 (88) 1439 | */ 1440 | 1441 | // note: we use jacobian forms with (x, y, z) for computation, but affine 1442 | // (or just converted to z = 1) for storage 1443 | 1444 | // 1: 1445 | 1446 | r->xy[ 0] = x[0]; 1447 | r->xy[ 1] = x[1]; 1448 | r->xy[ 2] = x[2]; 1449 | r->xy[ 3] = x[3]; 1450 | r->xy[ 4] = x[4]; 1451 | r->xy[ 5] = x[5]; 1452 | r->xy[ 6] = x[6]; 1453 | r->xy[ 7] = x[7]; 1454 | 1455 | r->xy[ 8] = y[0]; 1456 | r->xy[ 9] = y[1]; 1457 | r->xy[10] = y[2]; 1458 | r->xy[11] = y[3]; 1459 | r->xy[12] = y[4]; 1460 | r->xy[13] = y[5]; 1461 | r->xy[14] = y[6]; 1462 | r->xy[15] = y[7]; 1463 | 1464 | // -1: 1465 | 1466 | u32 p[8]; 1467 | 1468 | p[0] = SECP256K1_P0; 1469 | p[1] = SECP256K1_P1; 1470 | p[2] = SECP256K1_P2; 1471 | p[3] = SECP256K1_P3; 1472 | p[4] = SECP256K1_P4; 1473 | p[5] = SECP256K1_P5; 1474 | p[6] = SECP256K1_P6; 1475 | p[7] = SECP256K1_P7; 1476 | 1477 | u32 neg[8]; 1478 | 1479 | neg[0] = y[0]; 1480 | neg[1] = y[1]; 1481 | neg[2] = y[2]; 1482 | neg[3] = y[3]; 1483 | neg[4] = y[4]; 1484 | neg[5] = y[5]; 1485 | neg[6] = y[6]; 1486 | neg[7] = y[7]; 1487 | 1488 | sub_mod (neg, p, neg); // -y = p - y 1489 | 1490 | r->xy[16] = neg[0]; 1491 | r->xy[17] = neg[1]; 1492 | r->xy[18] = neg[2]; 1493 | r->xy[19] = neg[3]; 1494 | r->xy[20] = neg[4]; 1495 | r->xy[21] = neg[5]; 1496 | r->xy[22] = neg[6]; 1497 | r->xy[23] = neg[7]; 1498 | 1499 | 1500 | // copy of 1: 1501 | 1502 | u32 tx[8]; 1503 | 1504 | tx[0] = x[0]; 1505 | tx[1] = x[1]; 1506 | tx[2] = x[2]; 1507 | tx[3] = x[3]; 1508 | tx[4] = x[4]; 1509 | tx[5] = x[5]; 1510 | tx[6] = x[6]; 1511 | tx[7] = x[7]; 1512 | 1513 | u32 ty[8]; 1514 | 1515 | ty[0] = y[0]; 1516 | ty[1] = y[1]; 1517 | ty[2] = y[2]; 1518 | ty[3] = y[3]; 1519 | ty[4] = y[4]; 1520 | ty[5] = y[5]; 1521 | ty[6] = y[6]; 1522 | ty[7] = y[7]; 1523 | 1524 | u32 rx[8]; 1525 | 1526 | rx[0] = x[0]; 1527 | rx[1] = x[1]; 1528 | rx[2] = x[2]; 1529 | rx[3] = 
x[3]; 1530 | rx[4] = x[4]; 1531 | rx[5] = x[5]; 1532 | rx[6] = x[6]; 1533 | rx[7] = x[7]; 1534 | 1535 | u32 ry[8]; 1536 | 1537 | ry[0] = y[0]; 1538 | ry[1] = y[1]; 1539 | ry[2] = y[2]; 1540 | ry[3] = y[3]; 1541 | ry[4] = y[4]; 1542 | ry[5] = y[5]; 1543 | ry[6] = y[6]; 1544 | ry[7] = y[7]; 1545 | 1546 | u32 rz[8] = { 0 }; 1547 | 1548 | rz[0] = 1; 1549 | 1550 | 1551 | // 3: 1552 | 1553 | point_double (rx, ry, rz); // 2 1554 | point_add (rx, ry, rz, tx, ty); // 3 1555 | 1556 | // to affine: 1557 | 1558 | inv_mod (rz); 1559 | 1560 | mul_mod (neg, rz, rz); // neg is temporary variable (z^2) 1561 | mul_mod (rx, rx, neg); 1562 | 1563 | mul_mod (rz, neg, rz); 1564 | mul_mod (ry, ry, rz); 1565 | 1566 | r->xy[24] = rx[0]; 1567 | r->xy[25] = rx[1]; 1568 | r->xy[26] = rx[2]; 1569 | r->xy[27] = rx[3]; 1570 | r->xy[28] = rx[4]; 1571 | r->xy[29] = rx[5]; 1572 | r->xy[30] = rx[6]; 1573 | r->xy[31] = rx[7]; 1574 | 1575 | r->xy[32] = ry[0]; 1576 | r->xy[33] = ry[1]; 1577 | r->xy[34] = ry[2]; 1578 | r->xy[35] = ry[3]; 1579 | r->xy[36] = ry[4]; 1580 | r->xy[37] = ry[5]; 1581 | r->xy[38] = ry[6]; 1582 | r->xy[39] = ry[7]; 1583 | 1584 | // -3: 1585 | 1586 | neg[0] = ry[0]; 1587 | neg[1] = ry[1]; 1588 | neg[2] = ry[2]; 1589 | neg[3] = ry[3]; 1590 | neg[4] = ry[4]; 1591 | neg[5] = ry[5]; 1592 | neg[6] = ry[6]; 1593 | neg[7] = ry[7]; 1594 | 1595 | sub_mod (neg, p, neg); 1596 | 1597 | r->xy[40] = neg[0]; 1598 | r->xy[41] = neg[1]; 1599 | r->xy[42] = neg[2]; 1600 | r->xy[43] = neg[3]; 1601 | r->xy[44] = neg[4]; 1602 | r->xy[45] = neg[5]; 1603 | r->xy[46] = neg[6]; 1604 | r->xy[47] = neg[7]; 1605 | 1606 | 1607 | // 5: 1608 | 1609 | rz[0] = 1; // actually we could take advantage of rz being 1 too (alternative point_add ()), 1610 | rz[1] = 0; // but it is not important because this is performed only once per "hash" 1611 | rz[2] = 0; 1612 | rz[3] = 0; 1613 | rz[4] = 0; 1614 | rz[5] = 0; 1615 | rz[6] = 0; 1616 | rz[7] = 0; 1617 | 1618 | point_add (rx, ry, rz, tx, ty); // 4 1619 | point_add (rx, ry, rz, tx, ty); // 5 1620 | 1621 | // to affine: 1622 | 1623 | inv_mod (rz); 1624 | 1625 | mul_mod (neg, rz, rz); 1626 | mul_mod (rx, rx, neg); 1627 | 1628 | mul_mod (rz, neg, rz); 1629 | mul_mod (ry, ry, rz); 1630 | 1631 | r->xy[48] = rx[0]; 1632 | r->xy[49] = rx[1]; 1633 | r->xy[50] = rx[2]; 1634 | r->xy[51] = rx[3]; 1635 | r->xy[52] = rx[4]; 1636 | r->xy[53] = rx[5]; 1637 | r->xy[54] = rx[6]; 1638 | r->xy[55] = rx[7]; 1639 | 1640 | r->xy[56] = ry[0]; 1641 | r->xy[57] = ry[1]; 1642 | r->xy[58] = ry[2]; 1643 | r->xy[59] = ry[3]; 1644 | r->xy[60] = ry[4]; 1645 | r->xy[61] = ry[5]; 1646 | r->xy[62] = ry[6]; 1647 | r->xy[63] = ry[7]; 1648 | 1649 | // -5: 1650 | 1651 | neg[0] = ry[0]; 1652 | neg[1] = ry[1]; 1653 | neg[2] = ry[2]; 1654 | neg[3] = ry[3]; 1655 | neg[4] = ry[4]; 1656 | neg[5] = ry[5]; 1657 | neg[6] = ry[6]; 1658 | neg[7] = ry[7]; 1659 | 1660 | sub_mod (neg, p, neg); 1661 | 1662 | r->xy[64] = neg[0]; 1663 | r->xy[65] = neg[1]; 1664 | r->xy[66] = neg[2]; 1665 | r->xy[67] = neg[3]; 1666 | r->xy[68] = neg[4]; 1667 | r->xy[69] = neg[5]; 1668 | r->xy[70] = neg[6]; 1669 | r->xy[71] = neg[7]; 1670 | 1671 | 1672 | // 7: 1673 | 1674 | rz[0] = 1; 1675 | rz[1] = 0; 1676 | rz[2] = 0; 1677 | rz[3] = 0; 1678 | rz[4] = 0; 1679 | rz[5] = 0; 1680 | rz[6] = 0; 1681 | rz[7] = 0; 1682 | 1683 | point_add (rx, ry, rz, tx, ty); // 6 1684 | point_add (rx, ry, rz, tx, ty); // 7 1685 | 1686 | // to affine: 1687 | 1688 | inv_mod (rz); 1689 | 1690 | mul_mod (neg, rz, rz); 1691 | mul_mod (rx, rx, neg); 1692 | 1693 | mul_mod (rz, neg, rz); 1694 | 
mul_mod (ry, ry, rz); 1695 | 1696 | r->xy[72] = rx[0]; 1697 | r->xy[73] = rx[1]; 1698 | r->xy[74] = rx[2]; 1699 | r->xy[75] = rx[3]; 1700 | r->xy[76] = rx[4]; 1701 | r->xy[77] = rx[5]; 1702 | r->xy[78] = rx[6]; 1703 | r->xy[79] = rx[7]; 1704 | 1705 | r->xy[80] = ry[0]; 1706 | r->xy[81] = ry[1]; 1707 | r->xy[82] = ry[2]; 1708 | r->xy[83] = ry[3]; 1709 | r->xy[84] = ry[4]; 1710 | r->xy[85] = ry[5]; 1711 | r->xy[86] = ry[6]; 1712 | r->xy[87] = ry[7]; 1713 | 1714 | // -7: 1715 | 1716 | neg[0] = ry[0]; 1717 | neg[1] = ry[1]; 1718 | neg[2] = ry[2]; 1719 | neg[3] = ry[3]; 1720 | neg[4] = ry[4]; 1721 | neg[5] = ry[5]; 1722 | neg[6] = ry[6]; 1723 | neg[7] = ry[7]; 1724 | 1725 | sub_mod (neg, p, neg); 1726 | 1727 | r->xy[88] = neg[0]; 1728 | r->xy[89] = neg[1]; 1729 | r->xy[90] = neg[2]; 1730 | r->xy[91] = neg[3]; 1731 | r->xy[92] = neg[4]; 1732 | r->xy[93] = neg[5]; 1733 | r->xy[94] = neg[6]; 1734 | r->xy[95] = neg[7]; 1735 | } 1736 | 1737 | /* 1738 | * Convert the tweak/scalar k to w-NAF (window size is 4). 1739 | * @param naf out: w-NAF form of the tweak/scalar, a pointer to an u32 array with a size of 33. 1740 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1741 | * @return Returns the loop start index. 1742 | */ 1743 | DECLSPEC int convert_to_window_naf (PRIVATE_AS u32 *naf, PRIVATE_AS const u32 *k) 1744 | { 1745 | int loop_start = 0; 1746 | 1747 | u32 n[9]; 1748 | 1749 | n[0] = 0; // we need this extra slot sometimes for the subtraction to work 1750 | n[1] = k[7]; 1751 | n[2] = k[6]; 1752 | n[3] = k[5]; 1753 | n[4] = k[4]; 1754 | n[5] = k[3]; 1755 | n[6] = k[2]; 1756 | n[7] = k[1]; 1757 | n[8] = k[0]; 1758 | 1759 | for (int i = 0; i <= 256; i++) 1760 | { 1761 | if (n[8] & 1) 1762 | { 1763 | // for window size w = 4: 1764 | // => 2^(w-0) = 2^4 = 16 (0x10) 1765 | // => 2^(w-1) = 2^3 = 8 (0x08) 1766 | 1767 | int diff = n[8] & 0x0f; // n % 2^w == n & (2^w - 1) 1768 | 1769 | // convert diff to val according to this table: 1770 | // 1 -> +1 -> 1 1771 | // 3 -> +3 -> 3 1772 | // 5 -> +5 -> 5 1773 | // 7 -> +7 -> 7 1774 | // 9 -> -7 -> 8 1775 | // 11 -> -5 -> 6 1776 | // 13 -> -3 -> 4 1777 | // 15 -> -1 -> 2 1778 | 1779 | int val = diff; 1780 | 1781 | if (diff >= 0x08) 1782 | { 1783 | diff -= 0x10; 1784 | 1785 | val = 0x11 - val; 1786 | } 1787 | 1788 | naf[i >> 3] |= val << ((i & 7) << 2); 1789 | 1790 | u32 t = n[8]; // t is the (temporary) old/unmodified value 1791 | 1792 | n[8] -= diff; 1793 | 1794 | // we need to take care of the carry/borrow: 1795 | 1796 | u32 k = 8; 1797 | 1798 | if (diff > 0) 1799 | { 1800 | while (n[k] > t) // overflow propagation 1801 | { 1802 | if (k == 0) break; // needed ? 
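convert_to_window_naf () packs one signed 4-bit digit per processed bit position and propagates the borrow/carry across the nine limbs by hand. The same conversion is easier to see on an ordinary 32-bit scalar. The host-side sketch below is my own: it keeps the digits as plain signed ints instead of the packed 4-bit encoding, and finishes with a reconstruction check; the scalar 0xC3A7 is an arbitrary test value.

```c
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  const uint32_t k = 0xC3A7;     // arbitrary test scalar

  int      digits[40] = { 0 };   // signed w-NAF digits in { 0, +-1, +-3, +-5, +-7 }, one per bit position
  int      hi = 0;               // highest used position (the kernel's loop_start)
  uint64_t n  = k;

  for (int i = 0; n != 0; i++, n >>= 1)
  {
    if (n & 1)
    {
      int d = (int) (n & 0x0f);  // w = 4: look at the low 4 bits

      if (d >= 8) d -= 16;       // 9, 11, 13, 15 become -7, -5, -3, -1

      digits[i] = d;

      if (d > 0) n -= (uint64_t)   d;   // clears the low 4 bits ...
      else       n += (uint64_t) (-d);  // ... so the next 3 iterations see zeros

      hi = i;
    }
  }

  // reconstruct k = sum (digits[i] * 2^i), scanning from the top like point_mul_xy () does:

  int64_t check = 0;

  for (int i = hi; i >= 0; i--) check = 2 * check + digits[i];

  printf ("k = 0x%X  w-NAF check = 0x%llX\n", k, (unsigned long long) check);

  return 0;
}
```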
1803 | 1804 | k--; 1805 | 1806 | t = n[k]; 1807 | 1808 | n[k]--; 1809 | } 1810 | } 1811 | else // if (diff < 0) 1812 | { 1813 | while (t > n[k]) // overflow propagation 1814 | { 1815 | if (k == 0) break; 1816 | 1817 | k--; 1818 | 1819 | t = n[k]; 1820 | 1821 | n[k]++; 1822 | } 1823 | } 1824 | 1825 | // update start: 1826 | 1827 | loop_start = i; 1828 | } 1829 | 1830 | // n = n / 2: 1831 | 1832 | n[8] = n[8] >> 1 | n[7] << 31; 1833 | n[7] = n[7] >> 1 | n[6] << 31; 1834 | n[6] = n[6] >> 1 | n[5] << 31; 1835 | n[5] = n[5] >> 1 | n[4] << 31; 1836 | n[4] = n[4] >> 1 | n[3] << 31; 1837 | n[3] = n[3] >> 1 | n[2] << 31; 1838 | n[2] = n[2] >> 1 | n[1] << 31; 1839 | n[1] = n[1] >> 1 | n[0] << 31; 1840 | n[0] = n[0] >> 1; 1841 | } 1842 | 1843 | return loop_start; 1844 | } 1845 | 1846 | /* 1847 | * @param x1 out: x coordinate, a pointer to an u32 array with a size of 8. 1848 | * @param y1 out: y coordinate, a pointer to an u32 array with a size of 8. 1849 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1850 | * @param tmps in: a basepoint for the multiplication. 1851 | * @return Returns the x coordinate with a leading parity/sign (for odd/even y), it is named a compressed coordinate. 1852 | */ 1853 | DECLSPEC void point_mul_xy (PRIVATE_AS u32 *x1, PRIVATE_AS u32 *y1, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps) 1854 | { 1855 | u32 naf[SECP256K1_NAF_SIZE] = { 0 }; 1856 | 1857 | int loop_start = convert_to_window_naf (naf, k); 1858 | 1859 | // first set: 1860 | 1861 | const u32 multiplier = (naf[loop_start >> 3] >> ((loop_start & 7) << 2)) & 0x0f; // or use u8 ? 1862 | 1863 | const u32 odd = multiplier & 1; 1864 | 1865 | const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24; 1866 | const u32 y_pos = odd ? (x_pos + 8) : (x_pos + 16); 1867 | 1868 | 1869 | x1[0] = tmps->xy[x_pos + 0]; 1870 | x1[1] = tmps->xy[x_pos + 1]; 1871 | x1[2] = tmps->xy[x_pos + 2]; 1872 | x1[3] = tmps->xy[x_pos + 3]; 1873 | x1[4] = tmps->xy[x_pos + 4]; 1874 | x1[5] = tmps->xy[x_pos + 5]; 1875 | x1[6] = tmps->xy[x_pos + 6]; 1876 | x1[7] = tmps->xy[x_pos + 7]; 1877 | 1878 | y1[0] = tmps->xy[y_pos + 0]; 1879 | y1[1] = tmps->xy[y_pos + 1]; 1880 | y1[2] = tmps->xy[y_pos + 2]; 1881 | y1[3] = tmps->xy[y_pos + 3]; 1882 | y1[4] = tmps->xy[y_pos + 4]; 1883 | y1[5] = tmps->xy[y_pos + 5]; 1884 | y1[6] = tmps->xy[y_pos + 6]; 1885 | y1[7] = tmps->xy[y_pos + 7]; 1886 | 1887 | u32 z1[8] = { 0 }; 1888 | 1889 | z1[0] = 1; 1890 | 1891 | /* 1892 | * Start: 1893 | */ 1894 | 1895 | // main loop (left-to-right binary algorithm): 1896 | 1897 | for (int pos = loop_start - 1; pos >= 0; pos--) // -1 because we've set/add the point already 1898 | { 1899 | // always double: 1900 | 1901 | point_double (x1, y1, z1); 1902 | 1903 | // add only if needed: 1904 | 1905 | const u32 multiplier = (naf[pos >> 3] >> ((pos & 7) << 2)) & 0x0f; 1906 | 1907 | if (multiplier) 1908 | { 1909 | /* 1910 | m -> y | y = ((m - (m & 1)) / 2) * 24 1911 | ---------------------------------- 1912 | 1 -> 0 | 1/2 * 24 = 0 1913 | 2 -> 16 1914 | 3 -> 24 | 3/2 * 24 = 24 1915 | 4 -> 40 1916 | 5 -> 48 | 5/2 * 24 = 2*24 1917 | 6 -> 64 1918 | 7 -> 72 | 7/2 * 24 = 3*24 1919 | 8 -> 88 1920 | */ 1921 | 1922 | const u32 odd = multiplier & 1; 1923 | 1924 | const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24; 1925 | const u32 y_pos = odd ? 
(x_pos + 8) : (x_pos + 16); 1926 | 1927 | u32 x2[8]; 1928 | 1929 | x2[0] = tmps->xy[x_pos + 0]; 1930 | x2[1] = tmps->xy[x_pos + 1]; 1931 | x2[2] = tmps->xy[x_pos + 2]; 1932 | x2[3] = tmps->xy[x_pos + 3]; 1933 | x2[4] = tmps->xy[x_pos + 4]; 1934 | x2[5] = tmps->xy[x_pos + 5]; 1935 | x2[6] = tmps->xy[x_pos + 6]; 1936 | x2[7] = tmps->xy[x_pos + 7]; 1937 | 1938 | u32 y2[8]; 1939 | 1940 | y2[0] = tmps->xy[y_pos + 0]; 1941 | y2[1] = tmps->xy[y_pos + 1]; 1942 | y2[2] = tmps->xy[y_pos + 2]; 1943 | y2[3] = tmps->xy[y_pos + 3]; 1944 | y2[4] = tmps->xy[y_pos + 4]; 1945 | y2[5] = tmps->xy[y_pos + 5]; 1946 | y2[6] = tmps->xy[y_pos + 6]; 1947 | y2[7] = tmps->xy[y_pos + 7]; 1948 | 1949 | // (x1, y1, z1) + multiplier * (x, y, z) = (x1, y1, z1) + (x2, y2, z2) 1950 | 1951 | point_add (x1, y1, z1, x2, y2); 1952 | 1953 | // optimization (there can't be any adds after an add for w-1 times): 1954 | // (but it seems to be faster without this manipulation of "pos") 1955 | 1956 | //for (u32 i = 0; i < 3; i++) 1957 | //{ 1958 | // if (pos == 0) break; 1959 | // point_double (x1, y1, z1); 1960 | // pos--; 1961 | //} 1962 | } 1963 | } 1964 | 1965 | 1966 | /* 1967 | * Get the corresponding affine coordinates x/y: 1968 | * 1969 | * Note: 1970 | * x1_affine = x1_jacobian / z1^2 = x1_jacobian * z1_inv^2 1971 | * y1_affine = y1_jacobian / z1^2 = y1_jacobian * z1_inv^2 1972 | * 1973 | */ 1974 | 1975 | inv_mod (z1); 1976 | 1977 | u32 z2[8]; 1978 | 1979 | mul_mod (z2, z1, z1); // z1^2 1980 | mul_mod (x1, x1, z2); // x1_affine 1981 | 1982 | mul_mod (z1, z2, z1); // z1^3 1983 | mul_mod (y1, y1, z1); // y1_affine 1984 | 1985 | // return values are already in x1 and y1 1986 | } 1987 | 1988 | /* 1989 | * @param r out: x coordinate with leading parity/sign (for odd/even y), a pointer to an u32 array with a size of 9. 1990 | * @param k in: tweak/scalar which should be converted, a pointer to an u32 array with a size of 8. 1991 | * @param tmps in: a basepoint for the multiplication. 1992 | * @return Returns the x coordinate with a leading parity/sign (for odd/even y), it is named a compressed coordinate. 1993 | */ 1994 | DECLSPEC void point_mul (PRIVATE_AS u32 *r, PRIVATE_AS const u32 *k, SECP256K1_TMPS_TYPE const secp256k1_t *tmps) 1995 | { 1996 | u32 x[8]; 1997 | u32 y[8]; 1998 | 1999 | point_mul_xy (x, y, k, tmps); 2000 | 2001 | /* 2002 | * output: 2003 | */ 2004 | 2005 | // shift by 1 byte (8 bits) to make room and add the parity/sign (for odd/even y): 2006 | 2007 | r[8] = (x[0] << 24); 2008 | r[7] = (x[0] >> 8) | (x[1] << 24); 2009 | r[6] = (x[1] >> 8) | (x[2] << 24); 2010 | r[5] = (x[2] >> 8) | (x[3] << 24); 2011 | r[4] = (x[3] >> 8) | (x[4] << 24); 2012 | r[3] = (x[4] >> 8) | (x[5] << 24); 2013 | r[2] = (x[5] >> 8) | (x[6] << 24); 2014 | r[1] = (x[6] >> 8) | (x[7] << 24); 2015 | r[0] = (x[7] >> 8); 2016 | 2017 | const u32 type = 0x02 | (y[0] & 1); // (note: 0b10 | 0b01 = 0x03) 2018 | 2019 | r[0] = r[0] | type << 24; // 0x02 or 0x03 2020 | } 2021 | 2022 | /* 2023 | * Transform a x coordinate and separate parity to secp256k1_t. 2024 | * @param r out: x and y coordinates. 2025 | * @param x in: x coordinate which should be converted, a pointer to an u32 array with a size of 8. 2026 | * @param first_byte in: The parity of the y coordinate, a u32. 2027 | * @return Returns 0 if successfull, returns 1 if x is greater than the basepoint. 
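transform_public () below decompresses a public key: it checks that x is a valid field element, forms y² = x³ + 7, takes the square root with sqrt_mod () — which relies on p ≡ 3 (mod 4), so sqrt(a) = a^((p+1)/4) — and flips to p - y if the parity does not match the requested one. The same decompression on the toy curve y² = x³ + 7 over F_11 (11 ≡ 3 mod 4 as well) fits in a few lines; this is my own standalone sketch with hypothetical helper names.

```c
#include <stdio.h>

#define P 11                               // toy prime with P % 4 == 3, like secp256k1's p

static int pow_mod (const int b, const int e)   // b^e mod P, small and slow on purpose
{
  int r = 1;
  for (int i = 0; i < e; i++) r = (r * b) % P;
  return r;
}

int main (void)
{
  const int x      = 6;                    // compressed input: the x coordinate ...
  const int parity = 1;                    // ... and the requested parity of y (1 = odd, as with a 0x03 prefix)

  const int y2 = (pow_mod (x, 3) + 7) % P; // y^2 = x^3 + 7, as in transform_public ()

  int y = pow_mod (y2, (P + 1) / 4);       // square root for P % 4 == 3, the trick sqrt_mod () uses

  if ((y & 1) != parity) y = P - y;        // wrong parity: take the other root, y -> p - y

  printf ("x = %d, y = %d, y^2 = %d (expected %d)\n", x, y, (y * y) % P, y2);

  return 0;
}
```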
2028 | */ 2029 | DECLSPEC u32 transform_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *x, const u32 first_byte) 2030 | { 2031 | u32 p[8]; 2032 | 2033 | p[0] = SECP256K1_P0; 2034 | p[1] = SECP256K1_P1; 2035 | p[2] = SECP256K1_P2; 2036 | p[3] = SECP256K1_P3; 2037 | p[4] = SECP256K1_P4; 2038 | p[5] = SECP256K1_P5; 2039 | p[6] = SECP256K1_P6; 2040 | p[7] = SECP256K1_P7; 2041 | 2042 | // x must be smaller than p (because of y ^ 2 = x ^ 3 % p) 2043 | 2044 | for (int i = 7; i >= 0; i--) 2045 | { 2046 | if (x[i] < p[i]) break; 2047 | if (x[i] > p[i]) return 1; 2048 | } 2049 | 2050 | 2051 | // get y^2 = x^3 + 7: 2052 | 2053 | u32 b[8] = { 0 }; 2054 | 2055 | b[0] = SECP256K1_B; 2056 | 2057 | u32 y[8]; 2058 | 2059 | mul_mod (y, x, x); 2060 | mul_mod (y, y, x); 2061 | add_mod (y, y, b); 2062 | 2063 | // get y = sqrt (y^2): 2064 | 2065 | sqrt_mod (y); 2066 | 2067 | // check if it's of the correct parity that we want (odd/even): 2068 | 2069 | if ((first_byte & 1) != (y[0] & 1)) 2070 | { 2071 | // y2 = p - y1 (or y2 = y1 * -1) 2072 | 2073 | sub_mod (y, p, y); 2074 | } 2075 | 2076 | // get xy: 2077 | 2078 | point_get_coords (r, x, y); 2079 | 2080 | return 0; 2081 | } 2082 | 2083 | /* 2084 | * Parse a x coordinate with leading parity to secp256k1_t. 2085 | * @param r out: x and y coordinates. 2086 | * @param k in: x coordinate which should be converted with leading parity, a pointer to an u32 array with a size of 9. 2087 | * @return Returns 0 if successfull, returns 1 if x is greater than the basepoint or the parity has an unexpected value. 2088 | */ 2089 | DECLSPEC u32 parse_public (PRIVATE_AS secp256k1_t *r, PRIVATE_AS const u32 *k) 2090 | { 2091 | // verify: 2092 | 2093 | const u32 first_byte = k[0] & 0xff; 2094 | 2095 | if ((first_byte != '\x02') && (first_byte != '\x03')) 2096 | { 2097 | return 1; 2098 | } 2099 | 2100 | // load k into x without the first byte: 2101 | 2102 | u32 x[8]; 2103 | 2104 | x[0] = (k[7] & 0xff00) << 16 | (k[7] & 0xff0000) | (k[7] & 0xff000000) >> 16 | (k[8] & 0xff); 2105 | x[1] = (k[6] & 0xff00) << 16 | (k[6] & 0xff0000) | (k[6] & 0xff000000) >> 16 | (k[7] & 0xff); 2106 | x[2] = (k[5] & 0xff00) << 16 | (k[5] & 0xff0000) | (k[5] & 0xff000000) >> 16 | (k[6] & 0xff); 2107 | x[3] = (k[4] & 0xff00) << 16 | (k[4] & 0xff0000) | (k[4] & 0xff000000) >> 16 | (k[5] & 0xff); 2108 | x[4] = (k[3] & 0xff00) << 16 | (k[3] & 0xff0000) | (k[3] & 0xff000000) >> 16 | (k[4] & 0xff); 2109 | x[5] = (k[2] & 0xff00) << 16 | (k[2] & 0xff0000) | (k[2] & 0xff000000) >> 16 | (k[3] & 0xff); 2110 | x[6] = (k[1] & 0xff00) << 16 | (k[1] & 0xff0000) | (k[1] & 0xff000000) >> 16 | (k[2] & 0xff); 2111 | x[7] = (k[0] & 0xff00) << 16 | (k[0] & 0xff0000) | (k[0] & 0xff000000) >> 16 | (k[1] & 0xff); 2112 | 2113 | return transform_public (r, x, first_byte); 2114 | } 2115 | 2116 | 2117 | /* 2118 | * Set precomputed values of the basepoint g to a secp256k1 structure. 2119 | * @param r out: x and y coordinates. 
pre-computed points: (x1,y1,-y1),(x3,y3,-y3),(x5,y5,-y5),(x7,y7,-y7) 2120 | */ 2121 | DECLSPEC void set_precomputed_basepoint_g (PRIVATE_AS secp256k1_t *r) 2122 | { 2123 | // x1 2124 | r->xy[ 0] = SECP256K1_G_PRE_COMPUTED_00; 2125 | r->xy[ 1] = SECP256K1_G_PRE_COMPUTED_01; 2126 | r->xy[ 2] = SECP256K1_G_PRE_COMPUTED_02; 2127 | r->xy[ 3] = SECP256K1_G_PRE_COMPUTED_03; 2128 | r->xy[ 4] = SECP256K1_G_PRE_COMPUTED_04; 2129 | r->xy[ 5] = SECP256K1_G_PRE_COMPUTED_05; 2130 | r->xy[ 6] = SECP256K1_G_PRE_COMPUTED_06; 2131 | r->xy[ 7] = SECP256K1_G_PRE_COMPUTED_07; 2132 | 2133 | // y1 2134 | r->xy[ 8] = SECP256K1_G_PRE_COMPUTED_08; 2135 | r->xy[ 9] = SECP256K1_G_PRE_COMPUTED_09; 2136 | r->xy[10] = SECP256K1_G_PRE_COMPUTED_10; 2137 | r->xy[11] = SECP256K1_G_PRE_COMPUTED_11; 2138 | r->xy[12] = SECP256K1_G_PRE_COMPUTED_12; 2139 | r->xy[13] = SECP256K1_G_PRE_COMPUTED_13; 2140 | r->xy[14] = SECP256K1_G_PRE_COMPUTED_14; 2141 | r->xy[15] = SECP256K1_G_PRE_COMPUTED_15; 2142 | 2143 | // -y1 2144 | r->xy[16] = SECP256K1_G_PRE_COMPUTED_16; 2145 | r->xy[17] = SECP256K1_G_PRE_COMPUTED_17; 2146 | r->xy[18] = SECP256K1_G_PRE_COMPUTED_18; 2147 | r->xy[19] = SECP256K1_G_PRE_COMPUTED_19; 2148 | r->xy[20] = SECP256K1_G_PRE_COMPUTED_20; 2149 | r->xy[21] = SECP256K1_G_PRE_COMPUTED_21; 2150 | r->xy[22] = SECP256K1_G_PRE_COMPUTED_22; 2151 | r->xy[23] = SECP256K1_G_PRE_COMPUTED_23; 2152 | 2153 | // x3 2154 | r->xy[24] = SECP256K1_G_PRE_COMPUTED_24; 2155 | r->xy[25] = SECP256K1_G_PRE_COMPUTED_25; 2156 | r->xy[26] = SECP256K1_G_PRE_COMPUTED_26; 2157 | r->xy[27] = SECP256K1_G_PRE_COMPUTED_27; 2158 | r->xy[28] = SECP256K1_G_PRE_COMPUTED_28; 2159 | r->xy[29] = SECP256K1_G_PRE_COMPUTED_29; 2160 | r->xy[30] = SECP256K1_G_PRE_COMPUTED_30; 2161 | r->xy[31] = SECP256K1_G_PRE_COMPUTED_31; 2162 | 2163 | // y3 2164 | r->xy[32] = SECP256K1_G_PRE_COMPUTED_32; 2165 | r->xy[33] = SECP256K1_G_PRE_COMPUTED_33; 2166 | r->xy[34] = SECP256K1_G_PRE_COMPUTED_34; 2167 | r->xy[35] = SECP256K1_G_PRE_COMPUTED_35; 2168 | r->xy[36] = SECP256K1_G_PRE_COMPUTED_36; 2169 | r->xy[37] = SECP256K1_G_PRE_COMPUTED_37; 2170 | r->xy[38] = SECP256K1_G_PRE_COMPUTED_38; 2171 | r->xy[39] = SECP256K1_G_PRE_COMPUTED_39; 2172 | 2173 | // -y3 2174 | r->xy[40] = SECP256K1_G_PRE_COMPUTED_40; 2175 | r->xy[41] = SECP256K1_G_PRE_COMPUTED_41; 2176 | r->xy[42] = SECP256K1_G_PRE_COMPUTED_42; 2177 | r->xy[43] = SECP256K1_G_PRE_COMPUTED_43; 2178 | r->xy[44] = SECP256K1_G_PRE_COMPUTED_44; 2179 | r->xy[45] = SECP256K1_G_PRE_COMPUTED_45; 2180 | r->xy[46] = SECP256K1_G_PRE_COMPUTED_46; 2181 | r->xy[47] = SECP256K1_G_PRE_COMPUTED_47; 2182 | 2183 | // x5 2184 | r->xy[48] = SECP256K1_G_PRE_COMPUTED_48; 2185 | r->xy[49] = SECP256K1_G_PRE_COMPUTED_49; 2186 | r->xy[50] = SECP256K1_G_PRE_COMPUTED_50; 2187 | r->xy[51] = SECP256K1_G_PRE_COMPUTED_51; 2188 | r->xy[52] = SECP256K1_G_PRE_COMPUTED_52; 2189 | r->xy[53] = SECP256K1_G_PRE_COMPUTED_53; 2190 | r->xy[54] = SECP256K1_G_PRE_COMPUTED_54; 2191 | r->xy[55] = SECP256K1_G_PRE_COMPUTED_55; 2192 | 2193 | // y5 2194 | r->xy[56] = SECP256K1_G_PRE_COMPUTED_56; 2195 | r->xy[57] = SECP256K1_G_PRE_COMPUTED_57; 2196 | r->xy[58] = SECP256K1_G_PRE_COMPUTED_58; 2197 | r->xy[59] = SECP256K1_G_PRE_COMPUTED_59; 2198 | r->xy[60] = SECP256K1_G_PRE_COMPUTED_60; 2199 | r->xy[61] = SECP256K1_G_PRE_COMPUTED_61; 2200 | r->xy[62] = SECP256K1_G_PRE_COMPUTED_62; 2201 | r->xy[63] = SECP256K1_G_PRE_COMPUTED_63; 2202 | 2203 | // -y5 2204 | r->xy[64] = SECP256K1_G_PRE_COMPUTED_64; 2205 | r->xy[65] = SECP256K1_G_PRE_COMPUTED_65; 2206 | r->xy[66] = 
SECP256K1_G_PRE_COMPUTED_66; 2207 | r->xy[67] = SECP256K1_G_PRE_COMPUTED_67; 2208 | r->xy[68] = SECP256K1_G_PRE_COMPUTED_68; 2209 | r->xy[69] = SECP256K1_G_PRE_COMPUTED_69; 2210 | r->xy[70] = SECP256K1_G_PRE_COMPUTED_70; 2211 | r->xy[71] = SECP256K1_G_PRE_COMPUTED_71; 2212 | 2213 | // x7 2214 | r->xy[72] = SECP256K1_G_PRE_COMPUTED_72; 2215 | r->xy[73] = SECP256K1_G_PRE_COMPUTED_73; 2216 | r->xy[74] = SECP256K1_G_PRE_COMPUTED_74; 2217 | r->xy[75] = SECP256K1_G_PRE_COMPUTED_75; 2218 | r->xy[76] = SECP256K1_G_PRE_COMPUTED_76; 2219 | r->xy[77] = SECP256K1_G_PRE_COMPUTED_77; 2220 | r->xy[78] = SECP256K1_G_PRE_COMPUTED_78; 2221 | r->xy[79] = SECP256K1_G_PRE_COMPUTED_79; 2222 | 2223 | // y7 2224 | r->xy[80] = SECP256K1_G_PRE_COMPUTED_80; 2225 | r->xy[81] = SECP256K1_G_PRE_COMPUTED_81; 2226 | r->xy[82] = SECP256K1_G_PRE_COMPUTED_82; 2227 | r->xy[83] = SECP256K1_G_PRE_COMPUTED_83; 2228 | r->xy[84] = SECP256K1_G_PRE_COMPUTED_84; 2229 | r->xy[85] = SECP256K1_G_PRE_COMPUTED_85; 2230 | r->xy[86] = SECP256K1_G_PRE_COMPUTED_86; 2231 | r->xy[87] = SECP256K1_G_PRE_COMPUTED_87; 2232 | 2233 | // -y7 2234 | r->xy[88] = SECP256K1_G_PRE_COMPUTED_88; 2235 | r->xy[89] = SECP256K1_G_PRE_COMPUTED_89; 2236 | r->xy[90] = SECP256K1_G_PRE_COMPUTED_90; 2237 | r->xy[91] = SECP256K1_G_PRE_COMPUTED_91; 2238 | r->xy[92] = SECP256K1_G_PRE_COMPUTED_92; 2239 | r->xy[93] = SECP256K1_G_PRE_COMPUTED_93; 2240 | r->xy[94] = SECP256K1_G_PRE_COMPUTED_94; 2241 | r->xy[95] = SECP256K1_G_PRE_COMPUTED_95; 2242 | } 2243 | -------------------------------------------------------------------------------- /secp256k1/inc_types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author......: See docs/credits.txt 3 | * License.....: MIT 4 | */ 5 | 6 | #ifndef INC_TYPES_H 7 | #define INC_TYPES_H 8 | 9 | #if ATTACK_MODE == 9 10 | #define BITMAP_MASK kernel_param->bitmap_mask 11 | #define BITMAP_SHIFT1 kernel_param->bitmap_shift1 12 | #define BITMAP_SHIFT2 kernel_param->bitmap_shift2 13 | #define SALT_POS_HOST (kernel_param->pws_pos + gid) 14 | #define LOOP_POS kernel_param->loop_pos 15 | #define LOOP_CNT kernel_param->loop_cnt 16 | #define IL_CNT kernel_param->il_cnt 17 | #define DIGESTS_CNT 1 18 | #define DIGESTS_OFFSET_HOST (kernel_param->pws_pos + gid) 19 | #define COMBS_MODE kernel_param->combs_mode 20 | #define SALT_REPEAT kernel_param->salt_repeat 21 | #define PWS_POS kernel_param->pws_pos 22 | #define GID_CNT kernel_param->gid_max 23 | #else 24 | #define BITMAP_MASK kernel_param->bitmap_mask 25 | #define BITMAP_SHIFT1 kernel_param->bitmap_shift1 26 | #define BITMAP_SHIFT2 kernel_param->bitmap_shift2 27 | #define SALT_POS_HOST kernel_param->salt_pos_host 28 | #define LOOP_POS kernel_param->loop_pos 29 | #define LOOP_CNT kernel_param->loop_cnt 30 | #define IL_CNT kernel_param->il_cnt 31 | #define DIGESTS_CNT kernel_param->digests_cnt 32 | #define DIGESTS_OFFSET_HOST kernel_param->digests_offset_host 33 | #define COMBS_MODE kernel_param->combs_mode 34 | #define SALT_REPEAT kernel_param->salt_repeat 35 | #define PWS_POS kernel_param->pws_pos 36 | #define GID_CNT kernel_param->gid_max 37 | #endif 38 | 39 | #ifdef IS_CUDA 40 | // https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size 41 | typedef unsigned char uchar; 42 | typedef unsigned short ushort; 43 | typedef unsigned int uint; 44 | typedef unsigned long ulong; 45 | typedef unsigned long long ullong; 46 | #endif 47 | 48 | #ifdef IS_METAL 49 | typedef unsigned char uchar; 50 | typedef unsigned short ushort; 51 | typedef unsigned int uint; 52 
| typedef unsigned long ulong; 53 | #define ullong ulong 54 | #endif 55 | 56 | #ifdef IS_OPENCL 57 | typedef ulong ullong; 58 | typedef ulong2 ullong2; 59 | typedef ulong4 ullong4; 60 | typedef ulong8 ullong8; 61 | typedef ulong16 ullong16; 62 | #endif 63 | 64 | #ifdef KERNEL_STATIC 65 | typedef uchar u8; 66 | typedef ushort u16; 67 | typedef uint u32; 68 | #ifdef IS_METAL 69 | typedef ulong u64; 70 | #else 71 | typedef ullong u64; 72 | #endif 73 | #else 74 | typedef uint8_t u8; 75 | typedef uint16_t u16; 76 | typedef uint32_t u32; 77 | typedef uint64_t u64; 78 | #endif 79 | 80 | //testwise disabled 81 | //typedef u8 u8a __attribute__ ((aligned (8))); 82 | //typedef u16 u16a __attribute__ ((aligned (8))); 83 | //typedef u32 u32a __attribute__ ((aligned (8))); 84 | //typedef u64 u64a __attribute__ ((aligned (8))); 85 | 86 | typedef u8 u8a; 87 | typedef u16 u16a; 88 | typedef u32 u32a; 89 | typedef u64 u64a; 90 | 91 | #ifndef NEW_SIMD_CODE 92 | #undef VECT_SIZE 93 | #define VECT_SIZE 1 94 | #endif 95 | 96 | #define CONCAT(a, b) a##b 97 | #define VTYPE(type, width) CONCAT(type, width) 98 | 99 | // emulated is always VECT_SIZE = 1 100 | #if VECT_SIZE == 1 101 | typedef u8 u8x; 102 | typedef u16 u16x; 103 | typedef u32 u32x; 104 | typedef u64 u64x; 105 | 106 | #define make_u8x (u8) 107 | #define make_u16x (u16) 108 | #define make_u32x (u32) 109 | #define make_u64x (u64) 110 | 111 | #else 112 | 113 | #if defined IS_CUDA || defined IS_HIP 114 | 115 | #if VECT_SIZE == 2 116 | 117 | struct __device_builtin__ __builtin_align__(2) u8x 118 | { 119 | u8 s0; 120 | u8 s1; 121 | 122 | inline __device__ u8x (const u8 a, const u8 b) : s0(a), s1(b) { } 123 | inline __device__ u8x (const u8 a) : s0(a), s1(a) { } 124 | 125 | inline __device__ u8x (void) : s0(0), s1(0) { } 126 | inline __device__ ~u8x (void) { } 127 | }; 128 | 129 | struct __device_builtin__ __builtin_align__(4) u16x 130 | { 131 | u16 s0; 132 | u16 s1; 133 | 134 | inline __device__ u16x (const u16 a, const u16 b) : s0(a), s1(b) { } 135 | inline __device__ u16x (const u16 a) : s0(a), s1(a) { } 136 | 137 | inline __device__ u16x (void) : s0(0), s1(0) { } 138 | inline __device__ ~u16x (void) { } 139 | }; 140 | 141 | struct __device_builtin__ __builtin_align__(8) u32x 142 | { 143 | u32 s0; 144 | u32 s1; 145 | 146 | inline __device__ u32x (const u32 a, const u32 b) : s0(a), s1(b) { } 147 | inline __device__ u32x (const u32 a) : s0(a), s1(a) { } 148 | 149 | inline __device__ u32x (void) : s0(0), s1(0) { } 150 | inline __device__ ~u32x (void) { } 151 | }; 152 | 153 | struct __device_builtin__ __builtin_align__(16) u64x 154 | { 155 | u64 s0; 156 | u64 s1; 157 | 158 | inline __device__ u64x (const u64 a, const u64 b) : s0(a), s1(b) { } 159 | inline __device__ u64x (const u64 a) : s0(a), s1(a) { } 160 | 161 | inline __device__ u64x (void) : s0(0), s1(0) { } 162 | inline __device__ ~u64x (void) { } 163 | }; 164 | 165 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b)); } 166 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); } 167 | 168 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; } 169 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; } 170 | 171 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; } 172 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; } 173 | 174 | inline __device__ void 
operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; } 175 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; } 176 | 177 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; } 178 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; } 179 | 180 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; } 181 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; } 182 | 183 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; } 184 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; } 185 | 186 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; } 187 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; } 188 | 189 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; } 190 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; } 191 | 192 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) ); } 193 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1)); } 194 | 195 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) ); } 196 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1)); } 197 | 198 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) ); } 199 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); } 200 | 201 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) ); } 202 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1)); } 203 | 204 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) ); } 205 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1)); } 206 | 207 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) ); } 208 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1)); } 209 | 210 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) ); } 211 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1)); } 212 | 213 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) ); } 214 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1)); } 215 | 216 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) ); } 217 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1)); } 218 | 219 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1); } 220 | 221 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b)); } 222 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != 
b.s0) && (a.s1 != b.s1)); } 223 | 224 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; } 225 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; } 226 | 227 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; } 228 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; } 229 | 230 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; } 231 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; } 232 | 233 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; } 234 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; } 235 | 236 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; } 237 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; } 238 | 239 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; } 240 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; } 241 | 242 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; } 243 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; } 244 | 245 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; } 246 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; } 247 | 248 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) ); } 249 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1)); } 250 | 251 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) ); } 252 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1)); } 253 | 254 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) ); } 255 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); } 256 | 257 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) ); } 258 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1)); } 259 | 260 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) ); } 261 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1)); } 262 | 263 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) ); } 264 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1)); } 265 | 266 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) ); } 267 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1)); } 268 | 269 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) ); } 270 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1)); } 271 | 272 | inline __device__ u64x operator % (const u64x a, const u64 b) { return 
u64x ((a.s0 % b), (a.s1 % b) ); } 273 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1)); } 274 | 275 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1); } 276 | 277 | #endif 278 | 279 | #if VECT_SIZE == 4 280 | 281 | struct __device_builtin__ __builtin_align__(4) u8x 282 | { 283 | u8 s0; 284 | u8 s1; 285 | u8 s2; 286 | u8 s3; 287 | 288 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d) : s0(a), s1(b), s2(c), s3(d) { } 289 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a) { } 290 | 291 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0) { } 292 | inline __device__ ~u8x (void) { } 293 | }; 294 | 295 | struct __device_builtin__ __builtin_align__(8) u16x 296 | { 297 | u16 s0; 298 | u16 s1; 299 | u16 s2; 300 | u16 s3; 301 | 302 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d) : s0(a), s1(b), s2(c), s3(d) { } 303 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a) { } 304 | 305 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0) { } 306 | inline __device__ ~u16x (void) { } 307 | }; 308 | 309 | struct __device_builtin__ __builtin_align__(16) u32x 310 | { 311 | u32 s0; 312 | u32 s1; 313 | u32 s2; 314 | u32 s3; 315 | 316 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d) : s0(a), s1(b), s2(c), s3(d) { } 317 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a) { } 318 | 319 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0) { } 320 | inline __device__ ~u32x (void) { } 321 | }; 322 | 323 | struct __device_builtin__ __builtin_align__(32) u64x 324 | { 325 | u64 s0; 326 | u64 s1; 327 | u64 s2; 328 | u64 s3; 329 | 330 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d) : s0(a), s1(b), s2(c), s3(d) { } 331 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a) { } 332 | 333 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0) { } 334 | inline __device__ ~u64x (void) { } 335 | }; 336 | 337 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); } 338 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); } 339 | 340 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; } 341 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; } 342 | 343 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; } 344 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; } 345 | 346 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; } 347 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; } 348 | 349 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; } 350 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; } 351 | 352 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; } 353 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; 
a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; } 354 | 355 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; } 356 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; } 357 | 358 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; } 359 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; } 360 | 361 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; } 362 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; } 363 | 364 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); } 365 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); } 366 | 367 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); } 368 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); } 369 | 370 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); } 371 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); } 372 | 373 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); } 374 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); } 375 | 376 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); } 377 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); } 378 | 379 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); } 380 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); } 381 | 382 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); } 383 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); } 384 | 385 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); } 386 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); } 387 | 388 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); } 389 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); } 390 | 391 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3); } 392 | 393 | inline __device__ 
bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); } 394 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); } 395 | 396 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; } 397 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; } 398 | 399 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; } 400 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; } 401 | 402 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; } 403 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; } 404 | 405 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; } 406 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; } 407 | 408 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; } 409 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; } 410 | 411 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; } 412 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; } 413 | 414 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; } 415 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; } 416 | 417 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; } 418 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; } 419 | 420 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); } 421 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); } 422 | 423 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); } 424 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); } 425 | 426 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); } 427 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); } 428 | 429 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); } 430 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); } 431 | 432 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); } 
433 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); } 434 | 435 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); } 436 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); } 437 | 438 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); } 439 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); } 440 | 441 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); } 442 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); } 443 | 444 | inline __device__ u64x operator % (const u64x a, const u32 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); } 445 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); } 446 | 447 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3); } 448 | 449 | #endif 450 | 451 | #if VECT_SIZE == 8 452 | 453 | struct __device_builtin__ __builtin_align__(8) u8x 454 | { 455 | u8 s0; 456 | u8 s1; 457 | u8 s2; 458 | u8 s3; 459 | u8 s4; 460 | u8 s5; 461 | u8 s6; 462 | u8 s7; 463 | 464 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 465 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 466 | 467 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 468 | inline __device__ ~u8x (void) { } 469 | }; 470 | 471 | struct __device_builtin__ __builtin_align__(16) u16x 472 | { 473 | u16 s0; 474 | u16 s1; 475 | u16 s2; 476 | u16 s3; 477 | u16 s4; 478 | u16 s5; 479 | u16 s6; 480 | u16 s7; 481 | 482 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 483 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 484 | 485 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 486 | inline __device__ ~u16x (void) { } 487 | }; 488 | 489 | struct __device_builtin__ __builtin_align__(32) u32x 490 | { 491 | u32 s0; 492 | u32 s1; 493 | u32 s2; 494 | u32 s3; 495 | u32 s4; 496 | u32 s5; 497 | u32 s6; 498 | u32 s7; 499 | 500 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 501 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 502 | 503 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 504 | inline __device__ ~u32x (void) { } 505 | }; 506 | 507 | struct __device_builtin__ __builtin_align__(64) u64x 508 | { 509 | u64 s0; 510 | u64 s1; 511 | u64 s2; 512 | u64 s3; 513 | u64 s4; 514 | u64 s5; 515 | u64 s6; 516 | u64 s7; 517 
| 518 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { } 519 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { } 520 | 521 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { } 522 | inline __device__ ~u64x (void) { } 523 | }; 524 | 525 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); } 526 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); } 527 | 528 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; } 529 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; } 530 | 531 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; } 532 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; } 533 | 534 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; } 535 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; } 536 | 537 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; } 538 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; } 539 | 540 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; } 541 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; } 542 | 543 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; } 544 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; } 545 | 546 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; } 547 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; } 548 | 549 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; } 550 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= 
b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; } 551 | 552 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); } 553 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); } 554 | 555 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); } 556 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); } 557 | 558 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); } 559 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); } 560 | 561 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); } 562 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); } 563 | 564 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); } 565 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); } 566 | 567 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); } 568 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); } 569 | 570 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); } 571 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); } 572 | 573 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); } 574 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); } 575 | 576 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); } 577 | inline 
__device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); } 578 | 579 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); } 580 | 581 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); } 582 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); } 583 | 584 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; } 585 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; } 586 | 587 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; } 588 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; } 589 | 590 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; } 591 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; } 592 | 593 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; } 594 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; } 595 | 596 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; } 597 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; } 598 | 599 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; } 600 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; } 601 | 602 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; } 603 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; } 604 | 605 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; } 606 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; } 607 | 608 | 
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); } 609 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); } 610 | 611 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); } 612 | inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); } 613 | 614 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); } 615 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); } 616 | 617 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); } 618 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); } 619 | 620 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); } 621 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); } 622 | 623 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); } 624 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); } 625 | 626 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); } 627 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); } 628 | 629 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); } 630 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); } 631 | 632 | inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); } 633 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), 
(a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); } 634 | 635 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); } 636 | 637 | #endif 638 | 639 | #if VECT_SIZE == 16 640 | 641 | struct __device_builtin__ __builtin_align__(16) u8x 642 | { 643 | u8 s0; 644 | u8 s1; 645 | u8 s2; 646 | u8 s3; 647 | u8 s4; 648 | u8 s5; 649 | u8 s6; 650 | u8 s7; 651 | u8 s8; 652 | u8 s9; 653 | u8 sa; 654 | u8 sb; 655 | u8 sc; 656 | u8 sd; 657 | u8 se; 658 | u8 sf; 659 | 660 | inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h, const u8 i, const u8 j, const u8 k, const u8 l, const u8 m, const u8 n, const u8 o, const u8 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 661 | inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 662 | 663 | inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { } 664 | inline __device__ ~u8x (void) { } 665 | }; 666 | 667 | struct __device_builtin__ __builtin_align__(32) u16x 668 | { 669 | u16 s0; 670 | u16 s1; 671 | u16 s2; 672 | u16 s3; 673 | u16 s4; 674 | u16 s5; 675 | u16 s6; 676 | u16 s7; 677 | u16 s8; 678 | u16 s9; 679 | u16 sa; 680 | u16 sb; 681 | u16 sc; 682 | u16 sd; 683 | u16 se; 684 | u16 sf; 685 | 686 | inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h, const u16 i, const u16 j, const u16 k, const u16 l, const u16 m, const u16 n, const u16 o, const u16 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 687 | inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 688 | 689 | inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ } 690 | inline __device__ ~u16x (void) { } 691 | }; 692 | 693 | struct __device_builtin__ __builtin_align__(64) u32x 694 | { 695 | u32 s0; 696 | u32 s1; 697 | u32 s2; 698 | u32 s3; 699 | u32 s4; 700 | u32 s5; 701 | u32 s6; 702 | u32 s7; 703 | u32 s8; 704 | u32 s9; 705 | u32 sa; 706 | u32 sb; 707 | u32 sc; 708 | u32 sd; 709 | u32 se; 710 | u32 sf; 711 | 712 | inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h, const u32 i, const u32 j, const u32 k, const u32 l, const u32 m, const u32 n, const u32 o, const u32 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 713 | inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 714 | 715 | inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ } 716 | inline __device__ ~u32x (void) { } 717 | }; 718 | 719 | struct __device_builtin__ __builtin_align__(128) u64x 720 | { 721 | u64 s0; 722 | u64 s1; 723 | u64 s2; 724 | u64 s3; 725 | u64 s4; 726 | u64 s5; 727 | u64 s6; 728 | u64 s7; 729 | u64 s8; 730 | u64 s9; 731 | u64 sa; 732 | u64 sb; 733 | u64 sc; 734 | 
u64 sd; 735 | u64 se; 736 | u64 sf; 737 | 738 | inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h, const u64 i, const u64 j, const u64 k, const u64 l, const u64 m, const u64 n, const u64 o, const u64 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { } 739 | inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { } 740 | 741 | inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { } 742 | inline __device__ ~u64x (void) { } 743 | }; 744 | 745 | inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); } 746 | inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); } 747 | 748 | inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; } 749 | inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; } 750 | 751 | inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; } 752 | inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; } 753 | 754 | inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; } 755 | inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; } 756 | 757 | inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; } 758 | inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa 
+= b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; } 759 | 760 | inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; } 761 | inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; } 762 | 763 | inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; } 764 | inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; } 765 | 766 | inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; } 767 | inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; } 768 | 769 | inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; } 770 | inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; } 771 | 772 | inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); } 773 | inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); } 774 | 775 | inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); } 776 | inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> 
b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); } 777 | 778 | inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); } 779 | inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); } 780 | 781 | inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); } 782 | inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); } 783 | 784 | inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); } 785 | inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); } 786 | 787 | inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); } 788 | inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); } 789 | 790 | inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); } 791 | inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); } 792 | 793 | inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); } 794 | inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), 
(a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); } 795 | 796 | inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); } 797 | inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); } 798 | 799 | inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); } 800 | 801 | inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); } 802 | inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); } 803 | 804 | inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; } 805 | inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; } 806 | 807 | inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; } 808 | inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; } 809 | 810 | inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; } 811 | inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; } 812 | 813 | inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; 
a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; } 814 | inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; } 815 | 816 | inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; } 817 | inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; } 818 | 819 | inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; } 820 | inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; } 821 | 822 | inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; } 823 | inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; } 824 | 825 | inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; } 826 | inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; } 827 | 828 | inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); } 829 | inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); } 830 | 831 | inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); } 832 
| inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); } 833 | 834 | inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); } 835 | inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); } 836 | 837 | inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); } 838 | inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); } 839 | 840 | inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); } 841 | inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); } 842 | 843 | inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); } 844 | inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); } 845 | 846 | inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); } 847 | inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); } 848 | 849 | inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * 
b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); } 850 | inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); } 851 | 852 | inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); } 853 | inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); } 854 | 855 | inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); } 856 | 857 | #endif 858 | 859 | typedef __device_builtin__ struct u8x u8x; 860 | typedef __device_builtin__ struct u16x u16x; 861 | typedef __device_builtin__ struct u32x u32x; 862 | typedef __device_builtin__ struct u64x u64x; 863 | 864 | #define make_u8x u8x 865 | #define make_u16x u16x 866 | #define make_u32x u32x 867 | #define make_u64x u64x 868 | 869 | #else 870 | typedef VTYPE(uchar, VECT_SIZE) u8x; 871 | typedef VTYPE(ushort, VECT_SIZE) u16x; 872 | typedef VTYPE(uint, VECT_SIZE) u32x; 873 | typedef VTYPE(ullong, VECT_SIZE) u64x; 874 | 875 | #ifndef IS_METAL 876 | #define make_u8x (u8x) 877 | #define make_u16x (u16x) 878 | #define make_u32x (u32x) 879 | #define make_u64x (u64x) 880 | #else 881 | #define make_u8x u8x 882 | #define make_u16x u16x 883 | #define make_u32x u32x 884 | #define make_u64x u64x 885 | #endif 886 | 887 | #endif 888 | #endif 889 | 890 | // unions 891 | 892 | typedef union vconv32 893 | { 894 | u64 v32; 895 | 896 | struct 897 | { 898 | u16 a; 899 | u16 b; 900 | 901 | } v16; 902 | 903 | struct 904 | { 905 | u8 a; 906 | u8 b; 907 | u8 c; 908 | u8 d; 909 | 910 | } v8; 911 | 912 | } vconv32_t; 913 | 914 | typedef union vconv64 915 | { 916 | u64 v64; 917 | 918 | struct 919 | { 920 | u32 a; 921 | u32 b; 922 | 923 | } v32; 924 | 925 | struct 926 | { 927 | u16 a; 928 | u16 b; 929 | u16 c; 930 | u16 d; 931 | 932 | } v16; 933 | 934 | struct 935 | { 936 | u8 a; 937 | u8 b; 938 | u8 c; 939 | u8 d; 940 | u8 e; 941 | u8 f; 942 | u8 g; 943 | u8 h; 944 | 945 | } v8; 946 | 947 | } vconv64_t; 948 | 949 | /** 950 | * Author......: See docs/credits.txt 951 | * License.....: MIT 952 | */ 953 | 954 | typedef enum siphash_constants 955 | { 956 | SIPHASHM_0=0x736f6d6570736575UL, 957 | SIPHASHM_1=0x646f72616e646f6dUL, 958 | SIPHASHM_2=0x6c7967656e657261UL, 959 | SIPHASHM_3=0x7465646279746573UL 960 | 961 | } siphash_constants_t; 962 | 963 | typedef enum bcrypt_constants 964 | { 965 | BCRYPTM_0=0x4f727068U, 966 | BCRYPTM_1=0x65616e42U, 967 | BCRYPTM_2=0x65686f6cU, 968 | BCRYPTM_3=0x64657253U, 969 | BCRYPTM_4=0x63727944U, 970 | BCRYPTM_5=0x6f756274U 971 | 972 | } bcrypt_constants_t; 973 | 974 | typedef enum md4_constants 975 | { 976 | MD4M_A=0x67452301U, 977 | MD4M_B=0xefcdab89U, 978 | MD4M_C=0x98badcfeU, 979 | 
MD4M_D=0x10325476U, 980 | 981 | MD4S00=3, 982 | MD4S01=7, 983 | MD4S02=11, 984 | MD4S03=19, 985 | MD4S10=3, 986 | MD4S11=5, 987 | MD4S12=9, 988 | MD4S13=13, 989 | MD4S20=3, 990 | MD4S21=9, 991 | MD4S22=11, 992 | MD4S23=15, 993 | 994 | MD4C00=0x00000000U, 995 | MD4C01=0x5a827999U, 996 | MD4C02=0x6ed9eba1U 997 | 998 | } md4_constants_t; 999 | 1000 | typedef enum md5_constants 1001 | { 1002 | MD5M_A=0x67452301U, 1003 | MD5M_B=0xefcdab89U, 1004 | MD5M_C=0x98badcfeU, 1005 | MD5M_D=0x10325476U, 1006 | 1007 | MD5S00=7, 1008 | MD5S01=12, 1009 | MD5S02=17, 1010 | MD5S03=22, 1011 | MD5S10=5, 1012 | MD5S11=9, 1013 | MD5S12=14, 1014 | MD5S13=20, 1015 | MD5S20=4, 1016 | MD5S21=11, 1017 | MD5S22=16, 1018 | MD5S23=23, 1019 | MD5S30=6, 1020 | MD5S31=10, 1021 | MD5S32=15, 1022 | MD5S33=21, 1023 | 1024 | MD5C00=0xd76aa478U, 1025 | MD5C01=0xe8c7b756U, 1026 | MD5C02=0x242070dbU, 1027 | MD5C03=0xc1bdceeeU, 1028 | MD5C04=0xf57c0fafU, 1029 | MD5C05=0x4787c62aU, 1030 | MD5C06=0xa8304613U, 1031 | MD5C07=0xfd469501U, 1032 | MD5C08=0x698098d8U, 1033 | MD5C09=0x8b44f7afU, 1034 | MD5C0a=0xffff5bb1U, 1035 | MD5C0b=0x895cd7beU, 1036 | MD5C0c=0x6b901122U, 1037 | MD5C0d=0xfd987193U, 1038 | MD5C0e=0xa679438eU, 1039 | MD5C0f=0x49b40821U, 1040 | MD5C10=0xf61e2562U, 1041 | MD5C11=0xc040b340U, 1042 | MD5C12=0x265e5a51U, 1043 | MD5C13=0xe9b6c7aaU, 1044 | MD5C14=0xd62f105dU, 1045 | MD5C15=0x02441453U, 1046 | MD5C16=0xd8a1e681U, 1047 | MD5C17=0xe7d3fbc8U, 1048 | MD5C18=0x21e1cde6U, 1049 | MD5C19=0xc33707d6U, 1050 | MD5C1a=0xf4d50d87U, 1051 | MD5C1b=0x455a14edU, 1052 | MD5C1c=0xa9e3e905U, 1053 | MD5C1d=0xfcefa3f8U, 1054 | MD5C1e=0x676f02d9U, 1055 | MD5C1f=0x8d2a4c8aU, 1056 | MD5C20=0xfffa3942U, 1057 | MD5C21=0x8771f681U, 1058 | MD5C22=0x6d9d6122U, 1059 | MD5C23=0xfde5380cU, 1060 | MD5C24=0xa4beea44U, 1061 | MD5C25=0x4bdecfa9U, 1062 | MD5C26=0xf6bb4b60U, 1063 | MD5C27=0xbebfbc70U, 1064 | MD5C28=0x289b7ec6U, 1065 | MD5C29=0xeaa127faU, 1066 | MD5C2a=0xd4ef3085U, 1067 | MD5C2b=0x04881d05U, 1068 | MD5C2c=0xd9d4d039U, 1069 | MD5C2d=0xe6db99e5U, 1070 | MD5C2e=0x1fa27cf8U, 1071 | MD5C2f=0xc4ac5665U, 1072 | MD5C30=0xf4292244U, 1073 | MD5C31=0x432aff97U, 1074 | MD5C32=0xab9423a7U, 1075 | MD5C33=0xfc93a039U, 1076 | MD5C34=0x655b59c3U, 1077 | MD5C35=0x8f0ccc92U, 1078 | MD5C36=0xffeff47dU, 1079 | MD5C37=0x85845dd1U, 1080 | MD5C38=0x6fa87e4fU, 1081 | MD5C39=0xfe2ce6e0U, 1082 | MD5C3a=0xa3014314U, 1083 | MD5C3b=0x4e0811a1U, 1084 | MD5C3c=0xf7537e82U, 1085 | MD5C3d=0xbd3af235U, 1086 | MD5C3e=0x2ad7d2bbU, 1087 | MD5C3f=0xeb86d391U 1088 | 1089 | } md5_constants_t; 1090 | 1091 | typedef enum sha1_constants 1092 | { 1093 | SHA1M_A=0x67452301U, 1094 | SHA1M_B=0xefcdab89U, 1095 | SHA1M_C=0x98badcfeU, 1096 | SHA1M_D=0x10325476U, 1097 | SHA1M_E=0xc3d2e1f0U, 1098 | 1099 | SHA1C00=0x5a827999U, 1100 | SHA1C01=0x6ed9eba1U, 1101 | SHA1C02=0x8f1bbcdcU, 1102 | SHA1C03=0xca62c1d6U 1103 | 1104 | } sha1_constants_t; 1105 | 1106 | typedef enum sha2_32_constants 1107 | { 1108 | // SHA-224 Initial Hash Values 1109 | SHA224M_A=0xc1059ed8U, 1110 | SHA224M_B=0x367cd507U, 1111 | SHA224M_C=0x3070dd17U, 1112 | SHA224M_D=0xf70e5939U, 1113 | SHA224M_E=0xffc00b31U, 1114 | SHA224M_F=0x68581511U, 1115 | SHA224M_G=0x64f98fa7U, 1116 | SHA224M_H=0xbefa4fa4U, 1117 | 1118 | // SHA-224 Constants 1119 | SHA224C00=0x428a2f98U, 1120 | SHA224C01=0x71374491U, 1121 | SHA224C02=0xb5c0fbcfU, 1122 | SHA224C03=0xe9b5dba5U, 1123 | SHA224C04=0x3956c25bU, 1124 | SHA224C05=0x59f111f1U, 1125 | SHA224C06=0x923f82a4U, 1126 | SHA224C07=0xab1c5ed5U, 1127 | SHA224C08=0xd807aa98U, 1128 | 
SHA224C09=0x12835b01U, 1129 | SHA224C0a=0x243185beU, 1130 | SHA224C0b=0x550c7dc3U, 1131 | SHA224C0c=0x72be5d74U, 1132 | SHA224C0d=0x80deb1feU, 1133 | SHA224C0e=0x9bdc06a7U, 1134 | SHA224C0f=0xc19bf174U, 1135 | SHA224C10=0xe49b69c1U, 1136 | SHA224C11=0xefbe4786U, 1137 | SHA224C12=0x0fc19dc6U, 1138 | SHA224C13=0x240ca1ccU, 1139 | SHA224C14=0x2de92c6fU, 1140 | SHA224C15=0x4a7484aaU, 1141 | SHA224C16=0x5cb0a9dcU, 1142 | SHA224C17=0x76f988daU, 1143 | SHA224C18=0x983e5152U, 1144 | SHA224C19=0xa831c66dU, 1145 | SHA224C1a=0xb00327c8U, 1146 | SHA224C1b=0xbf597fc7U, 1147 | SHA224C1c=0xc6e00bf3U, 1148 | SHA224C1d=0xd5a79147U, 1149 | SHA224C1e=0x06ca6351U, 1150 | SHA224C1f=0x14292967U, 1151 | SHA224C20=0x27b70a85U, 1152 | SHA224C21=0x2e1b2138U, 1153 | SHA224C22=0x4d2c6dfcU, 1154 | SHA224C23=0x53380d13U, 1155 | SHA224C24=0x650a7354U, 1156 | SHA224C25=0x766a0abbU, 1157 | SHA224C26=0x81c2c92eU, 1158 | SHA224C27=0x92722c85U, 1159 | SHA224C28=0xa2bfe8a1U, 1160 | SHA224C29=0xa81a664bU, 1161 | SHA224C2a=0xc24b8b70U, 1162 | SHA224C2b=0xc76c51a3U, 1163 | SHA224C2c=0xd192e819U, 1164 | SHA224C2d=0xd6990624U, 1165 | SHA224C2e=0xf40e3585U, 1166 | SHA224C2f=0x106aa070U, 1167 | SHA224C30=0x19a4c116U, 1168 | SHA224C31=0x1e376c08U, 1169 | SHA224C32=0x2748774cU, 1170 | SHA224C33=0x34b0bcb5U, 1171 | SHA224C34=0x391c0cb3U, 1172 | SHA224C35=0x4ed8aa4aU, 1173 | SHA224C36=0x5b9cca4fU, 1174 | SHA224C37=0x682e6ff3U, 1175 | SHA224C38=0x748f82eeU, 1176 | SHA224C39=0x78a5636fU, 1177 | SHA224C3a=0x84c87814U, 1178 | SHA224C3b=0x8cc70208U, 1179 | SHA224C3c=0x90befffaU, 1180 | SHA224C3d=0xa4506cebU, 1181 | SHA224C3e=0xbef9a3f7U, 1182 | SHA224C3f=0xc67178f2U, 1183 | 1184 | // SHA-256 Initial Hash Values 1185 | SHA256M_A=0x6a09e667U, 1186 | SHA256M_B=0xbb67ae85U, 1187 | SHA256M_C=0x3c6ef372U, 1188 | SHA256M_D=0xa54ff53aU, 1189 | SHA256M_E=0x510e527fU, 1190 | SHA256M_F=0x9b05688cU, 1191 | SHA256M_G=0x1f83d9abU, 1192 | SHA256M_H=0x5be0cd19U, 1193 | 1194 | // SHA-256 Constants 1195 | SHA256C00=0x428a2f98U, 1196 | SHA256C01=0x71374491U, 1197 | SHA256C02=0xb5c0fbcfU, 1198 | SHA256C03=0xe9b5dba5U, 1199 | SHA256C04=0x3956c25bU, 1200 | SHA256C05=0x59f111f1U, 1201 | SHA256C06=0x923f82a4U, 1202 | SHA256C07=0xab1c5ed5U, 1203 | SHA256C08=0xd807aa98U, 1204 | SHA256C09=0x12835b01U, 1205 | SHA256C0a=0x243185beU, 1206 | SHA256C0b=0x550c7dc3U, 1207 | SHA256C0c=0x72be5d74U, 1208 | SHA256C0d=0x80deb1feU, 1209 | SHA256C0e=0x9bdc06a7U, 1210 | SHA256C0f=0xc19bf174U, 1211 | SHA256C10=0xe49b69c1U, 1212 | SHA256C11=0xefbe4786U, 1213 | SHA256C12=0x0fc19dc6U, 1214 | SHA256C13=0x240ca1ccU, 1215 | SHA256C14=0x2de92c6fU, 1216 | SHA256C15=0x4a7484aaU, 1217 | SHA256C16=0x5cb0a9dcU, 1218 | SHA256C17=0x76f988daU, 1219 | SHA256C18=0x983e5152U, 1220 | SHA256C19=0xa831c66dU, 1221 | SHA256C1a=0xb00327c8U, 1222 | SHA256C1b=0xbf597fc7U, 1223 | SHA256C1c=0xc6e00bf3U, 1224 | SHA256C1d=0xd5a79147U, 1225 | SHA256C1e=0x06ca6351U, 1226 | SHA256C1f=0x14292967U, 1227 | SHA256C20=0x27b70a85U, 1228 | SHA256C21=0x2e1b2138U, 1229 | SHA256C22=0x4d2c6dfcU, 1230 | SHA256C23=0x53380d13U, 1231 | SHA256C24=0x650a7354U, 1232 | SHA256C25=0x766a0abbU, 1233 | SHA256C26=0x81c2c92eU, 1234 | SHA256C27=0x92722c85U, 1235 | SHA256C28=0xa2bfe8a1U, 1236 | SHA256C29=0xa81a664bU, 1237 | SHA256C2a=0xc24b8b70U, 1238 | SHA256C2b=0xc76c51a3U, 1239 | SHA256C2c=0xd192e819U, 1240 | SHA256C2d=0xd6990624U, 1241 | SHA256C2e=0xf40e3585U, 1242 | SHA256C2f=0x106aa070U, 1243 | SHA256C30=0x19a4c116U, 1244 | SHA256C31=0x1e376c08U, 1245 | SHA256C32=0x2748774cU, 1246 | SHA256C33=0x34b0bcb5U, 1247 | SHA256C34=0x391c0cb3U, 
1248 | SHA256C35=0x4ed8aa4aU, 1249 | SHA256C36=0x5b9cca4fU, 1250 | SHA256C37=0x682e6ff3U, 1251 | SHA256C38=0x748f82eeU, 1252 | SHA256C39=0x78a5636fU, 1253 | SHA256C3a=0x84c87814U, 1254 | SHA256C3b=0x8cc70208U, 1255 | SHA256C3c=0x90befffaU, 1256 | SHA256C3d=0xa4506cebU, 1257 | SHA256C3e=0xbef9a3f7U, 1258 | SHA256C3f=0xc67178f2U, 1259 | 1260 | } sha2_32_constants_t; 1261 | 1262 | typedef enum sha2_64_constants 1263 | { 1264 | // SHA-384 Initial Hash Values 1265 | SHA384M_A=0xcbbb9d5dc1059ed8UL, 1266 | SHA384M_B=0x629a292a367cd507UL, 1267 | SHA384M_C=0x9159015a3070dd17UL, 1268 | SHA384M_D=0x152fecd8f70e5939UL, 1269 | SHA384M_E=0x67332667ffc00b31UL, 1270 | SHA384M_F=0x8eb44a8768581511UL, 1271 | SHA384M_G=0xdb0c2e0d64f98fa7UL, 1272 | SHA384M_H=0x47b5481dbefa4fa4UL, 1273 | 1274 | // SHA-512 Initial Hash Values 1275 | SHA512M_A=0x6a09e667f3bcc908UL, 1276 | SHA512M_B=0xbb67ae8584caa73bUL, 1277 | SHA512M_C=0x3c6ef372fe94f82bUL, 1278 | SHA512M_D=0xa54ff53a5f1d36f1UL, 1279 | SHA512M_E=0x510e527fade682d1UL, 1280 | SHA512M_F=0x9b05688c2b3e6c1fUL, 1281 | SHA512M_G=0x1f83d9abfb41bd6bUL, 1282 | SHA512M_H=0x5be0cd19137e2179UL, 1283 | 1284 | // SHA-384/512 Constants 1285 | SHA512C00=0x428a2f98d728ae22UL, 1286 | SHA512C01=0x7137449123ef65cdUL, 1287 | SHA512C02=0xb5c0fbcfec4d3b2fUL, 1288 | SHA512C03=0xe9b5dba58189dbbcUL, 1289 | SHA512C04=0x3956c25bf348b538UL, 1290 | SHA512C05=0x59f111f1b605d019UL, 1291 | SHA512C06=0x923f82a4af194f9bUL, 1292 | SHA512C07=0xab1c5ed5da6d8118UL, 1293 | SHA512C08=0xd807aa98a3030242UL, 1294 | SHA512C09=0x12835b0145706fbeUL, 1295 | SHA512C0a=0x243185be4ee4b28cUL, 1296 | SHA512C0b=0x550c7dc3d5ffb4e2UL, 1297 | SHA512C0c=0x72be5d74f27b896fUL, 1298 | SHA512C0d=0x80deb1fe3b1696b1UL, 1299 | SHA512C0e=0x9bdc06a725c71235UL, 1300 | SHA512C0f=0xc19bf174cf692694UL, 1301 | SHA512C10=0xe49b69c19ef14ad2UL, 1302 | SHA512C11=0xefbe4786384f25e3UL, 1303 | SHA512C12=0x0fc19dc68b8cd5b5UL, 1304 | SHA512C13=0x240ca1cc77ac9c65UL, 1305 | SHA512C14=0x2de92c6f592b0275UL, 1306 | SHA512C15=0x4a7484aa6ea6e483UL, 1307 | SHA512C16=0x5cb0a9dcbd41fbd4UL, 1308 | SHA512C17=0x76f988da831153b5UL, 1309 | SHA512C18=0x983e5152ee66dfabUL, 1310 | SHA512C19=0xa831c66d2db43210UL, 1311 | SHA512C1a=0xb00327c898fb213fUL, 1312 | SHA512C1b=0xbf597fc7beef0ee4UL, 1313 | SHA512C1c=0xc6e00bf33da88fc2UL, 1314 | SHA512C1d=0xd5a79147930aa725UL, 1315 | SHA512C1e=0x06ca6351e003826fUL, 1316 | SHA512C1f=0x142929670a0e6e70UL, 1317 | SHA512C20=0x27b70a8546d22ffcUL, 1318 | SHA512C21=0x2e1b21385c26c926UL, 1319 | SHA512C22=0x4d2c6dfc5ac42aedUL, 1320 | SHA512C23=0x53380d139d95b3dfUL, 1321 | SHA512C24=0x650a73548baf63deUL, 1322 | SHA512C25=0x766a0abb3c77b2a8UL, 1323 | SHA512C26=0x81c2c92e47edaee6UL, 1324 | SHA512C27=0x92722c851482353bUL, 1325 | SHA512C28=0xa2bfe8a14cf10364UL, 1326 | SHA512C29=0xa81a664bbc423001UL, 1327 | SHA512C2a=0xc24b8b70d0f89791UL, 1328 | SHA512C2b=0xc76c51a30654be30UL, 1329 | SHA512C2c=0xd192e819d6ef5218UL, 1330 | SHA512C2d=0xd69906245565a910UL, 1331 | SHA512C2e=0xf40e35855771202aUL, 1332 | SHA512C2f=0x106aa07032bbd1b8UL, 1333 | SHA512C30=0x19a4c116b8d2d0c8UL, 1334 | SHA512C31=0x1e376c085141ab53UL, 1335 | SHA512C32=0x2748774cdf8eeb99UL, 1336 | SHA512C33=0x34b0bcb5e19b48a8UL, 1337 | SHA512C34=0x391c0cb3c5c95a63UL, 1338 | SHA512C35=0x4ed8aa4ae3418acbUL, 1339 | SHA512C36=0x5b9cca4f7763e373UL, 1340 | SHA512C37=0x682e6ff3d6b2b8a3UL, 1341 | SHA512C38=0x748f82ee5defb2fcUL, 1342 | SHA512C39=0x78a5636f43172f60UL, 1343 | SHA512C3a=0x84c87814a1f0ab72UL, 1344 | SHA512C3b=0x8cc702081a6439ecUL, 1345 | SHA512C3c=0x90befffa23631e28UL, 1346 | 
SHA512C3d=0xa4506cebde82bde9UL, 1347 | SHA512C3e=0xbef9a3f7b2c67915UL, 1348 | SHA512C3f=0xc67178f2e372532bUL, 1349 | SHA512C40=0xca273eceea26619cUL, 1350 | SHA512C41=0xd186b8c721c0c207UL, 1351 | SHA512C42=0xeada7dd6cde0eb1eUL, 1352 | SHA512C43=0xf57d4f7fee6ed178UL, 1353 | SHA512C44=0x06f067aa72176fbaUL, 1354 | SHA512C45=0x0a637dc5a2c898a6UL, 1355 | SHA512C46=0x113f9804bef90daeUL, 1356 | SHA512C47=0x1b710b35131c471bUL, 1357 | SHA512C48=0x28db77f523047d84UL, 1358 | SHA512C49=0x32caab7b40c72493UL, 1359 | SHA512C4a=0x3c9ebe0a15c9bebcUL, 1360 | SHA512C4b=0x431d67c49c100d4cUL, 1361 | SHA512C4c=0x4cc5d4becb3e42b6UL, 1362 | SHA512C4d=0x597f299cfc657e2aUL, 1363 | SHA512C4e=0x5fcb6fab3ad6faecUL, 1364 | SHA512C4f=0x6c44198c4a475817UL 1365 | 1366 | } sha2_64_constants_t; 1367 | 1368 | typedef enum ripemd160_constants 1369 | { 1370 | RIPEMD160M_A=0x67452301U, 1371 | RIPEMD160M_B=0xefcdab89U, 1372 | RIPEMD160M_C=0x98badcfeU, 1373 | RIPEMD160M_D=0x10325476U, 1374 | RIPEMD160M_E=0xc3d2e1f0U, 1375 | 1376 | RIPEMD160C00=0x00000000U, 1377 | RIPEMD160C10=0x5a827999U, 1378 | RIPEMD160C20=0x6ed9eba1U, 1379 | RIPEMD160C30=0x8f1bbcdcU, 1380 | RIPEMD160C40=0xa953fd4eU, 1381 | RIPEMD160C50=0x50a28be6U, 1382 | RIPEMD160C60=0x5c4dd124U, 1383 | RIPEMD160C70=0x6d703ef3U, 1384 | RIPEMD160C80=0x7a6d76e9U, 1385 | RIPEMD160C90=0x00000000U, 1386 | 1387 | RIPEMD160S00=11, 1388 | RIPEMD160S01=14, 1389 | RIPEMD160S02=15, 1390 | RIPEMD160S03=12, 1391 | RIPEMD160S04=5, 1392 | RIPEMD160S05=8, 1393 | RIPEMD160S06=7, 1394 | RIPEMD160S07=9, 1395 | RIPEMD160S08=11, 1396 | RIPEMD160S09=13, 1397 | RIPEMD160S0A=14, 1398 | RIPEMD160S0B=15, 1399 | RIPEMD160S0C=6, 1400 | RIPEMD160S0D=7, 1401 | RIPEMD160S0E=9, 1402 | RIPEMD160S0F=8, 1403 | 1404 | RIPEMD160S10=7, 1405 | RIPEMD160S11=6, 1406 | RIPEMD160S12=8, 1407 | RIPEMD160S13=13, 1408 | RIPEMD160S14=11, 1409 | RIPEMD160S15=9, 1410 | RIPEMD160S16=7, 1411 | RIPEMD160S17=15, 1412 | RIPEMD160S18=7, 1413 | RIPEMD160S19=12, 1414 | RIPEMD160S1A=15, 1415 | RIPEMD160S1B=9, 1416 | RIPEMD160S1C=11, 1417 | RIPEMD160S1D=7, 1418 | RIPEMD160S1E=13, 1419 | RIPEMD160S1F=12, 1420 | 1421 | RIPEMD160S20=11, 1422 | RIPEMD160S21=13, 1423 | RIPEMD160S22=6, 1424 | RIPEMD160S23=7, 1425 | RIPEMD160S24=14, 1426 | RIPEMD160S25=9, 1427 | RIPEMD160S26=13, 1428 | RIPEMD160S27=15, 1429 | RIPEMD160S28=14, 1430 | RIPEMD160S29=8, 1431 | RIPEMD160S2A=13, 1432 | RIPEMD160S2B=6, 1433 | RIPEMD160S2C=5, 1434 | RIPEMD160S2D=12, 1435 | RIPEMD160S2E=7, 1436 | RIPEMD160S2F=5, 1437 | 1438 | RIPEMD160S30=11, 1439 | RIPEMD160S31=12, 1440 | RIPEMD160S32=14, 1441 | RIPEMD160S33=15, 1442 | RIPEMD160S34=14, 1443 | RIPEMD160S35=15, 1444 | RIPEMD160S36=9, 1445 | RIPEMD160S37=8, 1446 | RIPEMD160S38=9, 1447 | RIPEMD160S39=14, 1448 | RIPEMD160S3A=5, 1449 | RIPEMD160S3B=6, 1450 | RIPEMD160S3C=8, 1451 | RIPEMD160S3D=6, 1452 | RIPEMD160S3E=5, 1453 | RIPEMD160S3F=12, 1454 | 1455 | RIPEMD160S40=9, 1456 | RIPEMD160S41=15, 1457 | RIPEMD160S42=5, 1458 | RIPEMD160S43=11, 1459 | RIPEMD160S44=6, 1460 | RIPEMD160S45=8, 1461 | RIPEMD160S46=13, 1462 | RIPEMD160S47=12, 1463 | RIPEMD160S48=5, 1464 | RIPEMD160S49=12, 1465 | RIPEMD160S4A=13, 1466 | RIPEMD160S4B=14, 1467 | RIPEMD160S4C=11, 1468 | RIPEMD160S4D=8, 1469 | RIPEMD160S4E=5, 1470 | RIPEMD160S4F=6, 1471 | 1472 | RIPEMD160S50=8, 1473 | RIPEMD160S51=9, 1474 | RIPEMD160S52=9, 1475 | RIPEMD160S53=11, 1476 | RIPEMD160S54=13, 1477 | RIPEMD160S55=15, 1478 | RIPEMD160S56=15, 1479 | RIPEMD160S57=5, 1480 | RIPEMD160S58=7, 1481 | RIPEMD160S59=7, 1482 | RIPEMD160S5A=8, 1483 | RIPEMD160S5B=11, 1484 | RIPEMD160S5C=14, 
1485 | RIPEMD160S5D=14, 1486 | RIPEMD160S5E=12, 1487 | RIPEMD160S5F=6, 1488 | 1489 | RIPEMD160S60=9, 1490 | RIPEMD160S61=13, 1491 | RIPEMD160S62=15, 1492 | RIPEMD160S63=7, 1493 | RIPEMD160S64=12, 1494 | RIPEMD160S65=8, 1495 | RIPEMD160S66=9, 1496 | RIPEMD160S67=11, 1497 | RIPEMD160S68=7, 1498 | RIPEMD160S69=7, 1499 | RIPEMD160S6A=12, 1500 | RIPEMD160S6B=7, 1501 | RIPEMD160S6C=6, 1502 | RIPEMD160S6D=15, 1503 | RIPEMD160S6E=13, 1504 | RIPEMD160S6F=11, 1505 | 1506 | RIPEMD160S70=9, 1507 | RIPEMD160S71=7, 1508 | RIPEMD160S72=15, 1509 | RIPEMD160S73=11, 1510 | RIPEMD160S74=8, 1511 | RIPEMD160S75=6, 1512 | RIPEMD160S76=6, 1513 | RIPEMD160S77=14, 1514 | RIPEMD160S78=12, 1515 | RIPEMD160S79=13, 1516 | RIPEMD160S7A=5, 1517 | RIPEMD160S7B=14, 1518 | RIPEMD160S7C=13, 1519 | RIPEMD160S7D=13, 1520 | RIPEMD160S7E=7, 1521 | RIPEMD160S7F=5, 1522 | 1523 | RIPEMD160S80=15, 1524 | RIPEMD160S81=5, 1525 | RIPEMD160S82=8, 1526 | RIPEMD160S83=11, 1527 | RIPEMD160S84=14, 1528 | RIPEMD160S85=14, 1529 | RIPEMD160S86=6, 1530 | RIPEMD160S87=14, 1531 | RIPEMD160S88=6, 1532 | RIPEMD160S89=9, 1533 | RIPEMD160S8A=12, 1534 | RIPEMD160S8B=9, 1535 | RIPEMD160S8C=12, 1536 | RIPEMD160S8D=5, 1537 | RIPEMD160S8E=15, 1538 | RIPEMD160S8F=8, 1539 | 1540 | RIPEMD160S90=8, 1541 | RIPEMD160S91=5, 1542 | RIPEMD160S92=12, 1543 | RIPEMD160S93=9, 1544 | RIPEMD160S94=12, 1545 | RIPEMD160S95=5, 1546 | RIPEMD160S96=14, 1547 | RIPEMD160S97=6, 1548 | RIPEMD160S98=8, 1549 | RIPEMD160S99=13, 1550 | RIPEMD160S9A=6, 1551 | RIPEMD160S9B=5, 1552 | RIPEMD160S9C=15, 1553 | RIPEMD160S9D=13, 1554 | RIPEMD160S9E=11, 1555 | RIPEMD160S9F=11 1556 | 1557 | } ripemd160_constants_t; 1558 | 1559 | typedef enum keccak_constants 1560 | { 1561 | KECCAK_RNDC_00=0x0000000000000001UL, 1562 | KECCAK_RNDC_01=0x0000000000008082UL, 1563 | KECCAK_RNDC_02=0x800000000000808aUL, 1564 | KECCAK_RNDC_03=0x8000000080008000UL, 1565 | KECCAK_RNDC_04=0x000000000000808bUL, 1566 | KECCAK_RNDC_05=0x0000000080000001UL, 1567 | KECCAK_RNDC_06=0x8000000080008081UL, 1568 | KECCAK_RNDC_07=0x8000000000008009UL, 1569 | KECCAK_RNDC_08=0x000000000000008aUL, 1570 | KECCAK_RNDC_09=0x0000000000000088UL, 1571 | KECCAK_RNDC_10=0x0000000080008009UL, 1572 | KECCAK_RNDC_11=0x000000008000000aUL, 1573 | KECCAK_RNDC_12=0x000000008000808bUL, 1574 | KECCAK_RNDC_13=0x800000000000008bUL, 1575 | KECCAK_RNDC_14=0x8000000000008089UL, 1576 | KECCAK_RNDC_15=0x8000000000008003UL, 1577 | KECCAK_RNDC_16=0x8000000000008002UL, 1578 | KECCAK_RNDC_17=0x8000000000000080UL, 1579 | KECCAK_RNDC_18=0x000000000000800aUL, 1580 | KECCAK_RNDC_19=0x800000008000000aUL, 1581 | KECCAK_RNDC_20=0x8000000080008081UL, 1582 | KECCAK_RNDC_21=0x8000000000008080UL, 1583 | KECCAK_RNDC_22=0x0000000080000001UL, 1584 | KECCAK_RNDC_23=0x8000000080008008UL, 1585 | 1586 | KECCAK_PILN_00=10, 1587 | KECCAK_PILN_01=7, 1588 | KECCAK_PILN_02=11, 1589 | KECCAK_PILN_03=17, 1590 | KECCAK_PILN_04=18, 1591 | KECCAK_PILN_05=3, 1592 | KECCAK_PILN_06=5, 1593 | KECCAK_PILN_07=16, 1594 | KECCAK_PILN_08=8, 1595 | KECCAK_PILN_09=21, 1596 | KECCAK_PILN_10=24, 1597 | KECCAK_PILN_11=4, 1598 | KECCAK_PILN_12=15, 1599 | KECCAK_PILN_13=23, 1600 | KECCAK_PILN_14=19, 1601 | KECCAK_PILN_15=13, 1602 | KECCAK_PILN_16=12, 1603 | KECCAK_PILN_17=2, 1604 | KECCAK_PILN_18=20, 1605 | KECCAK_PILN_19=14, 1606 | KECCAK_PILN_20=22, 1607 | KECCAK_PILN_21=9, 1608 | KECCAK_PILN_22=6, 1609 | KECCAK_PILN_23=1, 1610 | 1611 | KECCAK_ROTC_00=1, 1612 | KECCAK_ROTC_01=3, 1613 | KECCAK_ROTC_02=6, 1614 | KECCAK_ROTC_03=10, 1615 | KECCAK_ROTC_04=15, 1616 | KECCAK_ROTC_05=21, 1617 | 
KECCAK_ROTC_06=28, 1618 | KECCAK_ROTC_07=36, 1619 | KECCAK_ROTC_08=45, 1620 | KECCAK_ROTC_09=55, 1621 | KECCAK_ROTC_10=2, 1622 | KECCAK_ROTC_11=14, 1623 | KECCAK_ROTC_12=27, 1624 | KECCAK_ROTC_13=41, 1625 | KECCAK_ROTC_14=56, 1626 | KECCAK_ROTC_15=8, 1627 | KECCAK_ROTC_16=25, 1628 | KECCAK_ROTC_17=43, 1629 | KECCAK_ROTC_18=62, 1630 | KECCAK_ROTC_19=18, 1631 | KECCAK_ROTC_20=39, 1632 | KECCAK_ROTC_21=61, 1633 | KECCAK_ROTC_22=20, 1634 | KECCAK_ROTC_23=44, 1635 | 1636 | } keccak_constants_t; 1637 | 1638 | typedef enum mysql323_constants 1639 | { 1640 | MYSQL323_A=0x50305735U, 1641 | MYSQL323_B=0x12345671U 1642 | 1643 | } mysql323_constants_t; 1644 | 1645 | typedef enum fortigate_constants 1646 | { 1647 | FORTIGATE_A=0x2eba88a3U, 1648 | FORTIGATE_B=0x4ab04c42U, 1649 | FORTIGATE_C=0xc1307953U, 1650 | FORTIGATE_D=0x3fcc0731U, 1651 | FORTIGATE_E=0x299032a1U, 1652 | FORTIGATE_F=0x705b81a9U 1653 | 1654 | } fortigate_constants_t; 1655 | 1656 | typedef enum blake2b_constants 1657 | { 1658 | BLAKE2B_IV_00=0x6a09e667f3bcc908UL, 1659 | BLAKE2B_IV_01=0xbb67ae8584caa73bUL, 1660 | BLAKE2B_IV_02=0x3c6ef372fe94f82bUL, 1661 | BLAKE2B_IV_03=0xa54ff53a5f1d36f1UL, 1662 | BLAKE2B_IV_04=0x510e527fade682d1UL, 1663 | BLAKE2B_IV_05=0x9b05688c2b3e6c1fUL, 1664 | BLAKE2B_IV_06=0x1f83d9abfb41bd6bUL, 1665 | BLAKE2B_IV_07=0x5be0cd19137e2179UL 1666 | 1667 | } blake2b_constants_t; 1668 | 1669 | typedef enum blake2s_constants 1670 | { 1671 | BLAKE2S_IV_00=0x6a09e667, 1672 | BLAKE2S_IV_01=0xbb67ae85, 1673 | BLAKE2S_IV_02=0x3c6ef372, 1674 | BLAKE2S_IV_03=0xa54ff53a, 1675 | BLAKE2S_IV_04=0x510e527f, 1676 | BLAKE2S_IV_05=0x9b05688c, 1677 | BLAKE2S_IV_06=0x1f83d9ab, 1678 | BLAKE2S_IV_07=0x5be0cd19 1679 | 1680 | } blake2s_constants_t; 1681 | 1682 | typedef enum sm3_constants 1683 | { 1684 | // SM3 Initial Hash Values 1685 | SM3_IV_A=0x7380166fUL, 1686 | SM3_IV_B=0x4914b2b9UL, 1687 | SM3_IV_C=0x172442d7UL, 1688 | SM3_IV_D=0xda8a0600UL, 1689 | SM3_IV_E=0xa96f30bcUL, 1690 | SM3_IV_F=0x163138aaUL, 1691 | SM3_IV_G=0xe38dee4dUL, 1692 | SM3_IV_H=0xb0fb0e4eUL, 1693 | 1694 | // SM3 Tj round constants 1695 | SM3_T00=0x79CC4519UL, 1696 | SM3_T01=0xF3988A32UL, 1697 | SM3_T02=0xE7311465UL, 1698 | SM3_T03=0xCE6228CBUL, 1699 | SM3_T04=0x9CC45197UL, 1700 | SM3_T05=0x3988A32FUL, 1701 | SM3_T06=0x7311465EUL, 1702 | SM3_T07=0xE6228CBCUL, 1703 | SM3_T08=0xCC451979UL, 1704 | SM3_T09=0x988A32F3UL, 1705 | SM3_T10=0x311465E7UL, 1706 | SM3_T11=0x6228CBCEUL, 1707 | SM3_T12=0xC451979CUL, 1708 | SM3_T13=0x88A32F39UL, 1709 | SM3_T14=0x11465E73UL, 1710 | SM3_T15=0x228CBCE6UL, 1711 | SM3_T16=0x9D8A7A87UL, 1712 | SM3_T17=0x3B14F50FUL, 1713 | SM3_T18=0x7629EA1EUL, 1714 | SM3_T19=0xEC53D43CUL, 1715 | SM3_T20=0xD8A7A879UL, 1716 | SM3_T21=0xB14F50F3UL, 1717 | SM3_T22=0x629EA1E7UL, 1718 | SM3_T23=0xC53D43CEUL, 1719 | SM3_T24=0x8A7A879DUL, 1720 | SM3_T25=0x14F50F3BUL, 1721 | SM3_T26=0x29EA1E76UL, 1722 | SM3_T27=0x53D43CECUL, 1723 | SM3_T28=0xA7A879D8UL, 1724 | SM3_T29=0x4F50F3B1UL, 1725 | SM3_T30=0x9EA1E762UL, 1726 | SM3_T31=0x3D43CEC5UL, 1727 | SM3_T32=0x7A879D8AUL, 1728 | SM3_T33=0xF50F3B14UL, 1729 | SM3_T34=0xEA1E7629UL, 1730 | SM3_T35=0xD43CEC53UL, 1731 | SM3_T36=0xA879D8A7UL, 1732 | SM3_T37=0x50F3B14FUL, 1733 | SM3_T38=0xA1E7629EUL, 1734 | SM3_T39=0x43CEC53DUL, 1735 | SM3_T40=0x879D8A7AUL, 1736 | SM3_T41=0x0F3B14F5UL, 1737 | SM3_T42=0x1E7629EAUL, 1738 | SM3_T43=0x3CEC53D4UL, 1739 | SM3_T44=0x79D8A7A8UL, 1740 | SM3_T45=0xF3B14F50UL, 1741 | SM3_T46=0xE7629EA1UL, 1742 | SM3_T47=0xCEC53D43UL, 1743 | SM3_T48=0x9D8A7A87UL, 1744 | SM3_T49=0x3B14F50FUL, 1745 | 
SM3_T50=0x7629EA1EUL, 1746 | SM3_T51=0xEC53D43CUL, 1747 | SM3_T52=0xD8A7A879UL, 1748 | SM3_T53=0xB14F50F3UL, 1749 | SM3_T54=0x629EA1E7UL, 1750 | SM3_T55=0xC53D43CEUL, 1751 | SM3_T56=0x8A7A879DUL, 1752 | SM3_T57=0x14F50F3BUL, 1753 | SM3_T58=0x29EA1E76UL, 1754 | SM3_T59=0x53D43CECUL, 1755 | SM3_T60=0xA7A879D8UL, 1756 | SM3_T61=0x4F50F3B1UL, 1757 | SM3_T62=0x9EA1E762UL, 1758 | SM3_T63=0x3D43CEC5UL 1759 | 1760 | } sm3_constants_t; 1761 | 1762 | typedef enum combinator_mode 1763 | { 1764 | COMBINATOR_MODE_BASE_LEFT = 10001, 1765 | COMBINATOR_MODE_BASE_RIGHT = 10002 1766 | 1767 | } combinator_mode_t; 1768 | 1769 | #ifdef KERNEL_STATIC 1770 | typedef struct digest 1771 | { 1772 | u32 digest_buf[DGST_ELEM]; 1773 | 1774 | } digest_t; 1775 | #endif 1776 | 1777 | typedef struct kernel_param 1778 | { 1779 | // We can only move attributes into this struct which do not use special declarations like __global 1780 | 1781 | u32 bitmap_mask; // 24 1782 | u32 bitmap_shift1; // 25 1783 | u32 bitmap_shift2; // 26 1784 | u32 salt_pos_host; // 27 1785 | u32 loop_pos; // 28 1786 | u32 loop_cnt; // 29 1787 | u32 il_cnt; // 30 1788 | u32 digests_cnt; // 31 1789 | u32 digests_offset_host; // 32 1790 | u32 combs_mode; // 33 1791 | u32 salt_repeat; // 34 1792 | u64 pws_pos; // 35 1793 | u64 gid_max; // 36 1794 | 1795 | } kernel_param_t; 1796 | 1797 | typedef struct salt 1798 | { 1799 | u32 salt_buf[64]; 1800 | u32 salt_buf_pc[64]; 1801 | 1802 | u32 salt_len; 1803 | u32 salt_len_pc; 1804 | u32 salt_iter; 1805 | u32 salt_iter2; 1806 | u32 salt_sign[2]; 1807 | u32 salt_repeats; 1808 | 1809 | u32 orig_pos; 1810 | 1811 | u32 digests_cnt; 1812 | u32 digests_done; 1813 | 1814 | u32 digests_offset; 1815 | 1816 | u32 scrypt_N; 1817 | u32 scrypt_r; 1818 | u32 scrypt_p; 1819 | 1820 | } salt_t; 1821 | 1822 | typedef struct 1823 | { 1824 | u32 key; 1825 | u64 val; 1826 | 1827 | } hcstat_table_t; 1828 | 1829 | typedef struct 1830 | { 1831 | u32 cs_buf[0x100]; 1832 | u32 cs_len; 1833 | 1834 | } cs_t; 1835 | 1836 | typedef struct 1837 | { 1838 | u32 cmds[32]; 1839 | 1840 | } kernel_rule_t; 1841 | 1842 | typedef struct pw 1843 | { 1844 | u32 i[64]; 1845 | 1846 | u32 pw_len; 1847 | 1848 | } pw_t; 1849 | 1850 | typedef struct pw_idx 1851 | { 1852 | u32 off; 1853 | u32 cnt; 1854 | u32 len; 1855 | 1856 | } pw_idx_t; 1857 | 1858 | typedef struct bf 1859 | { 1860 | u32 i; 1861 | 1862 | } bf_t; 1863 | 1864 | typedef struct bs_word 1865 | { 1866 | u32 b[32]; 1867 | 1868 | } bs_word_t; 1869 | 1870 | typedef struct plain 1871 | { 1872 | u64 gidvid; 1873 | u32 il_pos; 1874 | u32 salt_pos; 1875 | u32 digest_pos; 1876 | u32 hash_pos; 1877 | u32 extra1; 1878 | u32 extra2; 1879 | 1880 | } plain_t; 1881 | 1882 | typedef struct keyboard_layout_mapping 1883 | { 1884 | u32 src_char; 1885 | int src_len; 1886 | u32 dst_char; 1887 | int dst_len; 1888 | 1889 | } keyboard_layout_mapping_t; 1890 | 1891 | typedef struct hc_enc 1892 | { 1893 | int pos; // source offset 1894 | 1895 | u32 cbuf; // carry buffer 1896 | int clen; // carry length 1897 | 1898 | } hc_enc_t; 1899 | 1900 | #endif 1901 | --------------------------------------------------------------------------------
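The `keccak_constants_t` values dumped above are the standard Keccak-f[1600] round constants (`KECCAK_RNDC_*`), rotation offsets (`KECCAK_ROTC_*`), and pi-lane order (`KECCAK_PILN_*`). As a point of reference only, here is a minimal, self-contained C sketch of how such tables are typically consumed by the permutation that a Keccak-256 kernel builds on. This is not code from this repository; the names `keccakf`, `keccakf_rndc`, `keccakf_rotc`, `keccakf_piln` and `ROTL64` are illustrative.

```c
#include <stdint.h>

/* Rotate a 64-bit lane left by n bits (n is always 1..62 here). */
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

/* Same values as KECCAK_RNDC_00..23 in keccak_constants_t. */
static const uint64_t keccakf_rndc[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
  0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
  0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

/* Same values as KECCAK_ROTC_00..23 and KECCAK_PILN_00..23. */
static const int keccakf_rotc[24] = {
   1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
  27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44
};

static const int keccakf_piln[24] = {
  10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
  15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1
};

/* Full 24-round Keccak-f[1600] permutation over a 25-lane state. */
static void keccakf(uint64_t st[25])
{
  for (int r = 0; r < 24; r++)
  {
    uint64_t bc[5], t;

    /* Theta: column parities, then mix them back into every lane. */
    for (int i = 0; i < 5; i++)
      bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];

    for (int i = 0; i < 5; i++)
    {
      t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
      for (int j = 0; j < 25; j += 5) st[j + i] ^= t;
    }

    /* Rho and Pi: rotation offsets and lane order come straight from
       the KECCAK_ROTC_* / KECCAK_PILN_* tables. */
    t = st[1];
    for (int i = 0; i < 24; i++)
    {
      int j = keccakf_piln[i];
      bc[0]  = st[j];
      st[j]  = ROTL64(t, keccakf_rotc[i]);
      t      = bc[0];
    }

    /* Chi: non-linear row mixing. */
    for (int j = 0; j < 25; j += 5)
    {
      for (int i = 0; i < 5; i++) bc[i] = st[j + i];
      for (int i = 0; i < 5; i++) st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
    }

    /* Iota: fold in the round constant, i.e. KECCAK_RNDC_<r>. */
    st[0] ^= keccakf_rndc[r];
  }
}
```

The rho/pi step consumes one (ROTC, PILN) pair per lane and iota folds in one RNDC constant per round, which is why each of the three tables above has exactly 24 entries.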