├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── TROUBLESHOOTING.md ├── blake.c ├── blake.h ├── dump.co ├── haraka.c ├── haraka.h ├── haraka_portable.c ├── haraka_portable.h ├── haraka_portable.o ├── input.cl ├── main.c ├── main.o ├── param-nvidia.h ├── param.h ├── run ├── sa-solver.iobj ├── sa-solver.ipdb ├── sa-solver.pdb ├── sha256.c ├── sha256.h ├── sha256.o ├── silenarmy.spec ├── silentarmy ├── silentarmy.py ├── silentarmy.spec ├── verus.cl ├── verus_clhash.h ├── verus_clhash_portable.cpp ├── verus_hash.cpp └── verus_hash.h /.gitignore: -------------------------------------------------------------------------------- 1 | /.hg/ 2 | /.hgignore 3 | /sa-solver 4 | /_kernel.h 5 | *.o 6 | _temp_* 7 | *.swp 8 | *.tmp 9 | *.pyc 10 | tags 11 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Current tip 2 | 3 | * Implement mining.extranonce.subscribe (kenshirothefist) 4 | * Optimization: +10% speedup, increase collision items tracked per thread 5 | (nerdralph). 'make test' finds 196 sols again. 6 | 7 | # Version 5 (11 Nov 2016) 8 | 9 | * Optimization: major 2x speedup (eXtremal) by storing 8 atomic counters in 10 | 1 uint, and by reducing branch divergence when iterating over and XORing Xi's. 
11 | Note that as a result of these optimizations, sa-solver compiled with 12 | NR_ROWS_LOG=20 now only finds 182 out of 196 existing solutions ("make test" 13 | verification data was adjusted accordingly) 14 | * Defaulting OPTIM_SIMPLIFY_ROUND to 1; GPU memory usage down to 0.8 GB per 15 | instance 16 | * Optimization: significantly reduce CPU usage and PCIe bandwidth (before: 17 | ~100 MB/s/GPU, after: 0.5 MB/s/GPU), accomplished by filtering invalid 18 | solutions on-device 19 | * Optimization: reduce size of collisions[] array; +7% speed increase measured 20 | on RX 480 and R9 Nano using AMDGPU-PRO 16.40 21 | * Implement stratum method client.reconnect 22 | * Avoid segfault when encountering an out-of-range input 23 | * For simplicity `-i <header>
` now only accepts 140-byte headers 24 | * Update README.md with Nvidia performance numbers 25 | * Fix mining on Xeon Phi and CPUs (fix OpenCL warnings) 26 | * Fix compilation warnings and 32-bit platforms 27 | 28 | # Version 4 (08 Nov 2016) 29 | 30 | * Add Nvidia GPU support (fix more unaligned memory accesses) 31 | * Add nerdralph's optimization (OPTIM_SIMPLIFY_ROUND) for potential +30% 32 | speedup, especially useful on Nvidia GPUs 33 | * Drop the Python 3.5 dependency; now requires only Python 3.3 or above (lhl) 34 | * Drop the libsodium dependency; instead use our own SHA256 implementation 35 | * Add nicehash compatibility (stratum servers fixing 17 bytes of the nonce) 36 | * Only apply set_target to *next* mining job 37 | * Do not abandon previous mining jobs if clean_jobs is false 38 | * Fix KeyError's when displaying stats 39 | * Be more robust about different types of network errors during connection 40 | * Remove bytes.hex() which was only supported on Python 3.5+. 41 | 42 | # Version 3 (04 Nov 2016) 43 | 44 | * SILENTARMY is now a full miner, not just a solver; the solver binary was 45 | renamed "sa-solver" and the miner is the script "silentarmy" 46 | * Multi-GPU support 47 | * Stratum support for pool mining 48 | * Reduce GPU memory usage to 671 MB (NR_ROWS_LOG=19) or 1208 MB 49 | (NR_ROWS_LOG=20, default, ~10% faster than 19) per Equihash instance 50 | * Rename --list-gpu to --list and list all OpenCL devices (not just GPUs) 51 | * Add support for multiple OpenCL platforms: --list now scans all available 52 | platforms, numbering devices using globally unique IDs 53 | * Improve correctness: find ~0.09% more solutions 54 | 55 | # Version 2 (30 Oct 2016) 56 | 57 | * Support GCN 1.0 / remove unaligned memory accesses (because of this bug, 58 | previously SILENTARMY always reported 0 solutions on GCN 1.0 hardware) 59 | * Minor performance improvement (~1%) 60 | * Get rid of "kernel.cl" and move the OpenCL code to a C string embedded in the 61 | binary 
during compilation 62 | * Update README with instructions for installing 63 | **Radeon Software Crimson Edition** (fglrx.ko) in addition to 64 | **AMDGPU-PRO** (amdgpu.ko) 65 | 66 | # Version 1 (27 Oct 2016) 67 | 68 | * Initial import into GitHub 69 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Change this path if the SDK was installed in a non-standard location 2 | OPENCL_HEADERS = "/opt/AMDAPPSDK-3.0/include" 3 | # By default libOpenCL.so is searched in default system locations, this path 4 | # lets you adds one more directory to the search path. 5 | LIBOPENCL = "/opt/amdgpu-pro/lib/x86_64-linux-gnu" 6 | 7 | CC = gcc 8 | CPPFLAGS = -I${OPENCL_HEADERS} 9 | CFLAGS = -O2 -std=gnu99 -pedantic -Wextra -Wall \ 10 | -Wno-deprecated-declarations \ 11 | -Wno-overlength-strings \ 12 | -Wno-unused-parameter 13 | LDFLAGS = -rdynamic -L${LIBOPENCL} 14 | LDLIBS = -lOpenCL 15 | OBJ = main.o haraka_portable.o sha256.o 16 | INCLUDES = param.h _kernel.h sha256.h haraka_portable.h 17 | 18 | all : sa-solver 19 | 20 | sa-solver : ${OBJ} 21 | ${CC} -o sa-solver ${OBJ} ${LDFLAGS} ${LDLIBS} 22 | 23 | ${OBJ} : ${INCLUDES} 24 | 25 | _kernel.h : input.cl param.h 26 | echo 'const char *ocl_code = R"_mrb_(' >$@ 27 | cpp $< >>$@ 28 | echo ')_mrb_";' >>$@ 29 | 30 | test : sa-solver 31 | @echo Testing... 
32 | @if res=`./sa-solver --nonces 100 -v -v 2>&1 | grep Soln: | \ 33 | diff -u testing/sols-100 -`; then \ 34 | echo "Test: success"; \ 35 | else \ 36 | echo "$$res\nTest: FAILED" | cut -c 1-75 >&2; \ 37 | fi 38 | # When compiling with NR_ROWS_LOG != 20, the solutions it finds are 39 | # different: testing/sols-100 40 | 41 | clean : 42 | rm -f sa-solver _kernel.h *.o _temp_* 43 | 44 | re : clean all 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SILENTARMY Standalone Version for Windows x86_64 2 | 3 | [Download precompiled binaries (v5-win64standalone-r12)](https://github.com/zawawawa/silentarmy/releases/tag/v5-win64standalone-r12) (Developmental) 4 | 5 | [Download precompiled binaries (v5-win64standalone-r7)](https://github.com/zawawawa/silentarmy/releases/download/v5-win64standalone-r7/silentarmy-v5-win64standalone-r7.zip) (Stable) 6 | 7 | This is a standalone Windows x86_64 port of SILENTARMY v5, which does not require Python, based on [Genoil's Windows port](https://github.com/Genoil/silentarmy/tree/windows). All you have to do for mining is to run `list.bat` to get device ID's and edit and run `silentarmy.bat`. MAKE SURE TO SPECIFY CORRECT DEVICE IDS WITH THE `--use` OPTION! See the documentation of the original SILENTARMY below for details. 8 | 9 | You may get the following error if you have missing DLL's on your system: `The application was unable to start correctly (0xc000007b)` In this case, you need to delete `vcruntime140.dll` in the package and install [Visual C++ 2015 Redistribution Package](https://www.microsoft.com/en-us/download/details.aspx?id=48145). 
10 | 11 | If you find this port useful and/or would like to see a feature-rich ZEC miner based on it, please consider donations to: `t1NwUDeSKu4BxkD58mtEYKDjzw5toiLfmCu` 12 | 13 | Last but not least, mrb, nerdralph, eXtremal, and Genoil, thank you all so much for the great work. You guys are 14 | truly the cream of the FOSS movement. 15 | 16 | zawawa @ bitcointalk.org 17 | 18 | # SILENTARMY 19 | 20 | Official site: https://github.com/mbevand/silentarmy 21 | 22 | SILENTARMY is a free open source [Zcash](https://z.cash) miner for Linux 23 | with multi-GPU and [Stratum](https://github.com/str4d/zips/blob/77-zip-stratum/drafts/str4d-stratum/draft1.rst) support. It is written in OpenCL and has been tested 24 | on AMD/Nvidia/Intel GPUs, Xeon Phi, and more. 25 | 26 | After compiling SILENTARMY, list the available OpenCL devices: 27 | 28 | ``` 29 | $ silentarmy --list 30 | ``` 31 | 32 | Start mining with two GPUs (ID 2 and ID 5) on a pool: 33 | 34 | ``` 35 | $ silentarmy --use 2,5 -c stratum+tcp://us1-zcash.flypool.org:3333 -u t1cVviFvgJinQ4w3C2m2CfRxgP5DnHYaoFC 36 | ``` 37 | 38 | When run without options, SILENTARMY mines with the first OpenCL device, using 39 | my donation address, on flypool: 40 | 41 | ``` 42 | $ silentarmy 43 | Connecting to us1-zcash.flypool.org:3333 44 | Stratum server sent us the first job 45 | Mining on 1 device 46 | Total 0.0 sol/s [dev0 0.0] 0 shares 47 | Total 43.9 sol/s [dev0 43.9] 0 shares 48 | Total 46.9 sol/s [dev0 46.9] 0 shares 49 | Total 44.9 sol/s [dev0 44.9] 1 share 50 | [...] 51 | ``` 52 | 53 | Usage: 54 | 55 | ``` 56 | $ silentarmy --help 57 | Usage: silentarmy [options] 58 | 59 | Options: 60 | -h, --help show this help message and exit 61 | -v, --verbose verbose mode (may be repeated for more verbosity) 62 | --debug enable debug mode (for developers only) 63 | --list list available OpenCL devices by ID (GPUs...) 
64 | --use=LIST use specified GPU device IDs to mine, for example to 65 | use the first three: 0,1,2 (default: 0) 66 | --instances=N run N instances of Equihash per GPU (default: 2) 67 | -c POOL, --connect=POOL 68 | connect to POOL, for example: 69 | stratum+tcp://example.com:1234 70 | -u USER, --user=USER username for connecting to the pool 71 | -p PWD, --pwd=PWD password for connecting to the pool 72 | ``` 73 | 74 | # Performance 75 | 76 | * 115 sol/s with one R9 Nano 77 | * 75 sol/s with one RX 480 8GB 78 | * 70 sol/s with one GTX 1070 79 | 80 | See [TROUBLESHOOTING.md](TROUBLESHOOTING.md#performance) to resolve performance 81 | issues. 82 | 83 | Note: the `silentarmy` **miner** automatically achieves this performance level, 84 | however the `sa-solver` **command-line solver** by design runs only 1 instance 85 | of the Equihash proof-of-work algorithm causing it to slightly underperform by 86 | 5-10%. One must manually run 2 instances of `sa-solver` (eg. in 2 terminal 87 | consoles) to achieve the same performance level as the `silentarmy` **miner**. 88 | 89 | # Compilation and installation 90 | 91 | The steps below describe how to obtain the dependencies needed by SILENTARMY, 92 | how to compile it, and how to install it. 93 | 94 | ## Step 1: OpenCL 95 | 96 | OpenCL support comes with the graphic card driver. Read the appropriate 97 | subsection below: 98 | 99 | ### Ubuntu 16.04 / amdgpu 100 | 101 | 1. Download the [AMDGPU-PRO Driver](http://support.amd.com/en-us/kb-articles/Pages/AMDGPU-PRO-Install.aspx) 102 | (as of 30 Oct 2016, the latest version is 16.40). 103 | 104 | 2. Extract it: 105 | `$ tar xf amdgpu-pro-16.40-348864.tar.xz` 106 | 107 | 3. Install (non-root, will use sudo access automatically): 108 | `$ ./amdgpu-pro-install` 109 | 110 | 4. Add yourself to the video group if not already a member: 111 | `$ sudo gpasswd -a $(whoami) video` 112 | 113 | 5. Reboot 114 | 115 | 6. 
Download the [AMD APP SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/) 116 | (as of 27 Oct 2016, the latest version is 3.0) 117 | 118 | 7. Extract it: 119 | `$ tar xf AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2` 120 | 121 | 8. Install system-wide by running as root (accept all the default options): 122 | `$ sudo ./AMD-APP-SDK-v3.0.130.136-GA-linux64.sh` 123 | 124 | ### Ubuntu 14.04 / fglrx 125 | 126 | 1. Install the official Ubuntu package for the **Radeon Software Crimson 127 | Edition** driver: 128 | `$ sudo apt-get install fglrx` 129 | (as of 30 Oct 2016, the latest version is 2:15.201-0ubuntu0.14.04.1) 130 | 131 | 2. Follow steps 5-8 above: reboot, install the AMD APP SDK... 132 | 133 | ### Ubuntu 16.04 / Nvidia 134 | 135 | 1. Install the OpenCL development files and the latest driver: 136 | `$ sudo apt-get install nvidia-opencl-dev nvidia-361` 137 | 138 | 2. Either reboot, or load the kernel driver: 139 | `$ sudo modprobe nvidia_361` 140 | 141 | ## Step 2: Python 3.3 142 | 143 | 1. SILENTARMY requires Python 3.3 or later (needed to support the use of the 144 | `yield from` syntax). On Ubuntu/Debian systems: 145 | `$ sudo apt-get install python3` 146 | 147 | 2. Verify the Python version is 3.3 or later: 148 | `$ python3 -V` 149 | 150 | ## Step 3: C compiler 151 | 152 | 1. A C compiler is needed to compile the SILENTARMY solver binary (`sa-solver`): 153 | `$ sudo apt-get install build-essential` 154 | 155 | ## Step 4: Get SILENTARMY 156 | 157 | Download it as a ZIP from github: https://github.com/mbevand/silentarmy/archive/master.zip 158 | 159 | Or clone it from the command line: 160 | `$ git clone https://github.com/mbevand/silentarmy.git` 161 | 162 | Or, for Arch Linux users, get the 163 | [silentarmy AUR package](https://aur.archlinux.org/packages/silentarmy/). 
164 | 165 | ## Step 5: Compile and install 166 | 167 | Compiling SILENTARMY is easy: 168 | 169 | `$ make` 170 | 171 | You may need to specify the paths to the locations of your OpenCL C headers 172 | and libOpenCL.so if the compiler does not find them, eg.: 173 | 174 | `$ make OPENCL_HEADERS=/usr/local/cuda-8.0/targets/x86_64-linux/include LIBOPENCL=/usr/local/cuda-8.0/targets/x86_64-linux/lib` 175 | 176 | Self-testing the command-line solver (solves 100 all-zero 140-byte blocks with 177 | their nonces varying from 0 to 99): 178 | 179 | `$ make test` 180 | 181 | For more testing run `sa-solver --nonces 10000`. It should find 18627 182 | solutions which is less than 1% off the theoretical expected average number of 183 | solutions of 1.88 per Equihash run at (n,k)=(200,9). 184 | 185 | For installing, just copy `silentarmy` and `sa-solver` to the same directory. 186 | 187 | # Equihash solver 188 | 189 | SILENTARMY also provides a command line Equihash solver (`sa-solver`) 190 | implementing the CLI API described in the 191 | [Zcash open source miner challenge](https://zcashminers.org/rules). 192 | To solve a specific block header and print the encoded solution on stdout, run 193 | the following command (this header is from 194 | [mainnet block #3400](https://explorer.zcha.in/blocks/00000001687e89e7e1ce48b349e601c89c70dd4c268fdf24b269a3ca4140426f) 195 | and should result in 1 Equihash solution): 196 | 197 | ``` 198 | $ sa-solver -i 04000000e54c27544050668f272ec3b460e1cde745c6b21239a81dae637fde4704000000844bc0c55696ef9920eeda11c1eb41b0c2e7324b46cc2e7aa0c2aa7736448d7a000000000000000000000000000000000000000000000000000000000000000068241a587e7e061d250e000000000000010000000000000000000000000000000000000000000000 199 | ``` 200 | 201 | If the option `-i` is not specified, `sa-solver` solves a 140-byte header of all 202 | zero bytes. The option `--nonces <nr>` instructs the program to try multiple 203 | nonces, each time incrementing the nonce by 1. 
So a convenient way to run a 204 | quick test/benchmark is simply: 205 | 206 | `$ sa-solver --nonces 100` 207 | 208 | Note: due to BLAKE2b optimizations in my implementation, if the header is 209 | specified it must be 140 bytes and its last 12 bytes **must** be zero. 210 | 211 | Use the verbose (`-v`) and very verbose (`-v -v`) options to show the solutions 212 | and statistics in progressively more and more details. 213 | 214 | # Implementation details 215 | 216 | The `silentarmy` Python script is actually mostly a lightweight Stratum 217 | implementation which launches in the background one or more instances of 218 | `sa-solver --mining` per GPU. This "mining mode" enables `sa-solver` to 219 | communicate with `silentarmy` using stdin/stdout. By default 2 instances of 220 | `sa-solver` are launched for each GPU (this can be changed with the `silentarmy 221 | --instances N` option.) 2 instances per GPU usually results in the best 222 | performance. 223 | 224 | The `sa-solver` binary invokes the OpenCL kernel which contains the core of the 225 | Equihash algorithm. My implementation uses two hash tables to avoid having to 226 | sort the (Xi,i) pairs: 227 | 228 | * Round 0 (BLAKE2b) fills up table #0 229 | * Round 1 reads table #0, identifies collisions, XORs the Xi's, stores 230 | the results in table #1 231 | * Round 2 reads table #1 and fills up table #0 (reusing it) 232 | * Round 3 reads table #0 and fills up table #1 (also reusing it) 233 | * ... 234 | * Round 8 (last round) reads table #1 and fills up table #0. 235 | 236 | Only the non-zero parts of Xi are stored in the hash table, so fewer and fewer 237 | bytes are needed to store Xi as we progress toward round 8. For a description 238 | of the layout of the hash table, see the comment at the top of `input.cl`. 
239 | 240 | Also the code implements the notion of "encoded reference to inputs" which 241 | I--apparently like most authors of Equihash solvers--independently discovered 242 | as a neat trick to save having to read/write so much data. Instead of saving 243 | lists of inputs that double in size every round, SILENTARMY re-uses the fact 244 | they were stored in the previous hash table, and saves a reference to the two 245 | previous inputs, encoded as a (row,slot0,slot1) where (row,slot0) and 246 | (row,slot1) themselves are each a reference to 2 previous inputs, and so on, 247 | until round 0 where the inputs are just the 21-bit values. 248 | 249 | A BLAKE2b optimization implemented by SILENTARMY requires the last 12 bytes of 250 | the nonce/header to be zero. When set to a fixed value like zero, not only the 251 | code does not need to implement the "sigma" permutations, but many 64-bit 252 | additions in the BLAKE2b mix() function can be pre-computed automatically by 253 | the OpenCL compiler. 254 | 255 | Managing invalid solutions (duplicate inputs) is done in multiple places: 256 | 257 | * Any time a XOR results in an all-zero value, this work item is discarded 258 | as it is statistically very unlikely that the XOR of 256 or fewer inputs 259 | is zero. This check is implemented at the end of `xor_and_store()` 260 | * When the final hash table produced at round 8 has many elements 261 | that collide in the same row (because bits 160-179 are identical, and 262 | almost certainly bits 180-199), this is also discarded as a likely invalid 263 | solution because this is statistically guaranteed to be all inputs repeated 264 | at least once. This check is implemented in `kernel_sols()` (see 265 | `likely_invalids`.) 266 | * When input references are expanded on-GPU by `expand_refs()`, the code 267 | checks if the last (512th) input is repeated at least once. 
268 | * Finally when the GPU returns potential solutions, the CPU also checks for 269 | invalid solutions with duplicate inputs. This check is implemented in 270 | `verify_sol()`. 271 | 272 | Finally, SILENTARMY makes many optimization assumptions and currently only 273 | supports Equihash parameters 200,9. 274 | 275 | # Author 276 | 277 | Marc Bevand -- [http://zorinaq.com](http://zorinaq.com) 278 | 279 | Donations welcome: t1cVviFvgJinQ4w3C2m2CfRxgP5DnHYaoFC 280 | 281 | # Thanks 282 | 283 | I would like to thank these persons for their contributions to SILENTARMY, 284 | in alphabetical order: 285 | * eXtremal 286 | * kenshirothefist 287 | * lhl 288 | * nerdralph 289 | * poiuty 290 | * solardiz 291 | 292 | -------------------------------------------------------------------------------- /TROUBLESHOOTING.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | Follow this checklist to verify that your entire hardware and software 4 | stack works (drivers, OpenCL, SILENTARMY). 5 | 6 | ## Driver / OpenCL installation 7 | 8 | Run `clinfo` to list all the OpenCL devices. If it does not find all your 9 | devices, something is wrong with your drivers and/or OpenCL stack. Uninstall 10 | and reinstall your drivers. Here are good instructions: 11 | https://hashcat.net/wiki/doku.php?id=frequently_asked_questions#i_may_have_the_wrong_driver_installed_what_should_i_do 12 | 13 | ## Check silentarmy 14 | 15 | Does `./silentarmy --list` list your devices? If `clinfo` does, silentarmy 16 | should list them as well. 17 | 18 | ## Basic operation 19 | 20 | Run the Equihash solver `sa-solver` to solve the all-zero block. It should 21 | report 2 solutions. Specify the device ID to test with `--use ID` 22 | 23 | ``` 24 | $ ./sa-solver --use 0 25 | Solving default all-zero 140-byte header 26 | Building program 27 | Hash tables will use 805.3 MB 28 | Running... 
29 | Nonce 0000000000000000000000000000000000000000000000000000000000000000: 2 sols 30 | Total 2 solutions in 205.3 ms (9.7 Sol/s) 31 | ``` 32 | 33 | Note that `sa-solver` only supports 1 device at a time. It will not recognize 34 | eg. `--use 0,1,2`. 35 | 36 | ## Correct results 37 | 38 | Verify that `make test` succeeds. It should take between 5 and 60 seconds 39 | depending on your GPU: 40 | 41 | ``` 42 | $ make test 43 | Testing... 44 | Test: success 45 | ``` 46 | 47 | ## Sustained operation on one device 48 | 49 | Let the Equihash solver `sa-solver` run for multiple hours: 50 | 51 | ``` 52 | $ ./sa-solver --nonces 100000000 53 | Solving default all-zero 140-byte header 54 | Building program 55 | Hash tables will use 1208.0 MB 56 | Running... 57 | Nonce 0000000000000000000000000000000000000000000000000000000000000000: 2 sols 58 | Nonce 0100000000000000000000000000000000000000000000000000000000000000: 0 sols 59 | ... 60 | ``` 61 | 62 | It should not crash or hang. 63 | 64 | ## Mining 65 | 66 | Run the miner without options. By default it will use the first device, 67 | and connect to flypool with my donation address. These known-good parameters 68 | should let you know easily if your machine can mine properly: 69 | 70 | ``` 71 | $ ./silentarmy 72 | Connecting to us1-zcash.flypool.org:3333 73 | Stratum server sent us the first job 74 | Mining on 1 device 75 | Total 0.0 sol/s [dev0 0.0] 0 shares 76 | Total 48.9 sol/s [dev0 48.9] 1 share 77 | Total 44.9 sol/s [dev0 44.9] 1 share 78 | ... 79 | ``` 80 | 81 | Verify that the number of shares increases over time. 82 | 83 | ## Performance 84 | 85 | Not reaching the sol/s performance you expected? 86 | 87 | * Try running a different number of instances using the `silentarmy --instances 88 | N` argument. Try 1, 2, 3, or more. Note that each instance requires 805 MB of 89 | GPU memory. 
90 | * If 1 instance still requires more GPU memory than available, edit `param.h` 91 | and set `NR_ROWS_LOG` to `19` (this reduces the per-instance memory usage 92 | to 671 MB) and run with `--instances 1`. 93 | * By default SILENTARMY mines with only one device/GPU; make sure to specify 94 | all the GPUs in the `--use` option, for example `silentarmy --use 0,1,2` 95 | if the host has three devices with IDs 0, 1, and 2. 96 | * Update your graphics card driver. The OpenCL compiler comes with the driver 97 | and occasionally new driver versions significantly tweak or improve it. 98 | -------------------------------------------------------------------------------- /blake.c: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <string.h> 3 | #include <assert.h> 4 | #include "blake.h" 5 | 6 | static const uint32_t blake2b_block_len = 128; 7 | static const uint32_t blake2b_rounds = 12; 8 | static const uint64_t blake2b_iv[8] = 9 | { 10 | 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 11 | 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, 12 | 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 13 | 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, 14 | }; 15 | static const uint8_t blake2b_sigma[12][16] = 16 | { 17 | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 18 | { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 19 | { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, 20 | { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, 21 | { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, 22 | { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, 23 | { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, 24 | { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, 25 | { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, 26 | { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, 27 | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 28 | { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 
29 | }; 30 | 31 | /* 32 | ** Init the state according to Zcash parameters. 33 | */ 34 | void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, 35 | uint32_t n, uint32_t k) 36 | { 37 | assert(n > k); 38 | assert(hash_len <= 64); 39 | st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); 40 | for (uint32_t i = 1; i <= 5; i++) 41 | st->h[i] = blake2b_iv[i]; 42 | st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; 43 | st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); 44 | st->bytes = 0; 45 | } 46 | 47 | static uint64_t rotr64(uint64_t a, uint8_t bits) 48 | { 49 | return (a >> bits) | (a << (64 - bits)); 50 | } 51 | 52 | static void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, 53 | uint64_t x, uint64_t y) 54 | { 55 | *va = (*va + *vb + x); 56 | *vd = rotr64(*vd ^ *va, 32); 57 | *vc = (*vc + *vd); 58 | *vb = rotr64(*vb ^ *vc, 24); 59 | *va = (*va + *vb + y); 60 | *vd = rotr64(*vd ^ *va, 16); 61 | *vc = (*vc + *vd); 62 | *vb = rotr64(*vb ^ *vc, 63); 63 | } 64 | 65 | /* 66 | ** Process either a full message block or the final partial block. 67 | ** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow. 68 | ** 69 | ** _msg pointer to message (must be zero-padded to 128 bytes if final block) 70 | ** msg_len must be 128 (<= 128 allowed only for final partial block) 71 | ** is_final indicate if this is the final block 72 | */ 73 | void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, 74 | uint32_t msg_len, uint32_t is_final) 75 | { 76 | const uint64_t *m = (const uint64_t *)_msg; 77 | uint64_t v[16]; 78 | assert(msg_len <= 128); 79 | assert(st->bytes <= UINT64_MAX - msg_len); 80 | memcpy(v + 0, st->h, 8 * sizeof (*v)); 81 | memcpy(v + 8, blake2b_iv, 8 * sizeof (*v)); 82 | v[12] ^= (st->bytes += msg_len); 83 | v[14] ^= is_final ? 
-1 : 0; 84 | for (uint32_t round = 0; round < blake2b_rounds; round++) 85 | { 86 | const uint8_t *s = blake2b_sigma[round]; 87 | mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); 88 | mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); 89 | mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); 90 | mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); 91 | mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); 92 | mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); 93 | mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); 94 | mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); 95 | } 96 | for (uint32_t i = 0; i < 8; i++) 97 | st->h[i] ^= v[i] ^ v[i + 8]; 98 | } 99 | 100 | void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) 101 | { 102 | assert(outlen <= 64); 103 | memcpy(out, st->h, outlen); 104 | } 105 | -------------------------------------------------------------------------------- /blake.h: -------------------------------------------------------------------------------- 1 | typedef struct blake2b_state_s 2 | { 3 | uint64_t h[8]; 4 | uint64_t bytes; 5 | } blake2b_state_t; 6 | void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, 7 | uint32_t n, uint32_t k); 8 | void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, 9 | uint32_t msg_len, uint32_t is_final); 10 | void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen); 11 | -------------------------------------------------------------------------------- /haraka.c: -------------------------------------------------------------------------------- 1 | /* 2 | Plain C implementation of the Haraka256 and Haraka512 permutations. 
3 | */ 4 | #include 5 | #include 6 | #include 7 | 8 | #include "haraka.h" 9 | 10 | #define HARAKAS_RATE 32 11 | 12 | static const unsigned char haraka_rc[40][16] = { 13 | {0x9d, 0x7b, 0x81, 0x75, 0xf0, 0xfe, 0xc5, 0xb2, 0x0a, 0xc0, 0x20, 0xe6, 0x4c, 0x70, 0x84, 0x06}, 14 | {0x17, 0xf7, 0x08, 0x2f, 0xa4, 0x6b, 0x0f, 0x64, 0x6b, 0xa0, 0xf3, 0x88, 0xe1, 0xb4, 0x66, 0x8b}, 15 | {0x14, 0x91, 0x02, 0x9f, 0x60, 0x9d, 0x02, 0xcf, 0x98, 0x84, 0xf2, 0x53, 0x2d, 0xde, 0x02, 0x34}, 16 | {0x79, 0x4f, 0x5b, 0xfd, 0xaf, 0xbc, 0xf3, 0xbb, 0x08, 0x4f, 0x7b, 0x2e, 0xe6, 0xea, 0xd6, 0x0e}, 17 | {0x44, 0x70, 0x39, 0xbe, 0x1c, 0xcd, 0xee, 0x79, 0x8b, 0x44, 0x72, 0x48, 0xcb, 0xb0, 0xcf, 0xcb}, 18 | {0x7b, 0x05, 0x8a, 0x2b, 0xed, 0x35, 0x53, 0x8d, 0xb7, 0x32, 0x90, 0x6e, 0xee, 0xcd, 0xea, 0x7e}, 19 | {0x1b, 0xef, 0x4f, 0xda, 0x61, 0x27, 0x41, 0xe2, 0xd0, 0x7c, 0x2e, 0x5e, 0x43, 0x8f, 0xc2, 0x67}, 20 | {0x3b, 0x0b, 0xc7, 0x1f, 0xe2, 0xfd, 0x5f, 0x67, 0x07, 0xcc, 0xca, 0xaf, 0xb0, 0xd9, 0x24, 0x29}, 21 | {0xee, 0x65, 0xd4, 0xb9, 0xca, 0x8f, 0xdb, 0xec, 0xe9, 0x7f, 0x86, 0xe6, 0xf1, 0x63, 0x4d, 0xab}, 22 | {0x33, 0x7e, 0x03, 0xad, 0x4f, 0x40, 0x2a, 0x5b, 0x64, 0xcd, 0xb7, 0xd4, 0x84, 0xbf, 0x30, 0x1c}, 23 | {0x00, 0x98, 0xf6, 0x8d, 0x2e, 0x8b, 0x02, 0x69, 0xbf, 0x23, 0x17, 0x94, 0xb9, 0x0b, 0xcc, 0xb2}, 24 | {0x8a, 0x2d, 0x9d, 0x5c, 0xc8, 0x9e, 0xaa, 0x4a, 0x72, 0x55, 0x6f, 0xde, 0xa6, 0x78, 0x04, 0xfa}, 25 | {0xd4, 0x9f, 0x12, 0x29, 0x2e, 0x4f, 0xfa, 0x0e, 0x12, 0x2a, 0x77, 0x6b, 0x2b, 0x9f, 0xb4, 0xdf}, 26 | {0xee, 0x12, 0x6a, 0xbb, 0xae, 0x11, 0xd6, 0x32, 0x36, 0xa2, 0x49, 0xf4, 0x44, 0x03, 0xa1, 0x1e}, 27 | {0xa6, 0xec, 0xa8, 0x9c, 0xc9, 0x00, 0x96, 0x5f, 0x84, 0x00, 0x05, 0x4b, 0x88, 0x49, 0x04, 0xaf}, 28 | {0xec, 0x93, 0xe5, 0x27, 0xe3, 0xc7, 0xa2, 0x78, 0x4f, 0x9c, 0x19, 0x9d, 0xd8, 0x5e, 0x02, 0x21}, 29 | {0x73, 0x01, 0xd4, 0x82, 0xcd, 0x2e, 0x28, 0xb9, 0xb7, 0xc9, 0x59, 0xa7, 0xf8, 0xaa, 0x3a, 0xbf}, 30 | {0x6b, 0x7d, 0x30, 0x10, 0xd9, 0xef, 0xf2, 0x37, 0x17, 0xb0, 0x86, 0x61, 
0x0d, 0x70, 0x60, 0x62}, 31 | {0xc6, 0x9a, 0xfc, 0xf6, 0x53, 0x91, 0xc2, 0x81, 0x43, 0x04, 0x30, 0x21, 0xc2, 0x45, 0xca, 0x5a}, 32 | {0x3a, 0x94, 0xd1, 0x36, 0xe8, 0x92, 0xaf, 0x2c, 0xbb, 0x68, 0x6b, 0x22, 0x3c, 0x97, 0x23, 0x92}, 33 | {0xb4, 0x71, 0x10, 0xe5, 0x58, 0xb9, 0xba, 0x6c, 0xeb, 0x86, 0x58, 0x22, 0x38, 0x92, 0xbf, 0xd3}, 34 | {0x8d, 0x12, 0xe1, 0x24, 0xdd, 0xfd, 0x3d, 0x93, 0x77, 0xc6, 0xf0, 0xae, 0xe5, 0x3c, 0x86, 0xdb}, 35 | {0xb1, 0x12, 0x22, 0xcb, 0xe3, 0x8d, 0xe4, 0x83, 0x9c, 0xa0, 0xeb, 0xff, 0x68, 0x62, 0x60, 0xbb}, 36 | {0x7d, 0xf7, 0x2b, 0xc7, 0x4e, 0x1a, 0xb9, 0x2d, 0x9c, 0xd1, 0xe4, 0xe2, 0xdc, 0xd3, 0x4b, 0x73}, 37 | {0x4e, 0x92, 0xb3, 0x2c, 0xc4, 0x15, 0x14, 0x4b, 0x43, 0x1b, 0x30, 0x61, 0xc3, 0x47, 0xbb, 0x43}, 38 | {0x99, 0x68, 0xeb, 0x16, 0xdd, 0x31, 0xb2, 0x03, 0xf6, 0xef, 0x07, 0xe7, 0xa8, 0x75, 0xa7, 0xdb}, 39 | {0x2c, 0x47, 0xca, 0x7e, 0x02, 0x23, 0x5e, 0x8e, 0x77, 0x59, 0x75, 0x3c, 0x4b, 0x61, 0xf3, 0x6d}, 40 | {0xf9, 0x17, 0x86, 0xb8, 0xb9, 0xe5, 0x1b, 0x6d, 0x77, 0x7d, 0xde, 0xd6, 0x17, 0x5a, 0xa7, 0xcd}, 41 | {0x5d, 0xee, 0x46, 0xa9, 0x9d, 0x06, 0x6c, 0x9d, 0xaa, 0xe9, 0xa8, 0x6b, 0xf0, 0x43, 0x6b, 0xec}, 42 | {0xc1, 0x27, 0xf3, 0x3b, 0x59, 0x11, 0x53, 0xa2, 0x2b, 0x33, 0x57, 0xf9, 0x50, 0x69, 0x1e, 0xcb}, 43 | {0xd9, 0xd0, 0x0e, 0x60, 0x53, 0x03, 0xed, 0xe4, 0x9c, 0x61, 0xda, 0x00, 0x75, 0x0c, 0xee, 0x2c}, 44 | {0x50, 0xa3, 0xa4, 0x63, 0xbc, 0xba, 0xbb, 0x80, 0xab, 0x0c, 0xe9, 0x96, 0xa1, 0xa5, 0xb1, 0xf0}, 45 | {0x39, 0xca, 0x8d, 0x93, 0x30, 0xde, 0x0d, 0xab, 0x88, 0x29, 0x96, 0x5e, 0x02, 0xb1, 0x3d, 0xae}, 46 | {0x42, 0xb4, 0x75, 0x2e, 0xa8, 0xf3, 0x14, 0x88, 0x0b, 0xa4, 0x54, 0xd5, 0x38, 0x8f, 0xbb, 0x17}, 47 | {0xf6, 0x16, 0x0a, 0x36, 0x79, 0xb7, 0xb6, 0xae, 0xd7, 0x7f, 0x42, 0x5f, 0x5b, 0x8a, 0xbb, 0x34}, 48 | {0xde, 0xaf, 0xba, 0xff, 0x18, 0x59, 0xce, 0x43, 0x38, 0x54, 0xe5, 0xcb, 0x41, 0x52, 0xf6, 0x26}, 49 | {0x78, 0xc9, 0x9e, 0x83, 0xf7, 0x9c, 0xca, 0xa2, 0x6a, 0x02, 0xf3, 0xb9, 0x54, 0x9a, 0xe9, 0x4c}, 50 | {0x35, 0x12, 
0x90, 0x22, 0x28, 0x6e, 0xc0, 0x40, 0xbe, 0xf7, 0xdf, 0x1b, 0x1a, 0xa5, 0x51, 0xae}, 51 | {0xcf, 0x59, 0xa6, 0x48, 0x0f, 0xbc, 0x73, 0xc1, 0x2b, 0xd2, 0x7e, 0xba, 0x3c, 0x61, 0xc1, 0xa0}, 52 | {0xa1, 0x9d, 0xc5, 0xe9, 0xfd, 0xbd, 0xd6, 0x4a, 0x88, 0x82, 0x28, 0x02, 0x03, 0xcc, 0x6a, 0x75} 53 | }; 54 | 55 | static unsigned char rc[40][16]; 56 | static unsigned char rc0[40][16]; 57 | static unsigned char rc_sseed[40][16]; 58 | 59 | static const unsigned char sbox[256] = 60 | { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 61 | 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 62 | 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 63 | 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 64 | 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 65 | 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 66 | 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 67 | 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 68 | 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 69 | 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 70 | 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 71 | 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 72 | 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 73 | 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 74 | 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 75 | 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 76 | 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 77 | 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 78 | 0x87, 0xe9, 0xce, 0x55, 
0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 79 | 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; 80 | 81 | #define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) 82 | 83 | // Simulate _mm_aesenc_si128 instructions from AESNI 84 | void aesenc(unsigned char *s, const unsigned char *rk) 85 | { 86 | unsigned char i, t, u, v[4][4]; 87 | for (i = 0; i < 16; ++i) { 88 | v[((i / 4) + 4 - (i%4) ) % 4][i % 4] = sbox[s[i]]; 89 | } 90 | for (i = 0; i < 4; ++i) { 91 | t = v[i][0]; 92 | u = v[i][0] ^ v[i][1] ^ v[i][2] ^ v[i][3]; 93 | v[i][0] ^= u ^ XT(v[i][0] ^ v[i][1]); 94 | v[i][1] ^= u ^ XT(v[i][1] ^ v[i][2]); 95 | v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]); 96 | v[i][3] ^= u ^ XT(v[i][3] ^ t); 97 | } 98 | for (i = 0; i < 16; ++i) { 99 | s[i] = v[i / 4][i % 4] ^ rk[i]; 100 | } 101 | } 102 | 103 | // Simulate _mm_unpacklo_epi32 104 | void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) 105 | { 106 | unsigned char tmp[16]; 107 | memcpy(tmp, a, 4); 108 | memcpy(tmp + 4, b, 4); 109 | memcpy(tmp + 8, a + 4, 4); 110 | memcpy(tmp + 12, b + 4, 4); 111 | memcpy(t, tmp, 16); 112 | } 113 | 114 | // Simulate _mm_unpackhi_epi32 115 | void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b) 116 | { 117 | unsigned char tmp[16]; 118 | memcpy(tmp, a + 8, 4); 119 | memcpy(tmp + 4, b + 8, 4); 120 | memcpy(tmp + 8, a + 12, 4); 121 | memcpy(tmp + 12, b + 12, 4); 122 | memcpy(t, tmp, 16); 123 | } 124 | 125 | void load_constants_port() 126 | { 127 | /* Use the standard constants to generate tweaked ones. */ 128 | memcpy(rc, haraka_rc, 40*16); 129 | } 130 | 131 | void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed, 132 | unsigned long long seed_length) 133 | { 134 | unsigned char buf[40*16]; 135 | 136 | /* Use the standard constants to generate tweaked ones. 
/*
 * Squeeze phase of the Haraka sponge: produces nblocks output blocks.
 *
 *   h       - output buffer; receives nblocks * r bytes.
 *   nblocks - number of blocks to emit (0 emits nothing).
 *   s       - sponge state, permuted in place by haraka512_perm before each
 *             block is copied out.
 *   r       - rate in bytes: both the number of bytes copied per block and
 *             the distance h advances. The visible callers pass 32; r is
 *             assumed not to exceed the size of the state buffer behind s.
 */
static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks,
                                   unsigned char *s, unsigned int r)
{
    while (nblocks > 0) {
        haraka512_perm(s, s);
        /* Copy exactly r bytes so the copy length always agrees with the
           stride applied to h on the next line. */
        memcpy(h, s, r);
        h += r;
        nblocks--;
    }
}
/*
 * Haraka-512: compresses a 64-byte input into a 32-byte output.
 *
 * The input is run through haraka512_perm, XORed back into the result
 * (feed-forward), and four 8-byte slices of that 64-byte value are kept.
 *
 *   out - receives 32 bytes.
 *   in  - 64 input bytes; read completely before out is written.
 */
void haraka512_port(unsigned char *out, const unsigned char *in)
{
    /* Start offsets, within the 64-byte state, of the 8-byte slices kept. */
    static const int kept[4] = { 8, 24, 32, 48 };
    unsigned char state[64];
    int k;

    haraka512_perm(state, in);

    /* Feed-forward */
    for (k = 0; k < 64; ++k) {
        state[k] ^= in[k];
    }

    /* Truncation */
    for (k = 0; k < 4; ++k) {
        memcpy(out + 8 * k, state + kept[k], 8);
    }
}
32, s + 16, tmp); 296 | unpacklo32(s + 16, s + 16, tmp); 297 | } 298 | 299 | memcpy(out, s, 64); 300 | } 301 | 302 | void haraka512_port_zero(unsigned char *out, const unsigned char *in) 303 | { 304 | int i; 305 | 306 | unsigned char buf[64]; 307 | 308 | haraka512_perm_zero(buf, in); 309 | /* Feed-forward */ 310 | for (i = 0; i < 64; i++) { 311 | buf[i] = buf[i] ^ in[i]; 312 | } 313 | 314 | /* Truncated */ 315 | memcpy(out, buf + 8, 8); 316 | memcpy(out + 8, buf + 24, 8); 317 | memcpy(out + 16, buf + 32, 8); 318 | memcpy(out + 24, buf + 48, 8); 319 | } 320 | 321 | void haraka256_port(unsigned char *out, const unsigned char *in) 322 | { 323 | int i, j; 324 | 325 | unsigned char s[32], tmp[16]; 326 | 327 | memcpy(s, in, 16); 328 | memcpy(s + 16, in + 16, 16); 329 | 330 | for (i = 0; i < 5; ++i) { 331 | // aes round(s) 332 | for (j = 0; j < 2; ++j) { 333 | aesenc(s, rc[2*2*i + 2*j]); 334 | aesenc(s + 16, rc[2*2*i + 2*j + 1]); 335 | } 336 | 337 | // mixing 338 | unpacklo32(tmp, s, s + 16); 339 | unpackhi32(s + 16, s, s + 16); 340 | memcpy(s, tmp, 16); 341 | } 342 | 343 | /* Feed-forward */ 344 | for (i = 0; i < 32; i++) { 345 | out[i] = in[i] ^ s[i]; 346 | } 347 | } 348 | 349 | void haraka256_sk(unsigned char *out, const unsigned char *in) 350 | { 351 | int i, j; 352 | 353 | unsigned char s[32], tmp[16]; 354 | 355 | memcpy(s, in, 16); 356 | memcpy(s + 16, in + 16, 16); 357 | 358 | for (i = 0; i < 5; ++i) { 359 | // aes round(s) 360 | for (j = 0; j < 2; ++j) { 361 | aesenc(s, rc_sseed[2*2*i + 2*j]); 362 | aesenc(s + 16, rc_sseed[2*2*i + 2*j + 1]); 363 | } 364 | 365 | // mixing 366 | unpacklo32(tmp, s, s + 16); 367 | unpackhi32(s + 16, s, s + 16); 368 | memcpy(s, tmp, 16); 369 | } 370 | 371 | /* Feed-forward */ 372 | for (i = 0; i < 32; i++) { 373 | out[i] = in[i] ^ s[i]; 374 | } 375 | } 376 | -------------------------------------------------------------------------------- /haraka.h: -------------------------------------------------------------------------------- 1 | 
#ifndef SPX_HARAKA_H 2 | #define SPX_HARAKA_H 3 | 4 | /* load constants */ 5 | void load_constants_port(); 6 | 7 | /* Tweak constants with seed */ 8 | void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed, 9 | unsigned long long seed_length); 10 | 11 | /* Haraka Sponge */ 12 | void haraka_S(unsigned char *out, unsigned long long outlen, 13 | const unsigned char *in, unsigned long long inlen); 14 | 15 | /* Applies the 512-bit Haraka permutation to in. */ 16 | void haraka512_perm(unsigned char *out, const unsigned char *in); 17 | 18 | /* Implementation of Haraka-512 */ 19 | void haraka512_port(unsigned char *out, const unsigned char *in); 20 | 21 | /* Applies the 512-bit Haraka permutation to in, using zero key. */ 22 | void haraka512_perm_zero(unsigned char *out, const unsigned char *in); 23 | 24 | /* Implementation of Haraka-512, using zero key */ 25 | void haraka512_port_zero(unsigned char *out, const unsigned char *in); 26 | 27 | /* Implementation of Haraka-256 */ 28 | void haraka256_port(unsigned char *out, const unsigned char *in); 29 | 30 | /* Implementation of Haraka-256 using sk.seed constants */ 31 | void haraka256_sk(unsigned char *out, const unsigned char *in); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /haraka_portable.c: -------------------------------------------------------------------------------- 1 | /* 2 | Plain C implementation of the Haraka256 and Haraka512 permutations. 
3 | */ 4 | #include 5 | #include 6 | #include 7 | 8 | #include "haraka_portable.h" 9 | 10 | #define HARAKAS_RATE 32 11 | 12 | static const unsigned char haraka_rc[40][16] = { 13 | { 0x9d, 0x7b, 0x81, 0x75, 0xf0, 0xfe, 0xc5, 0xb2, 0x0a, 0xc0, 0x20, 0xe6, 0x4c, 0x70, 0x84, 0x06 }, 14 | { 0x17, 0xf7, 0x08, 0x2f, 0xa4, 0x6b, 0x0f, 0x64, 0x6b, 0xa0, 0xf3, 0x88, 0xe1, 0xb4, 0x66, 0x8b }, 15 | { 0x14, 0x91, 0x02, 0x9f, 0x60, 0x9d, 0x02, 0xcf, 0x98, 0x84, 0xf2, 0x53, 0x2d, 0xde, 0x02, 0x34 }, 16 | { 0x79, 0x4f, 0x5b, 0xfd, 0xaf, 0xbc, 0xf3, 0xbb, 0x08, 0x4f, 0x7b, 0x2e, 0xe6, 0xea, 0xd6, 0x0e }, 17 | { 0x44, 0x70, 0x39, 0xbe, 0x1c, 0xcd, 0xee, 0x79, 0x8b, 0x44, 0x72, 0x48, 0xcb, 0xb0, 0xcf, 0xcb }, 18 | { 0x7b, 0x05, 0x8a, 0x2b, 0xed, 0x35, 0x53, 0x8d, 0xb7, 0x32, 0x90, 0x6e, 0xee, 0xcd, 0xea, 0x7e }, 19 | { 0x1b, 0xef, 0x4f, 0xda, 0x61, 0x27, 0x41, 0xe2, 0xd0, 0x7c, 0x2e, 0x5e, 0x43, 0x8f, 0xc2, 0x67 }, 20 | { 0x3b, 0x0b, 0xc7, 0x1f, 0xe2, 0xfd, 0x5f, 0x67, 0x07, 0xcc, 0xca, 0xaf, 0xb0, 0xd9, 0x24, 0x29 }, 21 | { 0xee, 0x65, 0xd4, 0xb9, 0xca, 0x8f, 0xdb, 0xec, 0xe9, 0x7f, 0x86, 0xe6, 0xf1, 0x63, 0x4d, 0xab }, 22 | { 0x33, 0x7e, 0x03, 0xad, 0x4f, 0x40, 0x2a, 0x5b, 0x64, 0xcd, 0xb7, 0xd4, 0x84, 0xbf, 0x30, 0x1c }, 23 | { 0x00, 0x98, 0xf6, 0x8d, 0x2e, 0x8b, 0x02, 0x69, 0xbf, 0x23, 0x17, 0x94, 0xb9, 0x0b, 0xcc, 0xb2 }, 24 | { 0x8a, 0x2d, 0x9d, 0x5c, 0xc8, 0x9e, 0xaa, 0x4a, 0x72, 0x55, 0x6f, 0xde, 0xa6, 0x78, 0x04, 0xfa }, 25 | { 0xd4, 0x9f, 0x12, 0x29, 0x2e, 0x4f, 0xfa, 0x0e, 0x12, 0x2a, 0x77, 0x6b, 0x2b, 0x9f, 0xb4, 0xdf }, 26 | { 0xee, 0x12, 0x6a, 0xbb, 0xae, 0x11, 0xd6, 0x32, 0x36, 0xa2, 0x49, 0xf4, 0x44, 0x03, 0xa1, 0x1e }, 27 | { 0xa6, 0xec, 0xa8, 0x9c, 0xc9, 0x00, 0x96, 0x5f, 0x84, 0x00, 0x05, 0x4b, 0x88, 0x49, 0x04, 0xaf }, 28 | { 0xec, 0x93, 0xe5, 0x27, 0xe3, 0xc7, 0xa2, 0x78, 0x4f, 0x9c, 0x19, 0x9d, 0xd8, 0x5e, 0x02, 0x21 }, 29 | { 0x73, 0x01, 0xd4, 0x82, 0xcd, 0x2e, 0x28, 0xb9, 0xb7, 0xc9, 0x59, 0xa7, 0xf8, 0xaa, 0x3a, 0xbf }, 30 | { 0x6b, 0x7d, 0x30, 0x10, 0xd9, 
0xef, 0xf2, 0x37, 0x17, 0xb0, 0x86, 0x61, 0x0d, 0x70, 0x60, 0x62 }, 31 | { 0xc6, 0x9a, 0xfc, 0xf6, 0x53, 0x91, 0xc2, 0x81, 0x43, 0x04, 0x30, 0x21, 0xc2, 0x45, 0xca, 0x5a }, 32 | { 0x3a, 0x94, 0xd1, 0x36, 0xe8, 0x92, 0xaf, 0x2c, 0xbb, 0x68, 0x6b, 0x22, 0x3c, 0x97, 0x23, 0x92 }, 33 | { 0xb4, 0x71, 0x10, 0xe5, 0x58, 0xb9, 0xba, 0x6c, 0xeb, 0x86, 0x58, 0x22, 0x38, 0x92, 0xbf, 0xd3 }, 34 | { 0x8d, 0x12, 0xe1, 0x24, 0xdd, 0xfd, 0x3d, 0x93, 0x77, 0xc6, 0xf0, 0xae, 0xe5, 0x3c, 0x86, 0xdb }, 35 | { 0xb1, 0x12, 0x22, 0xcb, 0xe3, 0x8d, 0xe4, 0x83, 0x9c, 0xa0, 0xeb, 0xff, 0x68, 0x62, 0x60, 0xbb }, 36 | { 0x7d, 0xf7, 0x2b, 0xc7, 0x4e, 0x1a, 0xb9, 0x2d, 0x9c, 0xd1, 0xe4, 0xe2, 0xdc, 0xd3, 0x4b, 0x73 }, 37 | { 0x4e, 0x92, 0xb3, 0x2c, 0xc4, 0x15, 0x14, 0x4b, 0x43, 0x1b, 0x30, 0x61, 0xc3, 0x47, 0xbb, 0x43 }, 38 | { 0x99, 0x68, 0xeb, 0x16, 0xdd, 0x31, 0xb2, 0x03, 0xf6, 0xef, 0x07, 0xe7, 0xa8, 0x75, 0xa7, 0xdb }, 39 | { 0x2c, 0x47, 0xca, 0x7e, 0x02, 0x23, 0x5e, 0x8e, 0x77, 0x59, 0x75, 0x3c, 0x4b, 0x61, 0xf3, 0x6d }, 40 | { 0xf9, 0x17, 0x86, 0xb8, 0xb9, 0xe5, 0x1b, 0x6d, 0x77, 0x7d, 0xde, 0xd6, 0x17, 0x5a, 0xa7, 0xcd }, 41 | { 0x5d, 0xee, 0x46, 0xa9, 0x9d, 0x06, 0x6c, 0x9d, 0xaa, 0xe9, 0xa8, 0x6b, 0xf0, 0x43, 0x6b, 0xec }, 42 | { 0xc1, 0x27, 0xf3, 0x3b, 0x59, 0x11, 0x53, 0xa2, 0x2b, 0x33, 0x57, 0xf9, 0x50, 0x69, 0x1e, 0xcb }, 43 | { 0xd9, 0xd0, 0x0e, 0x60, 0x53, 0x03, 0xed, 0xe4, 0x9c, 0x61, 0xda, 0x00, 0x75, 0x0c, 0xee, 0x2c }, 44 | { 0x50, 0xa3, 0xa4, 0x63, 0xbc, 0xba, 0xbb, 0x80, 0xab, 0x0c, 0xe9, 0x96, 0xa1, 0xa5, 0xb1, 0xf0 }, 45 | { 0x39, 0xca, 0x8d, 0x93, 0x30, 0xde, 0x0d, 0xab, 0x88, 0x29, 0x96, 0x5e, 0x02, 0xb1, 0x3d, 0xae }, 46 | { 0x42, 0xb4, 0x75, 0x2e, 0xa8, 0xf3, 0x14, 0x88, 0x0b, 0xa4, 0x54, 0xd5, 0x38, 0x8f, 0xbb, 0x17 }, 47 | { 0xf6, 0x16, 0x0a, 0x36, 0x79, 0xb7, 0xb6, 0xae, 0xd7, 0x7f, 0x42, 0x5f, 0x5b, 0x8a, 0xbb, 0x34 }, 48 | { 0xde, 0xaf, 0xba, 0xff, 0x18, 0x59, 0xce, 0x43, 0x38, 0x54, 0xe5, 0xcb, 0x41, 0x52, 0xf6, 0x26 }, 49 | { 0x78, 0xc9, 0x9e, 0x83, 0xf7, 
0x9c, 0xca, 0xa2, 0x6a, 0x02, 0xf3, 0xb9, 0x54, 0x9a, 0xe9, 0x4c }, 50 | { 0x35, 0x12, 0x90, 0x22, 0x28, 0x6e, 0xc0, 0x40, 0xbe, 0xf7, 0xdf, 0x1b, 0x1a, 0xa5, 0x51, 0xae }, 51 | { 0xcf, 0x59, 0xa6, 0x48, 0x0f, 0xbc, 0x73, 0xc1, 0x2b, 0xd2, 0x7e, 0xba, 0x3c, 0x61, 0xc1, 0xa0 }, 52 | { 0xa1, 0x9d, 0xc5, 0xe9, 0xfd, 0xbd, 0xd6, 0x4a, 0x88, 0x82, 0x28, 0x02, 0x03, 0xcc, 0x6a, 0x75 } 53 | }; 54 | 55 | static unsigned char rc[40][16]; 56 | static unsigned char rc0[40][16]; 57 | static unsigned char rc_sseed[40][16]; 58 | 59 | static const unsigned char sbox[256] = 60 | { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 61 | 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 62 | 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 63 | 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 64 | 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 65 | 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 66 | 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 67 | 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 68 | 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 69 | 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 70 | 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 71 | 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 72 | 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 73 | 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 74 | 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 75 | 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 76 | 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 77 | 0xc1, 0x1d, 
/*
 * Byte-array counterpart of _mm_unpacklo_epi32: interleaves the two lowest
 * 32-bit words of a and b.
 *
 *   t - receives 16 bytes laid out as a[0..3], b[0..3], a[4..7], b[4..7].
 *   a - first source; bytes 0..7 are read.
 *   b - second source; bytes 0..7 are read.
 *
 * The result is assembled in a local buffer first, so t may alias a or b.
 */
void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b)
{
    unsigned char mixed[16];
    int word, k;

    for (word = 0; word < 2; ++word) {
        for (k = 0; k < 4; ++k) {
            mixed[8 * word + k]     = a[4 * word + k];
            mixed[8 * word + 4 + k] = b[4 * word + k];
        }
    }

    memcpy(t, mixed, sizeof mixed);
}
/*
 * Absorb phase of the Haraka sponge, with the rate fixed at 32 bytes.
 *
 * Every complete 32-byte block of m is XORed into the first 32 bytes of the
 * state, followed by haraka512_perm. The remaining mlen % 32 bytes are then
 * placed in a zeroed 32-byte block, the byte p is written right after them,
 * the top bit of the block's last byte is set, and the block is XORed into
 * the state. No permutation is applied after this final block.
 *
 *   s    - sponge state; its first 32 bytes are updated in place.
 *   m    - message bytes (may be empty when mlen is 0).
 *   mlen - message length in bytes.
 *   p    - padding/domain byte appended after the message.
 */
static void haraka_S_absorb(unsigned char *s,
                            const unsigned char *m, unsigned long long mlen,
                            unsigned char p)
{
    unsigned long long i;
    /* Must hold one full rate block: the loops below index t[0..31]. */
    unsigned char t[32];

    while (mlen >= 32) {
        /* XOR block to state */
        for (i = 0; i < 32; ++i) {
            s[i] ^= m[i];
        }
        haraka512_perm(s, s);
        mlen -= 32;
        m += 32;
    }

    for (i = 0; i < 32; ++i) {
        t[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t[i] = m[i];
    }
    /* Here i == mlen, which is at most 31, so this stays inside t. */
    t[i] = p;
    t[32 - 1] |= 128;
    for (i = 0; i < 32; ++i) {
        s[i] ^= t[i];
    }
}
| memcpy(s, in, 16); 225 | memcpy(s + 16, in + 16, 16); 226 | memcpy(s + 32, in + 32, 16); 227 | memcpy(s + 48, in + 48, 16); 228 | 229 | for (i = 0; i < 5; ++i) { 230 | // aes round(s) 231 | for (j = 0; j < 2; ++j) { 232 | aesenc(s, rc[4 * 2 * i + 4 * j]); 233 | aesenc(s + 16, rc[4 * 2 * i + 4 * j + 1]); 234 | aesenc(s + 32, rc[4 * 2 * i + 4 * j + 2]); 235 | aesenc(s + 48, rc[4 * 2 * i + 4 * j + 3]); 236 | } 237 | 238 | // mixing 239 | unpacklo32(tmp, s, s + 16); 240 | unpackhi32(s, s, s + 16); 241 | unpacklo32(s + 16, s + 32, s + 48); 242 | unpackhi32(s + 32, s + 32, s + 48); 243 | unpacklo32(s + 48, s, s + 32); 244 | unpackhi32(s, s, s + 32); 245 | unpackhi32(s + 32, s + 16, tmp); 246 | unpacklo32(s + 16, s + 16, tmp); 247 | } 248 | 249 | memcpy(out, s, 64); 250 | } 251 | 252 | void haraka512_perm_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) 253 | { 254 | int i, j; 255 | 256 | unsigned char s[64], tmp[16]; 257 | 258 | memcpy(s, in, 16); 259 | memcpy(s + 16, in + 16, 16); 260 | memcpy(s + 32, in + 32, 16); 261 | memcpy(s + 48, in + 48, 16); 262 | 263 | for (i = 0; i < 5; ++i) { 264 | // aes round(s) 265 | for (j = 0; j < 2; ++j) { 266 | aesenc(s, (const unsigned char *)&rc[4 * 2 * i + 4 * j]); 267 | aesenc(s + 16, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 1]); 268 | aesenc(s + 32, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 2]); 269 | aesenc(s + 48, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 3]); 270 | } 271 | 272 | // mixing 273 | unpacklo32(tmp, s, s + 16); 274 | unpackhi32(s, s, s + 16); 275 | unpacklo32(s + 16, s + 32, s + 48); 276 | unpackhi32(s + 32, s + 32, s + 48); 277 | unpacklo32(s + 48, s, s + 32); 278 | unpackhi32(s, s, s + 32); 279 | unpackhi32(s + 32, s + 16, tmp); 280 | unpacklo32(s + 16, s + 16, tmp); 281 | } 282 | 283 | memcpy(out, s, 64); 284 | } 285 | 286 | void haraka512_port(unsigned char *out, const unsigned char *in) 287 | { 288 | int i; 289 | 290 | unsigned char buf[64]; 291 | 292 | haraka512_perm(buf, 
in); 293 | /* Feed-forward */ 294 | for (i = 0; i < 64; i++) { 295 | buf[i] = buf[i] ^ in[i]; 296 | } 297 | 298 | /* Truncated */ 299 | memcpy(out, buf + 8, 8); 300 | memcpy(out + 8, buf + 24, 8); 301 | memcpy(out + 16, buf + 32, 8); 302 | memcpy(out + 24, buf + 48, 8); 303 | } 304 | 305 | void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) 306 | { 307 | int i; 308 | 309 | unsigned char buf[64]; 310 | 311 | haraka512_perm_keyed(buf, in, rc); 312 | /* Feed-forward */ 313 | for (i = 0; i < 64; i++) { 314 | buf[i] = buf[i] ^ in[i]; 315 | } 316 | 317 | /* Truncated */ 318 | memcpy(out, buf + 8, 8); 319 | memcpy(out + 8, buf + 24, 8); 320 | memcpy(out + 16, buf + 32, 8); 321 | memcpy(out + 24, buf + 48, 8); 322 | } 323 | 324 | void haraka512_perm_zero(unsigned char *out, const unsigned char *in) 325 | { 326 | int i, j; 327 | 328 | unsigned char s[64], tmp[16]; 329 | 330 | memcpy(s, in, 16); 331 | memcpy(s + 16, in + 16, 16); 332 | memcpy(s + 32, in + 32, 16); 333 | memcpy(s + 48, in + 48, 16); 334 | 335 | for (i = 0; i < 5; ++i) { 336 | // aes round(s) 337 | for (j = 0; j < 2; ++j) { 338 | aesenc(s, rc0[4 * 2 * i + 4 * j]); 339 | aesenc(s + 16, rc0[4 * 2 * i + 4 * j + 1]); 340 | aesenc(s + 32, rc0[4 * 2 * i + 4 * j + 2]); 341 | aesenc(s + 48, rc0[4 * 2 * i + 4 * j + 3]); 342 | } 343 | 344 | // mixing 345 | unpacklo32(tmp, s, s + 16); 346 | unpackhi32(s, s, s + 16); 347 | unpacklo32(s + 16, s + 32, s + 48); 348 | unpackhi32(s + 32, s + 32, s + 48); 349 | unpacklo32(s + 48, s, s + 32); 350 | unpackhi32(s, s, s + 32); 351 | unpackhi32(s + 32, s + 16, tmp); 352 | unpacklo32(s + 16, s + 16, tmp); 353 | } 354 | 355 | memcpy(out, s, 64); 356 | } 357 | 358 | void haraka512_port_zero(unsigned char *out, const unsigned char *in) 359 | { 360 | int i; 361 | 362 | unsigned char buf[64]; 363 | 364 | haraka512_perm_zero(buf, in); 365 | /* Feed-forward */ 366 | for (i = 0; i < 64; i++) { 367 | buf[i] = buf[i] ^ in[i]; 368 | } 369 | 370 | /* Truncated 
*/ 371 | memcpy(out, buf + 8, 8); 372 | memcpy(out + 8, buf + 24, 8); 373 | memcpy(out + 16, buf + 32, 8); 374 | memcpy(out + 24, buf + 48, 8); 375 | } 376 | 377 | void haraka256_port(unsigned char *out, const unsigned char *in) 378 | { 379 | int i, j; 380 | 381 | unsigned char s[32], tmp[16]; 382 | 383 | memcpy(s, in, 16); 384 | memcpy(s + 16, in + 16, 16); 385 | 386 | for (i = 0; i < 5; ++i) { 387 | // aes round(s) 388 | for (j = 0; j < 2; ++j) { 389 | aesenc(s, rc[2 * 2 * i + 2 * j]); 390 | aesenc(s + 16, rc[2 * 2 * i + 2 * j + 1]); 391 | } 392 | 393 | // mixing 394 | unpacklo32(tmp, s, s + 16); 395 | unpackhi32(s + 16, s, s + 16); 396 | memcpy(s, tmp, 16); 397 | } 398 | 399 | /* Feed-forward */ 400 | for (i = 0; i < 32; i++) { 401 | out[i] = in[i] ^ s[i]; 402 | } 403 | } 404 | 405 | void haraka256_sk(unsigned char *out, const unsigned char *in) 406 | { 407 | int i, j; 408 | 409 | unsigned char s[32], tmp[16]; 410 | 411 | memcpy(s, in, 16); 412 | memcpy(s + 16, in + 16, 16); 413 | 414 | for (i = 0; i < 5; ++i) { 415 | // aes round(s) 416 | for (j = 0; j < 2; ++j) { 417 | aesenc(s, rc_sseed[2 * 2 * i + 2 * j]); 418 | aesenc(s + 16, rc_sseed[2 * 2 * i + 2 * j + 1]); 419 | } 420 | 421 | // mixing 422 | unpacklo32(tmp, s, s + 16); 423 | unpackhi32(s + 16, s, s + 16); 424 | memcpy(s, tmp, 16); 425 | } 426 | 427 | /* Feed-forward */ 428 | for (i = 0; i < 32; i++) { 429 | out[i] = in[i] ^ s[i]; 430 | } 431 | } 432 | -------------------------------------------------------------------------------- /haraka_portable.h: -------------------------------------------------------------------------------- 1 | #ifndef SPX_HARAKA_H 2 | #define SPX_HARAKA_H 3 | 4 | #include "emmintrin.h" 5 | 6 | #define NUMROUNDS 5 7 | 8 | #ifdef _WIN32 9 | typedef unsigned long long u64; 10 | #else 11 | typedef unsigned long u64; 12 | #endif 13 | typedef __m128i u128; 14 | 15 | extern void aesenc(unsigned char *s, const unsigned char *rk); 16 | 17 | #define AES2_EMU(s0, s1, rci) \ 18 | 
/*
 * Emulation of _mm_unpacklo_epi32: interleaves the two low 32-bit lanes of
 * a and b and returns { a[0], b[0], a[1], b[1] }.
 *
 * Lanes are moved through plain uint32_t arrays using the unaligned
 * store/load intrinsics, so the result does not depend on the stack arrays
 * happening to be 16-byte aligned (a plain __m128i dereference of a
 * uint32_t array would require that alignment).
 */
static inline __m128i _mm_unpacklo_epi32_emu(__m128i a, __m128i b)
{
    uint32_t lanes_a[4], lanes_b[4], result[4];

    _mm_storeu_si128((__m128i *)lanes_a, a);
    _mm_storeu_si128((__m128i *)lanes_b, b);

    result[0] = lanes_a[0];
    result[1] = lanes_b[0];
    result[2] = lanes_a[1];
    result[3] = lanes_b[1];

    return _mm_loadu_si128((const __m128i *)result);
}
*/ 73 | void haraka512_perm_zero(unsigned char *out, const unsigned char *in); 74 | 75 | /* Implementation of Haraka-512, using zero key */ 76 | void haraka512_port_zero(unsigned char *out, const unsigned char *in); 77 | 78 | /* Implementation of Haraka-256 */ 79 | void haraka256_port(unsigned char *out, const unsigned char *in); 80 | 81 | /* Implementation of Haraka-256 using sk.seed constants */ 82 | void haraka256_sk(unsigned char *out, const unsigned char *in); 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /haraka_portable.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/haraka_portable.o -------------------------------------------------------------------------------- /input.cl: -------------------------------------------------------------------------------- 1 | 2 | #define VERUS_KEY_SIZE 8832 3 | #define VERUS_KEY_SIZE128 552 4 | 5 | 6 | typedef uint4 uint128m; 7 | typedef ulong uint64_t; 8 | typedef uint uint32_t; 9 | typedef uchar uint8_t; 10 | typedef long int64_t; 11 | typedef int int32_t; 12 | typedef short int16_t; 13 | typedef unsigned short uint16_t; 14 | 15 | 16 | #define AES2_EMU(s0, s1, rci) \ 17 | aesenc((unsigned char *)&s0, &rc[rci],sharedMemory1); \ 18 | aesenc((unsigned char *)&s1, &rc[rci + 1],sharedMemory1); \ 19 | aesenc((unsigned char *)&s0, &rc[rci + 2],sharedMemory1); \ 20 | aesenc((unsigned char *)&s1, &rc[rci + 3],sharedMemory1); 21 | 22 | #define AES2_EMU_LOC(s0, s1, rci) \ 23 | aesenc_loc((unsigned char *)&s0, &rc[rci],sharedMemory1); \ 24 | aesenc_loc((unsigned char *)&s1, &rc[rci + 1],sharedMemory1); \ 25 | aesenc_loc((unsigned char *)&s0, &rc[rci + 2],sharedMemory1); \ 26 | aesenc_loc((unsigned char *)&s1, &rc[rci + 3],sharedMemory1); 27 | 28 | 29 | 30 | #define AES4(s0, s1, s2, s3, rci) \ 31 | aesenc((unsigned char *)&s0, 
&rc[rci],sharedMemory1); \ 32 | aesenc((unsigned char *)&s1, &rc[rci + 1],sharedMemory1); \ 33 | aesenc((unsigned char *)&s2, &rc[rci + 2],sharedMemory1); \ 34 | aesenc((unsigned char *)&s3, &rc[rci + 3],sharedMemory1); \ 35 | aesenc((unsigned char *)&s0, &rc[rci + 4], sharedMemory1); \ 36 | aesenc((unsigned char *)&s1, &rc[rci + 5], sharedMemory1); \ 37 | aesenc((unsigned char *)&s2, &rc[rci + 6], sharedMemory1); \ 38 | aesenc((unsigned char *)&s3, &rc[rci + 7], sharedMemory1); 39 | 40 |

/*
 * Reduced final step: runs two aesenc() rounds on a single state word,
 * using round keys rc[rci + 2] and rc[rci + 6]. Like the other AES macros
 * it expects `rc` and `sharedMemory1` to be in scope at the expansion site.
 * (The original definition ended with a dangling line-continuation
 * backslash, which silently swallowed the following line; removed.)
 */
#define AES4_LAST(s3, rci) \
    aesenc((unsigned char *)&s3, &rc[(rci) + 2], sharedMemory1); \
    aesenc((unsigned char *)&s3, &rc[(rci) + 6], sharedMemory1);

/*
 * Word-interleave helpers. All of them use a variable named `tmp`
 * (uint128m) that must be declared by the caller.
 */
#define MIX2_EMU(s0, s1) \
    tmp = _mm_unpacklo_epi32_emu(s0, s1); \
    s1 = _mm_unpackhi_epi32_emu(s0, s1); \
    s0 = tmp;

#define MIX4(s0, s1, s2, s3) \
    tmp = _mm_unpacklo_epi32_emu(s0, s1); \
    s0 = _mm_unpackhi_epi32_emu(s0, s1); \
    s1 = _mm_unpacklo_epi32_emu(s2, s3); \
    s2 = _mm_unpackhi_epi32_emu(s2, s3); \
    s3 = _mm_unpacklo_epi32_emu(s0, s2); \
    s0 = _mm_unpackhi_epi32_emu(s0, s2); \
    s2 = _mm_unpackhi_epi32_emu(s1, tmp); \
    s1 = _mm_unpacklo_epi32_emu(s1, tmp);

#define MIX4_LASTBUT1(s0, s1, s2, s3) \
    tmp = _mm_unpacklo_epi32_emu(s0, s1); \
    s1 = _mm_unpacklo_epi32_emu(s2, s3); \
    s2 = _mm_unpackhi_epi32_emu(s1, tmp);

/*
 * The 256 AES S-box bytes, each wrapped in the caller-supplied macro `w`
 * so the same list can generate all four lookup tables below.
 */
#define saes_data(w) {\
    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }

/* Reduction constant used by saes_f2 when doubling a byte overflows bit 7. */
#define SAES_WPOLY 0x011b

/* Pack four bytes into one 32-bit word, b0 in the least significant byte. */
#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))

/* Multiply a byte by 2 / by 3 with reduction by SAES_WPOLY. */
#define saes_f2(x) (((x) << 1) ^ ((((x) >> 7) & 1) * SAES_WPOLY))
#define saes_f3(x) (saes_f2(x) ^ (x))
#define saes_h0(x) (x)

/* Per-table word layouts: each is a byte rotation of (2p, p, p, 3p). */
#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p))
#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p)
#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p)
#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p))

/* Four 256-entry AES round lookup tables, generated at compile time. */
static const __constant uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };


/* XOR of three 32-bit words. */
uint32_t xor3x(uint a, uint b, uint c) {
    return a ^ b ^ c;
}

/* 128-bit XOR; relies on uint128m supporting the ^ operator. */
#define _mm_xor_si128_emu(a, b) ((a) ^ (b))

/*
 * Low 32 bits of the 64-bit value (y:x) shifted right by z,
 * i.e. (x >> z) with the low z bits of y shifted in at the top. 0 < z < 32.
 */
#define LIMMY_R(x, y, z) (((x) >> (z)) | ((y) << (32 - (z))))

/*
 * Emulation of a 64x64 -> 128 bit carry-less multiply.
 *
 * The `imm` selector is ignored: the operands are always the LOW 64 bits of
 * `ai` and the HIGH 64 bits of `bi`, which is what selector 0x10 (the only
 * value passed by callers in this file) asks for.
 *
 * The product is built three bits of `a` at a time from a table u[k] holding
 * the carry-less product k * b truncated to 64 bits.
 */
uint128m _mm_clmulepi64_si128_emu(uint128m ai, uint128m bi, int imm)
{
    uint64_t a = ((uint64_t*)&ai)[0];
    uint64_t b = ((uint64_t*)&bi)[1];

    uint8_t i;
    uint64_t u[8];
    uint128m r;
    uint64_t tmp;

    u[0] = 0;          //000 x b
    u[1] = b;          //001 x b
    u[2] = u[1] << 1;  //010 x b
    u[3] = u[2] ^ b;   //011 x b
    u[4] = u[2] << 1;  //100 x b
    u[5] = u[4] ^ b;   //101 x b
    u[6] = u[3] << 1;  //110 x b
    u[7] = u[6] ^ b;   //111 x b

    // Multiply: the first window only affects the lower word.
    ((uint64_t*)&r)[0] = u[a & 7];
    r.z = r.w = 0;

    for (i = 3; i < 64; i += 3) {
        tmp = u[a >> i & 7];
        ((uint64_t*)&r)[0] ^= tmp << i;
        ((uint64_t*)&r)[1] ^= tmp >> (64 - i);
    }

    // The shifts that build u[] drop the top one or two bits of b. When those
    // bits are set, fold the missing contribution into the upper 64 bits.
    // NOTE(review): mask derivation taken as-is from the original code.
    if ((bi.w) & 0x80000000)
    {
        uint32_t t0 = LIMMY_R(ai.x, ai.y, 1);
        uint32_t t1 = ai.y >> 1;
        r.z ^= (t0 & 0xDB6DB6DB); //0, 21x 110
        r.w ^= (t1 & 0x36DB6DB6); //0x6DB6DB6DB6DB6DB6 -> 0x36DB6DB6DB6DB6DB after >>1
    }
    if ((bi.w) & 0x40000000)
    {
        uint32_t t0 = LIMMY_R(ai.x, ai.y, 2);
        uint32_t t1 = ai.y >> 2;
        r.z ^= (t0 & 0x49249249); //0, 21x 100
        r.w ^= (t1 & 0x12492492); //0x4924924924924924 -> 0x1249249249249249 after >>2
    }

    return r;
}


/*
 * Carry-less multiply of the HIGH 64 bits of `ai` by the constant 27 (0x1b).
 * u[k] is the precomputed carry-less product k * 27 for every 3-bit k; all
 * entries fit in a byte, so no top-bit correction is needed here.
 */
uint128m _mm_clmulepi64_si128_emu2(uint128m ai)
{
    uint64_t a = ((uint64_t*)&ai)[1];

    uint8_t i;
    uint8_t u[8];
    uint128m r;
    uint64_t tmp;

    // Precomputation
    u[0] = 0;    //000 x b
    u[1] = 27;   //001 x b
    u[2] = 54;   //010 x b
    u[3] = 45;   //011 x b
    u[4] = 108;  //100 x b
    u[5] = 119;  //101 x b
    u[6] = 90;   //110 x b
    u[7] = 65;   //111 x b

    // Multiply: the first window only affects the lower word.
    ((uint64_t*)&r)[0] = u[a & 7];
    r.z = r.w = 0;

    for (i = 3; i < 64; i += 3) {
        tmp = u[a >> i & 7];
        ((uint64_t*)&r)[0] ^= tmp << i;
        ((uint64_t*)&r)[1] ^= tmp >> (64 - i);
    }

    return r;
}

/*
 * Load / extract helpers. They are pure expressions (no trailing ';'), so
 * they can be used both as full statements and inside larger expressions.
 */
#define _mm_load_si128_emu(p) (*(uint128m*)(p))

/* Low 64 bits of a 128-bit value, as a signed integer. */
#define _mm_cvtsi128_si64_emu(p) (((int64_t *)&(p))[0])

/* Low 32 bits of a 128-bit value, as a signed integer.
 * (Previously referenced an unrelated identifier `a` instead of `p`.) */
#define _mm_cvtsi128_si32_emu(p) (((int32_t *)&(p))[0])


/*
 * In-place combined unpack: on return *a holds {a.x, b.x, a.y, b.y} and
 * *b holds {a.z, b.z, a.w, b.w} (in terms of the input values), i.e. the
 * results of _mm_unpacklo_epi32_emu and _mm_unpackhi_epi32_emu respectively.
 */
void _mm_unpackboth_epi32_emu(uint128m *a, uint128m *b)
{
    uint32_t value;

    value = a[0].z; a[0].z = a[0].y; a[0].y = value;
    value = a[0].y; a[0].y = b[0].x; b[0].x = value;
    value = b[0].z; b[0].z = a[0].w; a[0].w = value;
    value = b[0].y; b[0].y = a[0].w; a[0].w = value;
}

/* Interleave the two low 32-bit words of a and b: {a.x, b.x, a.y, b.y}. */
uint128m _mm_unpacklo_epi32_emu(uint128m a, uint128m b)
{
    a.z = a.y;
    a.y = b.x;
    a.w = b.y;
    return a;
}

/* Interleave the two high 32-bit words of a and b: {a.z, b.z, a.w, b.w}. */
uint128m _mm_unpackhi_epi32_emu(uint128m a, uint128m b)
{
    b.x = a.z;
    b.y = b.z;
    b.z = a.w;
    return b;
}


/*
 * One table-driven AES round applied in place to the 16 bytes at `s`.
 *
 * s   : state, read and overwritten as one uint128m.
 * key : round key XORed into the result.
 * t   : four consecutive 256-entry lookup tables (t, t+256, t+512, t+768).
 *
 * Each output word combines one byte from each of the four state words, with
 * the source word rotating by one per table.
 */
void aesenc(unsigned char *s, __global uint128m *key, __local uint *t)
{
    uint128m x0 = ((uint128m*)s)[0];

    uint128m y0 = { 0,0,0,0 };
    y0.x ^= t[x0.x & 0xff]; x0.x >>= 8;
    y0.y ^= t[x0.y & 0xff]; x0.y >>= 8;
    y0.z ^= t[x0.z & 0xff]; x0.z >>= 8;
    y0.w ^= t[x0.w & 0xff]; x0.w >>= 8;
    t += 256;

    y0.x ^= t[x0.y & 0xff]; x0.y >>= 8;
    y0.y ^= t[x0.z & 0xff]; x0.z >>= 8;
    y0.z ^= t[x0.w & 0xff]; x0.w >>= 8;
    y0.w ^= t[x0.x & 0xff]; x0.x >>= 8;
    t += 256;

    y0.x ^= t[x0.z & 0xff]; x0.z >>= 8;
    y0.y ^= t[x0.w & 0xff]; x0.w >>= 8;
    y0.z ^= t[x0.x & 0xff]; x0.x >>= 8;
    y0.w ^= t[x0.y & 0xff]; x0.y >>= 8;
    t += 256;

    // After three 8-bit shifts only the top byte is left, so no mask is needed.
    y0.x ^= t[x0.w];
    y0.y ^= t[x0.x];
    y0.z ^= t[x0.y];
    y0.w ^= t[x0.z];

    ((uint128m*)s)[0] = _mm_xor_si128_emu(y0, key[0]);
}


/* 128-bit value with `lo` in the lowest 32 bits and zero elsewhere. */
uint128m _mm_cvtsi32_si128_emu(uint32_t lo)
{
    uint128m result = { 0,0,0,0 };
    result.x = lo;
    return result;
}

/* 128-bit value with `lo` in the lowest 64 bits and zero elsewhere. */
uint128m _mm_cvtsi64_si128_emu(uint64_t lo)
{
    uint128m result = { 0,0,0,0 };
    ((uint64_t *)&result)[0] = lo;
    return result;
}

/* 128-bit value built from two 64-bit halves. */
uint128m _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
{
    uint128m result;
    ((uint64_t *)&result)[0] = lo;
    ((uint64_t *)&result)[1] = hi;
    return result;
}

/*
 * Byte shuffle of the fixed table M by the control bytes in `b`: output byte
 * i is 0 when bit 7 of b[i] is set, otherwise M[b[i] & 0xf].
 */
uint128m _mm_shuffle_epi8_emu(uint128m b)
{
    uint128m result = { 0,0,0,0 };
    uint128m M = { 0x2d361b00,0x415a776c,0xf5eec3d8,0x9982afb4 };

    for (int i = 0; i < 16; i++)
    {
        if (((uint8_t *)&b)[i] & 0x80)
        {
            ((uint8_t *)&result)[i] = 0;
        }
        else
        {
            ((uint8_t *)&result)[i] = ((uint8_t *)&M)[((uint8_t *)&b)[i] & 0xf];
        }
    }

    return result;
}

/*
 * Byte-wise right shift specialised for an 8-byte shift: the high 64 bits
 * move to the low half and the high half becomes zero. `imm8` is ignored,
 * so callers must only use this for a shift of 8.
 */
uint128m _mm_srli_si128_emu(uint128m input, int imm8)
{
    uint128m temp;
    ((uint64_t*)&temp)[0] = ((uint64_t*)&input)[1];
    ((uint64_t*)&temp)[1] = 0;
    return temp;
}

/*
 * Per-lane signed 16-bit multiply with rounding: for each of the eight
 * lanes, result = (a * b + 0x4000) >> 15, truncated to 16 bits.
 */
uint128m _mm_mulhrs_epi16_emu(uint128m _a, uint128m _b)
{
    int16_t result[8];

    int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b;

    for (int i = 0; i < 8; i++)
    {
        result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
    }
    return *(uint128m *)result;
}

/*
 * Mixing step functions case_0 .. case_1c.
 *
 * Common parameters:
 *   prand, prandex : two key words (private copies); read and overwritten.
 *   pbuf           : current input word. The expression
 *                    pbuf - (((selector & 1) << 1) - 1) picks its neighbour:
 *                    pbuf + 1 when bit 0 of selector is clear, pbuf - 1 when set.
 *   selector       : low 64 bits of the accumulator on entry.
 *   acc            : running 128-bit accumulator, updated in place.
 */
void case_0(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc)
{
    uint128m temp1 = prandex[0];
    uint128m temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));

    uint128m add1 = _mm_xor_si128_emu(temp1, temp2);

    uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod1, acc[0]);

    uint128m tempa1 = _mm_mulhrs_epi16_emu(acc[0], temp1);
    uint128m tempa2 = _mm_xor_si128_emu(tempa1, temp1);

    uint128m temp12 = prand[0];
    prand[0] = tempa2;

    uint128m temp22 = _mm_load_si128_emu(pbuf);
    uint128m add12 = _mm_xor_si128_emu(temp12, temp22);
    uint128m clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod12, acc[0]);

    uint128m tempb1 = _mm_mulhrs_epi16_emu(acc[0], temp12);
    uint128m tempb2 = _mm_xor_si128_emu(tempb1, temp12);
    prandex[0] = tempb2;
}

void case_4(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc)
{
    uint128m temp1 = prand[0];
    uint128m temp2 = _mm_load_si128_emu(pbuf);
    uint128m add1 = _mm_xor_si128_emu(temp1, temp2);
    uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod1, acc[0]);
    uint128m clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod2, acc[0]);

    uint128m tempa1 = _mm_mulhrs_epi16_emu(acc[0], temp1);
    uint128m tempa2 = _mm_xor_si128_emu(tempa1, temp1);

    uint128m temp12 = prandex[0];
    prandex[0] = tempa2;

    uint128m temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
    uint128m add12 = _mm_xor_si128_emu(temp12, temp22);
    acc[0] = _mm_xor_si128_emu(add12, acc[0]);

    uint128m tempb1 = _mm_mulhrs_epi16_emu(acc[0], temp12);
    uint128m tempb2 = _mm_xor_si128_emu(tempb1, temp12);
    prand[0] = tempb2;
}

void case_8(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc)
{
    uint128m temp1 = prandex[0];
    uint128m temp2 = _mm_load_si128_emu(pbuf);
    uint128m add1 = _mm_xor_si128_emu(temp1, temp2);
    acc[0] = _mm_xor_si128_emu(add1, acc[0]);

    uint128m tempa1 = _mm_mulhrs_epi16_emu(acc[0], temp1);
    uint128m tempa2 = _mm_xor_si128_emu(tempa1, temp1);

    uint128m temp12 = prand[0];
    prand[0] = tempa2;

    uint128m temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
    uint128m add12 = _mm_xor_si128_emu(temp12, temp22);
    uint128m clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod12, acc[0]);
    uint128m clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod22, acc[0]);

    uint128m tempb1 = _mm_mulhrs_epi16_emu(acc[0], temp12);
    uint128m tempb2 = _mm_xor_si128_emu(tempb1, temp12);
    prandex[0] = tempb2;
}

void case_0c(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc)
{
    uint128m temp1 = prand[0];
    uint128m temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
    uint128m add1 = _mm_xor_si128_emu(temp1, temp2);

    // Cannot be zero here: this case is only dispatched when
    // (selector & 0x1c) == 0x0c, so the low 32 bits are non-zero.
    int32_t divisor = ((uint32_t*)&selector)[0];

    acc[0] = _mm_xor_si128_emu(add1, acc[0]);

    int64_t dividend = _mm_cvtsi128_si64_emu(acc[0]);
    int64_t tmpmod = dividend % divisor;
    uint128m modulo = _mm_cvtsi32_si128_emu(tmpmod);
    acc[0] = _mm_xor_si128_emu(modulo, acc[0]);

    uint128m tempa1 = _mm_mulhrs_epi16_emu(acc[0], temp1);
    uint128m tempa2 = _mm_xor_si128_emu(tempa1, temp1);
    dividend &= 1;
    if (dividend)
    {
        uint128m temp12 = prandex[0];
        prandex[0] = tempa2;

        uint128m temp22 = _mm_load_si128_emu(pbuf);
        uint128m add12 = _mm_xor_si128_emu(temp12, temp22);
        uint128m clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
        acc[0] = _mm_xor_si128_emu(clprod12, acc[0]);
        uint128m clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
        acc[0] = _mm_xor_si128_emu(clprod22, acc[0]);

        uint128m tempb1 = _mm_mulhrs_epi16_emu(acc[0], temp12);
        uint128m tempb2 = _mm_xor_si128_emu(tempb1, temp12);
        prand[0] = tempb2;
    }
    else
    {
        uint128m tempb3 = prandex[0];
        prandex[0] = tempa2;
        prand[0] = tempb3;
        uint128m tempb4 = pbuf[0];
        acc[0] = _mm_xor_si128_emu(tempb4, acc[0]);
    }
}

/*
 * A few AES rounds over the two buffer words, keyed from
 * randomsource[prand_idx ...]. `rc`, `tmp` and `sharedMemory1` are consumed
 * by name inside the AES2_EMU / MIX2_EMU macros.
 */
void case_10(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc, __global uint128m *randomsource, uint32_t prand_idx, __local uint32_t *sharedMemory1)
{
    __global uint128m *rc = &randomsource[prand_idx];

    uint128m tmp, temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
    uint128m temp2 = _mm_load_si128_emu(pbuf);

    AES2_EMU(temp1, temp2, 0);
    MIX2_EMU(temp1, temp2);

    AES2_EMU(temp1, temp2, 4);
    MIX2_EMU(temp1, temp2);

    AES2_EMU(temp1, temp2, 8);
    MIX2_EMU(temp1, temp2);

    acc[0] = _mm_xor_si128_emu(temp1, acc[0]);
    acc[0] = _mm_xor_si128_emu(temp2, acc[0]);

    uint128m tempa1 = prand[0];
    uint128m tempa2 = _mm_mulhrs_epi16_emu(acc[0], tempa1);
    uint128m tempa3 = _mm_xor_si128_emu(tempa1, tempa2);

    uint128m tempa4 = prandex[0];
    prandex[0] = tempa3;
    prand[0] = tempa4;
}

/*
 * "Monkins loop": runs (selector >> 61) + 1 iterations, i.e. 1 to 8. Each
 * iteration consumes one key word from rc and, depending on one selector
 * bit, either folds a carry-less product into acc or runs an AES2/MIX2 round.
 */
void case_14(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc, __global uint128m *randomsource, uint32_t prand_idx, __local uint32_t *sharedMemory1)
{
    // pbuf is const-qualified, so the neighbour pointer must be as well
    // (the original dropped the qualifier).
    const uint128m *buftmp = pbuf - (((selector & 1) << 1) - 1);

    uint64_t rounds = selector >> 61;
    __global uint128m *rc = &randomsource[prand_idx];

    uint64_t aesround = 0;
    uint128m onekey, tmp; // tmp is used by MIX2_EMU
    bool loop_c;

    do {
        loop_c = (selector & ((uint64_t)0x10000000 << rounds)) >> 28;
        if (loop_c)
        {
            onekey = rc[0]; rc++;
            uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
            uint128m add1 = _mm_xor_si128_emu(onekey, temp2);
            uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
            acc[0] = _mm_xor_si128_emu(clprod1, acc[0]);
        }
        else
        {
            onekey = rc[0]; rc++;
            uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);

            uint64_t roundidx = aesround++ << 2;
            AES2_EMU(onekey, temp2, roundidx);

            MIX2_EMU(onekey, temp2);

            acc[0] = _mm_xor_si128_emu(onekey, acc[0]);
            acc[0] = _mm_xor_si128_emu(temp2, acc[0]);
        }
    } while (rounds--);

    uint128m tempa1 = (prand[0]);
    uint128m tempa2 = _mm_mulhrs_epi16_emu(acc[0], tempa1);
    uint128m tempa3 = _mm_xor_si128_emu(tempa1, tempa2);

    uint128m tempa4 = (prandex[0]);
    prandex[0] = tempa3;
    prand[0] = tempa4;
}

/*
 * Same loop shape as case_14 (1 to 8 iterations), but each iteration either
 * XORs a 32-bit remainder into acc or folds in a mulhrs of acc with a
 * carry-less product. `sharedMemory1` is unused here.
 */
void case_18(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc, __global uint128m *randomsource, uint32_t prand_idx, __local uint32_t *sharedMemory1)
{
    const uint128m *buftmp = pbuf - (((selector & 1) << 1) - 1);

    uint64_t rounds = selector >> 61;
    __global uint128m *rc = &randomsource[prand_idx];

    uint128m onekey;
    uint64_t loop_c;

    do {
        loop_c = selector & ((uint64_t)0x10000000 << rounds);
        if (loop_c)
        {
            onekey = rc[0]; rc++;
            const uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
            onekey = _mm_xor_si128_emu(onekey, temp2);

            // Non-zero for this case: (selector & 0x1c) == 0x18.
            const int32_t divisor = (uint32_t)selector;
            const int64_t dividend = ((int64_t*)&onekey)[0];
            uint128m modulo = { 0,0,0,0 }; ((int32_t*)&modulo)[0] = (dividend % divisor);
            acc[0] = _mm_xor_si128_emu(modulo, acc[0]);
        }
        else
        {
            onekey = rc[0]; rc++;
            uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
            uint128m add1 = _mm_xor_si128_emu(onekey, temp2);
            onekey = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
            uint128m clprod2 = _mm_mulhrs_epi16_emu(acc[0], onekey);
            acc[0] = clprod2 ^ acc[0];
        }
    } while (rounds--);

    const uint128m tempa3 = (prandex[0]);
    const uint128m tempa4 = _mm_xor_si128_emu(tempa3, acc[0]);
    prandex[0] = onekey;
    prand[0] = tempa4;
}

void case_1c(uint128m *prand, uint128m *prandex, const uint128m *pbuf,
    uint64_t selector, uint128m *acc)
{
    uint128m temp1 = _mm_load_si128_emu(pbuf);
    uint128m temp2 = (prandex[0]);
    uint128m add1 = _mm_xor_si128_emu(temp1, temp2);
    uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
    acc[0] = _mm_xor_si128_emu(clprod1, acc[0]);

    uint128m tempa1 = _mm_mulhrs_epi16_emu(acc[0], temp2);
    uint128m tempa2 = _mm_xor_si128_emu(tempa1, temp2);
    uint128m tempa3 = (prand[0]);

    prand[0] = tempa2;

    acc[0] = _mm_xor_si128_emu(tempa3, acc[0]);
    uint128m temp4 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
    acc[0] = _mm_xor_si128_emu(temp4, acc[0]);
    uint128m tempb1 = _mm_mulhrs_epi16_emu(acc[0], tempa3);
    uint128m tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
    prandex[0] = tempb2;
}


/*
 * Dispatch macros for __verusclmulwithoutreduction64alignedrepeatgpu. They
 * expand inside its do/while loop and use its locals by name.
 *
 * m2 : derive the next selector, key indices, case and buffer pointer from acc.
 * m3 : record the two key indices touched in step i (so the kernel can
 *      restore them later), write the updated key words back, advance i.
 * Cn : if the current case matches, run it, then m3, leave the loop after
 *      32 steps (via `break`), otherwise m2.
 */
#define m2 selector = _mm_cvtsi128_si64_emu(acc);\
    if(i > 0){prand_idx = ((acc.x >> 5) & 511);\
    prandex_idx = ((acc.y) & 511);\
    case_v = selector & 0x1cu;\
    prand = randomsource[prand_idx];\
    prandex = randomsource[prandex_idx];\
    pbuf = buf + (acc.x & 3);}


#define m3 d_fix_r[i] = prand_idx;\
    d_fix_rex[i] = prandex_idx;\
    randomsource[prand_idx] = prand;\
    randomsource[prandex_idx] = prandex;\
    i++;

#define C0 if (case_v == 0 )\
    {case_0(&prand, &prandex, pbuf, selector, &acc);\
    m3\
    if(i==32)break; m2}

#define C1 if (case_v == 4 )\
    {case_4(&prand, &prandex, pbuf, selector, &acc);\
    m3\
    if(i==32)break; m2}

#define C2 if (case_v == 8 )\
    {case_8(&prand, &prandex, pbuf, selector, &acc);\
    m3\
    if(i==32)break; m2}

#define C3 if (case_v == 0xc )\
    { case_0c(&prand, &prandex, pbuf, selector, &acc);\
    m3\
    if(i==32)break; m2}

#define C4 if (case_v == 0x10 )\
    { case_10(&prand, &prandex, pbuf, selector, &acc, randomsource, prand_idx, sharedMemory1);\
    m3\
    if(i==32)break; m2}

#define C5 if (case_v == 0x14 )\
    { case_14(&prand, &prandex, pbuf, selector, &acc, randomsource, prand_idx, sharedMemory1);\
    m3\
    if(i==32)break; m2}

#define C6 if (case_v == 0x18 )\
    { case_18(&prand, &prandex, pbuf, selector, &acc, randomsource, prand_idx, sharedMemory1);\
    m3\
    if(i==32)break; m2}

#define C7 if (case_v == 0x1C )\
    { case_1c(&prand, &prandex, pbuf, selector, &acc);\
    m3\
    if(i==32)break; m2}


/*
 * Runs 32 data-dependent mixing steps over the key and returns the final
 * accumulator.
 *
 * randomsource : per-work-item key; modified in place. Entry 513 seeds acc.
 * buf          : four input words; buf[0] and buf[1] are first XORed with
 *                buf[2] and buf[3] (in place).
 * sharedMemory1: AES lookup tables, forwarded to the AES-based cases.
 * d_fix_r, d_fix_rex : receive, for each of the 32 steps, the two key
 *                indices that were overwritten.
 */
uint128m __verusclmulwithoutreduction64alignedrepeatgpu(__global uint128m * randomsource, uint128m *buf,
    __local uint32_t *sharedMemory1, __local uint16_t *d_fix_r, __local uint16_t *d_fix_rex)
{
    uint128m const *pbuf;
    uint128m acc = randomsource[513];
    buf[0] = buf[0] ^ buf[2];
    buf[1] = buf[1] ^ buf[3];

    uint16_t prand_idx, prandex_idx;
    uint64_t selector;
    uint128m prand;
    uint128m prandex;
    prand_idx = ((acc.x >> 5) & 511);
    prandex_idx = ((acc.y) & 511);

    prand = randomsource[prand_idx];
    prandex = randomsource[prandex_idx];
    uint8_t case_v;
    uint32_t i = 0;
    selector = _mm_cvtsi128_si64_emu(acc);
    case_v = selector & 0x1cu;
    pbuf = buf + (acc.x & 3);
    do
    {
        // Every Cn that matches performs one step and recomputes case_v, so
        // one pass over this chain may perform several steps.
        C5
        C0
        C1
        C2
        C3
        C4
        C5
        C6
        C7
    } while (i != 32);

    return acc;
}


/*
 * Keyed Haraka-512 variant that only computes the single 32-bit output word
 * the kernel compares against the target.
 *
 * in : four input words. rc : round keys (rc[0] .. rc[38] are read via the
 * AES4 / AES4_LAST macros). `tmp` is used by MIX4.
 */
uint32_t haraka512_port_keyed2222(const uint128m *in, __global uint128m *rc, __local uint32_t *sharedMemory1)
{
    uint128m s0, s1, s2, s3, tmp;

    s0 = in[0];
    s1 = in[1];
    s2 = in[2];
    s3 = in[3];

    AES4(s0, s1, s2, s3, 0);
    MIX4(s0, s1, s2, s3);

    AES4(s0, s1, s2, s3, 8);
    MIX4(s0, s1, s2, s3);

    AES4(s0, s1, s2, s3, 16);
    MIX4(s0, s1, s2, s3);

    AES4(s0, s1, s2, s3, 24);

    // Gather only the words that feed the final output word.
    s2.x = s2.y;
    s2.y = s0.y;
    s2.z = s3.y;
    s2.w = s1.y;

    AES4_LAST(s2, 32);

    return s2.z ^ in[3].y;
}


/*
 * Reduces the 128-bit accumulator to 64 bits: A.low ^ Q2.low ^ Q3.low, where
 * Q2 = clmul(A.high, 27) and Q3 is a table shuffle of Q2's high half.
 */
ulong precompReduction64(uint128m A) {
    uint128m Q2 = _mm_clmulepi64_si128_emu2(A);
    uint128m Q3 = _mm_shuffle_epi8_emu(_mm_srli_si128_emu(Q2, 8));

    uint128m final;
    final.x = xor3x(A.x, Q2.x, Q3.x);
    final.y = xor3x(A.y, Q2.y, Q3.y);

    // Only the low 64 bits of `final` are initialised and returned.
    return _mm_cvtsi128_si64_emu(final);
}


/*
 * One hash attempt per work-item, nonce = startNonce[0] + global id.
 *
 * startNonce     : base nonce.
 * blockhash_half : four words of pre-hashed block data.
 * data_keylarge  : per-work-item key storage, VERUS_KEY_SIZE128 words each,
 *                  indexed by (global id & TOTAL_MAX).
 * d_key_input    : pristine key used to initialise / restore data_keylarge.
 * target         : target[7] is the 32-bit threshold.
 * resNonce       : resNonce[0] receives a nonce whose hash is below target.
 */
__kernel __attribute__((reqd_work_group_size(THREADS, 1, 1)))
void verus_gpu_hash(__constant uint *startNonce,
    __constant uint128m *blockhash_half, __global uint128m *data_keylarge, constant uint128m * d_key_input, __global uint *target, __global uint *resNonce)
{
    const uint thread = get_global_id(0);
    uint128m mid;
    uint128m s[4];

    const uint lid = get_local_id(0);
    const uint nounce = startNonce[0] + thread;

    __local uint sharedMemory1[4][256];
    __local uint16_t sharedrand[THREADS * 32];
    __local uint16_t sharedrandex[THREADS * 32];

    __global uint128m *pkey = &data_keylarge[0] + ((thread & TOTAL_MAX) * VERUS_KEY_SIZE128);

    s[0] = blockhash_half[0];
    s[1] = blockhash_half[1];
    s[2] = blockhash_half[2];
    s[3] = blockhash_half[3];

    // Cooperative copy of the AES tables into local memory.
    for (int i = lid; i < 256; i += THREADS) {
        sharedMemory1[0][i] = saes_table[0][i];
        sharedMemory1[1][i] = saes_table[1][i];
        sharedMemory1[2][i] = saes_table[2][i];
        sharedMemory1[3][i] = saes_table[3][i];
    }
    // Each work-item wrote only part of the tables but reads all of them in
    // aesenc(), so the whole work-group must synchronise here. Without this
    // barrier the table reads race with other work-items' writes.
    barrier(CLK_LOCAL_MEM_FENCE);

    // First launch: initialise this work-item's key from the pristine copy.
    if (startNonce[0] == 0) {
        for (int i = 0; i < VERUS_KEY_SIZE128; i++) {
            pkey[i] = d_key_input[i];
        }
    }

    s[2].x = nounce;

    mid = __verusclmulwithoutreduction64alignedrepeatgpu(pkey, s, sharedMemory1[0], &sharedrand[lid * 32], &sharedrandex[lid * 32]);
    mid.x ^= 0x00010000;

    // The call above modified s[0] and s[1] in place; reload them.
    s[0] = blockhash_half[0];
    s[1] = blockhash_half[1];

    uint2 acc2; ((ulong*)&acc2)[0] = precompReduction64(mid);

    // Place the 64-bit intermediate at byte offset 47 of the 64-byte input,
    // then repeat it (rotated by 8 bits) through the last word.
    ((uint8_t*)&s)[47] = acc2.x & 0xff;
    s[3].x = LIMMY_R(acc2.x, acc2.y, 8);
    s[3].y = LIMMY_R(acc2.y, acc2.x, 8);
    s[3].z = s[3].x;
    s[3].w = s[3].y;

    acc2.x = acc2.x & 511;

    const uint hash = haraka512_port_keyed2222(s, &pkey[acc2.x], sharedMemory1[0]);

    if (hash < target[7]) {
        resNonce[0] = nounce;
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    // Restore the key entries this work-item overwrote during mixing.
#pragma unroll 32
    for (int i = 0; i < 32; i++)
    {
        pkey[sharedrand[(lid * 32) + i]] = d_key_input[sharedrand[(lid * 32) + i]];
        pkey[sharedrandex[(lid * 32) + i]] = d_key_input[sharedrandex[(lid * 32) + i]];
    }
}

-------------------------------------------------------------------------------- /main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/main.o -------------------------------------------------------------------------------- /param-nvidia.h: -------------------------------------------------------------------------------- 1 | #define THRD 32 2 | #define PARAM_N 200 3 | #define PARAM_K 9 4 | #define PREFIX (PARAM_N / (PARAM_K + 1)) 5 | #define NR_INPUTS (1 << PREFIX) 6 | // Approximate log base 2 of number of elements in hash tables 7 | #define APX_NR_ELMS_LOG (PREFIX + 1) 8 | // Number of rows and slots is affected by this. 20 offers the best performance 9 | // but occasionally misses ~1% of solutions. 10 | #define NR_ROWS_LOG 16 11 | 12 | // Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md 13 | #define OPTIM_SIMPLIFY_ROUND 1 14 | 15 | // Number of collision items to track, per thread 16 | #define THREADS_PER_ROW 16 17 | #define ROWS_PER_WORKGROUP (THRD/THREADS_PER_ROW) 18 | #define LDS_COLL_SIZE (NR_SLOTS * 15 * (THRD / THREADS_PER_ROW)) 19 | 20 | // Ratio of time of sleeping before rechecking if task is done (0-1) 21 | #define SLEEP_RECHECK_RATIO 0.60 22 | // Ratio of time to busy wait for the solution (0-1) 23 | // The higher value the higher CPU usage with Nvidia 24 | #define SLEEP_SKIP_RATIO 0.005 25 | 26 | // Make hash tables OVERHEAD times larger than necessary to store the average 27 | // number of elements per row. The ideal value is as small as possible to 28 | // reduce memory usage, but not too small or else elements are dropped from the 29 | // hash tables. 
30 | // 31 | // The actual number of elements per row is closer to the theoretical average 32 | // (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be 33 | // smaller. 34 | // 35 | // Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease 36 | // performance as they cause VRAM channel conflicts. 37 | #if NR_ROWS_LOG == 16 38 | // #error "NR_ROWS_LOG = 16 is currently broken - do not use" 39 | #define OVERHEAD 2 40 | #define COLLISION_TYPES_NUM 16u 41 | #define COLLISION_BUFFER_SIZE 16u 42 | #elif NR_ROWS_LOG == 18 43 | #define OVERHEAD 4 44 | #define COLLISION_TYPES_NUM 4u 45 | #define COLLISION_BUFFER_SIZE 16u 46 | #elif NR_ROWS_LOG == 19 47 | #define OVERHEAD 5 48 | #define COLLISION_TYPES_NUM 2u 49 | #define COLLISION_BUFFER_SIZE 16u 50 | #elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND 51 | #define OVERHEAD 6 52 | #elif NR_ROWS_LOG == 20 53 | #define OVERHEAD 9 54 | #endif 55 | 56 | #define NR_ROWS (1 << NR_ROWS_LOG) 57 | #define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) 58 | // Length of 1 element (slot) in byte 59 | #define SLOT_LEN 32 60 | // Total size of hash table 61 | #define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) 62 | // Length of Zcash block header, nonce (part of header) 63 | #define ZCASH_BLOCK_HEADER_LEN 140 64 | // Offset of nTime in header 65 | #define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) 66 | // Length of nonce 67 | #define ZCASH_NONCE_LEN 32 68 | // Length of encoded representation of solution size 69 | #define ZCASH_SOLSIZE_LEN 3 70 | // Solution size (1344 = 0x540) represented as a compact integer, in hex 71 | #define ZCASH_SOLSIZE_HEX "fd4005" 72 | // Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) 73 | #define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) 74 | // Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization 75 | #define N_ZERO_BYTES 12 76 | // Number of bytes Zcash needs out of Blake 77 | #define ZCASH_HASH_LEN 50 78 | // Number of 
wavefronts per SIMD for the Blake kernel. 79 | // Blake is ALU-bound (beside the atomic counter being incremented) so we need 80 | // at least 2 wavefronts per SIMD to hide the 2-clock latency of integer 81 | // instructions. 10 is the max supported by the hw. 82 | #define BLAKE_WPS 10 83 | // Maximum number of solutions reported by kernel to host 84 | #define MAX_SOLS 10 85 | // Length of SHA256 target 86 | #define SHA256_TARGET_LEN (256 / 8) 87 | 88 | #if (NR_SLOTS < 16) 89 | #define BITS_PER_ROW 4 90 | #define ROWS_PER_UINT 8 91 | #define ROW_MASK 0x0F 92 | #else 93 | #define BITS_PER_ROW 8 94 | #define ROWS_PER_UINT 4 95 | #define ROW_MASK 0xFF 96 | #endif 97 | 98 | // Optional features 99 | #undef ENABLE_DEBUG 100 | 101 | /* 102 | ** Return the offset of Xi in bytes from the beginning of the slot. 103 | */ 104 | #define xi_offset_for_round(round) (8 + ((round) / 2) * 4) 105 | 106 | // An (uncompressed) solution stores (1 << PARAM_K) 32-bit values 107 | #define SOL_SIZE ((1 << PARAM_K) * 4) 108 | typedef struct sols_s 109 | { 110 | uint nr; 111 | uint likely_invalids; 112 | uchar valid[MAX_SOLS]; 113 | uint values[MAX_SOLS][(1 << PARAM_K)]; 114 | } sols_t; 115 | -------------------------------------------------------------------------------- /param.h: -------------------------------------------------------------------------------- 1 | // SILENTARMY v5 Standalone Version 2 | // Copyright 2016-2017 zawawa @ bitcointalk.org 3 | // 4 | // The initial version of this software was based on: 5 | // SILENTARMY v5 6 | // The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil, eXtremal 7 | // 8 | // This program is free software : you can redistribute it and / or modify 9 | // it under the terms of the GNU General Public License as published by 10 | // the Free Software Foundation, either version 3 of the License, or 11 | // (at your option) any later version. 
12 | // 13 | // This program is distributed in the hope that it will be useful, 14 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with this program. If not, see <http://www.gnu.org/licenses/>. 20 | 21 | 22 | 23 | #if !defined(__OPENCL_VERSION__) && defined(cl_amd_fp64) 24 | #define uint uint32_t 25 | #define uchar uint8_t 26 | #endif 27 | #if defined(cl_amd_fp64) && !defined(AMD) 28 | #define AMD 29 | #endif 30 | #if (defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__) || defined(__Oland__)) && !defined(AMD_LEGACY) 31 | #define AMD_LEGACY 32 | #endif 33 | #ifdef cl_nv_pragma_unroll 34 | #define NVIDIA 35 | #endif 36 | 37 | 38 | 39 | // 40 | // Parameters for Hash Tables 41 | // 42 | 43 | // There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows. 44 | // Each row contains NR_SLOTS slots. 45 | 46 | //#define NR_ROWS_LOG 12 47 | //#define NR_ROWS 4096 48 | //#define NR_SLOTS 684 49 | //#define LDS_COLL_SIZE 793 50 | 51 | #define NR_ROWS_LOG_13(round) 0//((round) <= 1) 52 | #define MAX_NR_ROWS 4096//8192 53 | 54 | #define _NR_ROWS_LOG(round) (NR_ROWS_LOG_13(round) ? 13 : 12) 55 | #define _NR_SLOTS(round) (NR_ROWS_LOG_13(round) ? 360 : 684) 56 | #define _LDS_COLL_SIZE(round) (NR_ROWS_LOG_13(round) ? 400 : 794) 57 | #define _NR_ROWS(round) (NR_ROWS_LOG_13(round) ? 8192 : 4096) 58 | 59 | #define LOCAL_WORK_SIZE_ROUND0 256 60 | #define LOCAL_WORK_SIZE 256 61 | #define LOCAL_WORK_SIZE_POTENTIAL_SOLS 256 62 | #define LOCAL_WORK_SIZE_SOLS 256 63 | 64 | #if defined(AMD) 65 | #define THREADS_PER_WRITE(round) (((round) <= 5) ? 
2 : 1) 66 | #else 67 | #define THREADS_PER_WRITE(round) 1 68 | #endif 69 | 70 | #if defined(AMD) && !defined(AMD_LEGACY) 71 | #define OPTIM_24BYTE_WRITES 72 | #endif 73 | #define OPTIM_16BYTE_WRITES 74 | #if 1//!defined(AMD_LEGACY) 75 | #define OPTIM_8BYTE_WRITES 76 | #endif 77 | 78 | //#define OPTIM_UINT_ROW_COUNTERS 79 | //#define OPTIM_FAST_INTEGER_DIVISION 80 | #define OPTIM_COMPACT_ROW_COUNTERS 81 | #define OPTIM_IGNORE_ROW_COUNTER_OVERFLOWS 82 | #if defined(AMD) && !defined(AMD_LEGACY) 83 | #define OPTIM_ON_THE_FLY_COLLISION_SEARCH 84 | #endif 85 | 86 | #define ADJUSTED_LDS_ARRAY_SIZE(n) (n) 87 | 88 | 89 | 90 | #define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ 91 | ((round) == 1) ? 6 : \ 92 | ((round) == 2) ? 5 : \ 93 | ((round) == 3) ? 5 : \ 94 | ((round) == 4) ? 4 : \ 95 | ((round) == 5) ? 4 : \ 96 | ((round) == 6) ? 3 : \ 97 | ((round) == 7) ? 2 : \ 98 | 1) 99 | 100 | 101 | 102 | #define PARAM_N 200 103 | #define PARAM_K 9 104 | #define PREFIX (PARAM_N / (PARAM_K + 1)) 105 | #define NR_INPUTS (1 << PREFIX) 106 | // Length of 1 element (slot) in byte 107 | #define MAX_SLOT_LEN 32 108 | #define _SLOT_LEN(round) ((UINTS_IN_XI(round) >= 4) ? 32 : (UINTS_IN_XI(round) >= 2) ? 
16 : 8) 109 | // Total size of hash table 110 | #define HASH_TABLE_SIZE(round) (_NR_ROWS(round) * _NR_SLOTS(round) * _SLOT_LEN(round)) 111 | // Length of Zcash block header, nonce (part of header) 112 | #define ZCASH_BLOCK_HEADER_LEN 140 113 | // Offset of nTime in header 114 | #define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) 115 | // Length of nonce 116 | #define ZCASH_NONCE_LEN 32 117 | // Length of encoded representation of solution size 118 | #define ZCASH_SOLSIZE_LEN 3 119 | // Solution size (1344 = 0x540) represented as a compact integer, in hex 120 | #define ZCASH_SOLSIZE_HEX "fd4005" 121 | // Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) 122 | #define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) 123 | // Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization 124 | #define N_ZERO_BYTES 12 125 | // Number of bytes Zcash needs out of Blake 126 | #define ZCASH_HASH_LEN 50 127 | // Number of wavefronts per SIMD for the Blake kernel. 128 | // Blake is ALU-bound (beside the atomic counter being incremented) so we need 129 | // at least 2 wavefronts per SIMD to hide the 2-clock latency of integer 130 | // instructions. 10 is the max supported by the hw. 
#define BLAKE_WPS 10
// Maximum number of solutions reported by kernel to host
#define MAX_SOLS 11
#define MAX_POTENTIAL_SOLS 4096
// Length of SHA256 target
#define SHA256_TARGET_LEN (256 / 8)

// Row-counter packing: BITS_PER_ROW bits per counter, ROWS_PER_UINT counters
// per 32-bit uint, ROW_MASK to isolate one counter. The compact layout is
// 10 bits x 3 counters per uint; the fallback layout is 16 bits x 2.
#ifdef OPTIM_UINT_ROW_COUNTERS
#define BITS_PER_ROW 32
#define ROWS_PER_UINT 1
#define ROW_MASK 0xffffffff
#elif defined(OPTIM_COMPACT_ROW_COUNTERS)
#define BITS_PER_ROW 10
#define ROWS_PER_UINT (32 / BITS_PER_ROW)
#define ROW_MASK ((1 << BITS_PER_ROW) - 1)
#else
#define BITS_PER_ROW 16
#define ROWS_PER_UINT (32 / BITS_PER_ROW)
#define ROW_MASK ((1 << BITS_PER_ROW) - 1)
#endif

// Bytes needed to hold MAX_NR_ROWS packed counters (uint count rounded up).
#define ROW_COUNTERS_SIZE (((MAX_NR_ROWS + ROWS_PER_UINT - 1) / ROWS_PER_UINT) * sizeof(uint))



// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
#define SOL_SIZE ((1 << PARAM_K) * 4)
// Solution buffer: up to MAX_SOLS entries of (1 << PARAM_K) values each, with
// one valid[] byte per entry.
// NOTE(review): nr is presumably the number of entries written and
// likely_invalids a diagnostic counter - confirm against the kernels.
typedef struct sols_s
{
    uint	nr;
    uint	likely_invalids;
    uchar	valid[MAX_SOLS];
    uint	values[MAX_SOLS][(1 << PARAM_K)];
}		sols_t;

// Candidate buffer: up to MAX_POTENTIAL_SOLS entries of two uints each.
// NOTE(review): the meaning of the two values is not visible here - confirm.
typedef struct potential_sols_s
{
    uint	nr;
    uint	values[MAX_POTENTIAL_SOLS][2];
} potential_sols_t;

// A reference packs one row index and two slot indices into 32 bits:
//   [ row | slot1 | slot0 ]
// Each slot field is (32 - _NR_ROWS_LOG) / 2 bits wide (10 bits for the
// 12-bit row configuration) and the row occupies the bits above both fields.
#define INPUT_ENCODING_SLOT_BITS(round) ((32 - _NR_ROWS_LOG(round)) / 2)
#define INPUT_ENCODING_SLOT_MASK(round) ((1 << INPUT_ENCODING_SLOT_BITS(round)) - 1)
#define INPUT_ENCODING_ROW_POS(round)   (INPUT_ENCODING_SLOT_BITS(round) * 2)

// Build a reference. The slot indices are masked to their field width; the
// row is shifted in unmasked.
#define ENCODE_INPUTS(round, row, slot0, slot1) ( ( (row) << INPUT_ENCODING_ROW_POS(round) ) \
	| (((slot1) & INPUT_ENCODING_SLOT_MASK(round)) << INPUT_ENCODING_SLOT_BITS(round)) \
	| ( (slot0) & INPUT_ENCODING_SLOT_MASK(round) ))

// Field extractors, inverse of ENCODE_INPUTS.
#define DECODE_ROW(round, ref)   ((ref) >> INPUT_ENCODING_ROW_POS(round))
#define DECODE_SLOT1(round, ref) (((ref) >> INPUT_ENCODING_SLOT_BITS(round)) & INPUT_ENCODING_SLOT_MASK(round))
#define DECODE_SLOT0(round, ref) ( (ref)                                     & INPUT_ENCODING_SLOT_MASK(round))



// Smallest prime >= n for n <= 1009 (2 for anything <= 2). Values above 1009
// are returned unchanged, i.e. they are NOT rounded up to a prime.
// The argument is evaluated many times: pass side-effect-free expressions.
#define NEXT_PRIME_NO(n) \
	(((n) <= 2) ? 2 : \
	((n) <= 3) ? 3 : \
	((n) <= 5) ? 5 : \
	((n) <= 7) ? 7 : \
	((n) <= 11) ? 11 : \
	((n) <= 13) ? 13 : \
	((n) <= 17) ? 17 : \
	((n) <= 19) ? 19 : \
	((n) <= 23) ? 23 : \
	((n) <= 29) ? 29 : \
	((n) <= 31) ? 31 : \
	((n) <= 37) ? 37 : \
	((n) <= 41) ? 41 : \
	((n) <= 43) ? 43 : \
	((n) <= 47) ? 47 : \
	((n) <= 53) ? 53 : \
	((n) <= 59) ? 59 : \
	((n) <= 61) ? 61 : \
	((n) <= 67) ? 67 : \
	((n) <= 71) ? 71 : \
	((n) <= 73) ? 73 : \
	((n) <= 79) ? 79 : \
	((n) <= 83) ? 83 : \
	((n) <= 89) ? 89 : \
	((n) <= 97) ? 97 : \
	((n) <= 101) ? 101 : \
	((n) <= 103) ? 103 : \
	((n) <= 107) ? 107 : \
	((n) <= 109) ? 109 : \
	((n) <= 113) ? 113 : \
	((n) <= 127) ? 127 : \
	((n) <= 131) ? 131 : \
	((n) <= 137) ? 137 : \
	((n) <= 139) ? 139 : \
	((n) <= 149) ? 149 : \
	((n) <= 151) ? 151 : \
	((n) <= 157) ? 157 : \
	((n) <= 163) ? 163 : \
	((n) <= 167) ? 167 : \
	((n) <= 173) ? 173 : \
	((n) <= 179) ? 179 : \
	((n) <= 181) ? 181 : \
	((n) <= 191) ? 191 : \
	((n) <= 193) ? 193 : \
	((n) <= 197) ? 197 : \
	((n) <= 199) ? 199 : \
	((n) <= 211) ? 211 : \
	((n) <= 223) ? 223 : \
	((n) <= 227) ? 227 : \
	((n) <= 229) ? 229 : \
	((n) <= 233) ? 233 : \
	((n) <= 239) ? 239 : \
	((n) <= 241) ? 241 : \
	((n) <= 251) ? 251 : \
	((n) <= 257) ? 257 : \
	((n) <= 263) ? 263 : \
	((n) <= 269) ? 269 : \
	((n) <= 271) ? 271 : \
	((n) <= 277) ? 277 : \
	((n) <= 281) ? 281 : \
	((n) <= 283) ? 283 : \
	((n) <= 293) ? 293 : \
	((n) <= 307) ? 307 : \
	((n) <= 311) ? 311 : \
	((n) <= 313) ? 313 : \
	((n) <= 317) ? 317 : \
	((n) <= 331) ? 331 : \
	((n) <= 337) ? 337 : \
	((n) <= 347) ? 347 : \
	((n) <= 349) ? 349 : \
	((n) <= 353) ? 353 : \
	((n) <= 359) ? 359 : \
	((n) <= 367) ? 367 : \
	((n) <= 373) ? 373 : \
	((n) <= 379) ? 379 : \
	((n) <= 383) ? 383 : \
	((n) <= 389) ? 389 : \
	((n) <= 397) ? 397 : \
	((n) <= 401) ? 401 : \
	((n) <= 409) ? 409 : \
	((n) <= 419) ? 419 : \
	((n) <= 421) ? 421 : \
	((n) <= 431) ? 431 : \
	((n) <= 433) ? 433 : \
	((n) <= 439) ? 439 : \
	((n) <= 443) ? 443 : \
	((n) <= 449) ? 449 : \
	((n) <= 457) ? 457 : \
	((n) <= 461) ? 461 : \
	((n) <= 463) ? 463 : \
	((n) <= 467) ? 467 : \
	((n) <= 479) ? 479 : \
	((n) <= 487) ? 487 : \
	((n) <= 491) ? 491 : \
	((n) <= 499) ? 499 : \
	((n) <= 503) ? 503 : \
	((n) <= 509) ? 509 : \
	((n) <= 521) ? 521 : \
	((n) <= 523) ? 523 : \
	((n) <= 541) ? 541 : \
	((n) <= 547) ? 547 : \
	((n) <= 557) ? 557 : \
	((n) <= 563) ? 563 : \
	((n) <= 569) ? 569 : \
	((n) <= 571) ? 571 : \
	((n) <= 577) ? 577 : \
	((n) <= 587) ? 587 : \
	((n) <= 593) ? 593 : \
	((n) <= 599) ? 599 : \
	((n) <= 601) ? 601 : \
	((n) <= 607) ? 607 : \
	((n) <= 613) ? 613 : \
	((n) <= 617) ? 617 : \
	((n) <= 619) ? 619 : \
	((n) <= 631) ? 631 : \
	((n) <= 641) ? 641 : \
	((n) <= 643) ? 643 : \
	((n) <= 647) ? 647 : \
	((n) <= 653) ? 653 : \
	((n) <= 659) ? 659 : \
	((n) <= 661) ? 661 : \
	((n) <= 673) ? 673 : \
	((n) <= 677) ? 677 : \
	((n) <= 683) ? 683 : \
	((n) <= 691) ? 691 : \
	((n) <= 701) ? 701 : \
	((n) <= 709) ? 709 : \
	((n) <= 719) ? 719 : \
	((n) <= 727) ? 727 : \
	((n) <= 733) ? 733 : \
	((n) <= 739) ? 739 : \
	((n) <= 743) ? 743 : \
	((n) <= 751) ? 751 : \
	((n) <= 757) ? 757 : \
	((n) <= 761) ? 761 : \
	((n) <= 769) ? 769 : \
	((n) <= 773) ? 773 : \
	((n) <= 787) ? 787 : \
	((n) <= 797) ? 797 : \
	((n) <= 809) ? 809 : \
	((n) <= 811) ? 811 : \
	((n) <= 821) ? 821 : \
	((n) <= 823) ? 823 : \
	((n) <= 827) ? 827 : \
	((n) <= 829) ? 829 : \
	((n) <= 839) ? 839 : \
	((n) <= 853) ? 853 : \
	((n) <= 857) ? 857 : \
	((n) <= 859) ? 859 : \
	((n) <= 863) ? 863 : \
	((n) <= 877) ? 877 : \
	((n) <= 881) ? 881 : \
	((n) <= 883) ? 883 : \
	((n) <= 887) ? 887 : \
	((n) <= 907) ? 907 : \
	((n) <= 911) ? 911 : \
	((n) <= 919) ? 919 : \
	((n) <= 929) ? 929 : \
	((n) <= 937) ? 937 : \
	((n) <= 941) ? 941 : \
	((n) <= 947) ? 947 : \
	((n) <= 953) ? 953 : \
	((n) <= 967) ? 967 : \
	((n) <= 971) ? 971 : \
	((n) <= 977) ? 977 : \
	((n) <= 983) ? 983 : \
	((n) <= 991) ? 991 : \
	((n) <= 997) ? 997 : \
	((n) <= 1009) ? 1009 : \
	(n))

// Smallest power of two >= n, with a floor of 2.
// The argument is evaluated many times: pass side-effect-free expressions.
#define NEXT_POWER_OF_TWO(n) \
	(((n) <= 2) ? 2 : \
	((n) <= 4) ? 4 : \
	((n) <= 8) ? 8 : \
	((n) <= 16) ? 16 : \
	((n) <= 32) ? 32 : \
	((n) <= 64) ? 64 : \
	((n) <= 128) ? 128 : \
	((n) <= 256) ? 256 : \
	((n) <= 512) ? 512 : \
	((n) <= 1024) ? 1024 : \
	((n) <= 2048) ? 2048 : \
	((n) <= 4096) ? 4096 : \
	((n) <= 8192) ? 8192 : \
	((n) <= 16384) ? 16384 : \
	((n) <= 32768) ? 
32768 : \ 374 | (n)) 375 | 376 | #define SLOT_INDEX_TYPE ushort 377 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | ./silentarmy.py -c stratum+tcp://na.luckpool.net:3956 -u REoPcdGXthL5yeTCrJtrQv5xhYTknbFbec.amdwin -p d=4 --use=0 2 | -------------------------------------------------------------------------------- /sa-solver.iobj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/sa-solver.iobj -------------------------------------------------------------------------------- /sa-solver.ipdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/sa-solver.ipdb -------------------------------------------------------------------------------- /sa-solver.pdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/sa-solver.pdb -------------------------------------------------------------------------------- /sha256.c: -------------------------------------------------------------------------------- 1 | /* Crypto/Sha256.c -- SHA-256 Hash 2 | 2016-11-04 : Marc Bevand : A few changes to make it more self-contained 3 | 2010-06-11 : Igor Pavlov : Public domain 4 | This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ 5 | 6 | #include 7 | #include 8 | #include "sha256.h" 9 | 10 | /* define it for speed optimization */ 11 | /* #define _SHA256_UNROLL */ 12 | /* #define _SHA256_UNROLL2 */ 13 | 14 | #define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) 15 | #define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 16 | 17 | void Sha256_Init(CSha256 *p) 18 | { 19 | p->state[0] = 0x6a09e667; 20 | p->state[1] = 0xbb67ae85; 21 | p->state[2] = 0x3c6ef372; 22 | p->state[3] = 0xa54ff53a; 23 | p->state[4] = 0x510e527f; 24 | p->state[5] = 0x9b05688c; 25 | p->state[6] = 0x1f83d9ab; 26 | p->state[7] = 0x5be0cd19; 27 | p->count = 0; 28 | } 29 | 30 | #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) 31 | #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) 32 | #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) 33 | #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) 34 | 35 | #define blk0(i) (W[i] = data[i]) 36 | #define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15])) 37 | 38 | #define Ch(x,y,z) (z^(x&(y^z))) 39 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) 40 | 41 | #define a(i) T[(0-(i))&7] 42 | #define b(i) T[(1-(i))&7] 43 | #define c(i) T[(2-(i))&7] 44 | #define d(i) T[(3-(i))&7] 45 | #define e(i) T[(4-(i))&7] 46 | #define f(i) T[(5-(i))&7] 47 | #define g(i) T[(6-(i))&7] 48 | #define h(i) T[(7-(i))&7] 49 | 50 | 51 | #ifdef _SHA256_UNROLL2 52 | 53 | #define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\ 54 | d += h; h += S0(a) + Maj(a, b, c) 55 | 56 | #define RX_8(i) \ 57 | R(a,b,c,d,e,f,g,h, i); \ 58 | R(h,a,b,c,d,e,f,g, i+1); \ 59 | R(g,h,a,b,c,d,e,f, i+2); \ 60 | R(f,g,h,a,b,c,d,e, i+3); \ 61 | R(e,f,g,h,a,b,c,d, i+4); \ 62 | R(d,e,f,g,h,a,b,c, i+5); \ 63 | R(c,d,e,f,g,h,a,b, i+6); \ 64 | R(b,c,d,e,f,g,h,a, i+7) 65 | 66 | #else 67 | 68 | #define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\ 69 | d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), 
c(i)) 70 | 71 | #ifdef _SHA256_UNROLL 72 | 73 | #define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); 74 | 75 | #endif 76 | 77 | #endif 78 | 79 | static const uint32_t K[64] = { 80 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 81 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 82 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 83 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 84 | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 85 | 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 86 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 87 | 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 88 | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 89 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 90 | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 91 | 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 92 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 93 | 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 94 | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 95 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 96 | }; 97 | 98 | static void Sha256_Transform(uint32_t *state, const uint32_t *data) 99 | { 100 | uint32_t W[16]; 101 | unsigned j; 102 | #ifdef _SHA256_UNROLL2 103 | uint32_t a,b,c,d,e,f,g,h; 104 | a = state[0]; 105 | b = state[1]; 106 | c = state[2]; 107 | d = state[3]; 108 | e = state[4]; 109 | f = state[5]; 110 | g = state[6]; 111 | h = state[7]; 112 | #else 113 | uint32_t T[8]; 114 | for (j = 0; j < 8; j++) 115 | T[j] = state[j]; 116 | #endif 117 | 118 | for (j = 0; j < 64; j += 16) 119 | { 120 | #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2) 121 | RX_8(0); RX_8(8); 122 | #else 123 | unsigned i; 124 | for (i = 0; i < 16; i++) { R(i); } 125 | #endif 126 | } 127 | 128 | #ifdef _SHA256_UNROLL2 129 | state[0] += a; 130 | state[1] += b; 131 | state[2] += c; 132 | state[3] += d; 133 | state[4] += e; 134 | state[5] += f; 135 | state[6] += g; 136 | state[7] += h; 137 | #else 138 | for (j = 0; j < 8; j++) 139 | state[j] += T[j]; 
140 | #endif 141 | 142 | /* Wipe variables */ 143 | /* memset(W, 0, sizeof(W)); */ 144 | /* memset(T, 0, sizeof(T)); */ 145 | } 146 | 147 | #undef S0 148 | #undef S1 149 | #undef s0 150 | #undef s1 151 | 152 | static void Sha256_WriteByteBlock(CSha256 *p) 153 | { 154 | uint32_t data32[16]; 155 | unsigned i; 156 | for (i = 0; i < 16; i++) 157 | data32[i] = 158 | ((uint32_t)(p->buffer[i * 4 ]) << 24) + 159 | ((uint32_t)(p->buffer[i * 4 + 1]) << 16) + 160 | ((uint32_t)(p->buffer[i * 4 + 2]) << 8) + 161 | ((uint32_t)(p->buffer[i * 4 + 3])); 162 | Sha256_Transform(p->state, data32); 163 | } 164 | 165 | void Sha256_Update(CSha256 *p, const uint8_t *data, size_t size) 166 | { 167 | uint32_t curBufferPos = (uint32_t)p->count & 0x3F; 168 | while (size > 0) 169 | { 170 | p->buffer[curBufferPos++] = *data++; 171 | p->count++; 172 | size--; 173 | if (curBufferPos == 64) 174 | { 175 | curBufferPos = 0; 176 | Sha256_WriteByteBlock(p); 177 | } 178 | } 179 | } 180 | 181 | void Sha256_Final(CSha256 *p, uint8_t *digest) 182 | { 183 | uint64_t lenInBits = (p->count << 3); 184 | uint32_t curBufferPos = (uint32_t)p->count & 0x3F; 185 | unsigned i; 186 | p->buffer[curBufferPos++] = 0x80; 187 | while (curBufferPos != (64 - 8)) 188 | { 189 | curBufferPos &= 0x3F; 190 | if (curBufferPos == 0) 191 | Sha256_WriteByteBlock(p); 192 | p->buffer[curBufferPos++] = 0; 193 | } 194 | for (i = 0; i < 8; i++) 195 | { 196 | p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56); 197 | lenInBits <<= 8; 198 | } 199 | Sha256_WriteByteBlock(p); 200 | 201 | for (i = 0; i < 8; i++) 202 | { 203 | *digest++ = (uint8_t)(p->state[i] >> 24); 204 | *digest++ = (uint8_t)(p->state[i] >> 16); 205 | *digest++ = (uint8_t)(p->state[i] >> 8); 206 | *digest++ = (uint8_t)(p->state[i]); 207 | } 208 | Sha256_Init(p); 209 | } 210 | 211 | void Sha256_Onestep(const uint8_t *data, size_t size, uint8_t *digest) 212 | { 213 | CSha256 p; 214 | Sha256_Init(&p); 215 | Sha256_Update(&p, data, size); 216 | Sha256_Final(&p, digest); 217 
| } 218 | -------------------------------------------------------------------------------- /sha256.h: -------------------------------------------------------------------------------- 1 | /* Sha256.h -- SHA-256 Hash 2 | 2016-11-04 : Marc Bevand : A few changes to make it more self-contained 3 | 2010-06-11 : Igor Pavlov : Public domain */ 4 | 5 | #ifndef __CRYPTO_SHA256_H 6 | #define __CRYPTO_SHA256_H 7 | 8 | #define SHA256_DIGEST_SIZE 32 9 | 10 | typedef struct 11 | { 12 | uint32_t state[8]; 13 | uint64_t count; 14 | uint8_t buffer[64]; 15 | } CSha256; 16 | 17 | void Sha256_Init(CSha256 *p); 18 | void Sha256_Update(CSha256 *p, const uint8_t *data, size_t size); 19 | void Sha256_Final(CSha256 *p, uint8_t *digest); 20 | void Sha256_Onestep(const uint8_t *data, size_t size, uint8_t *digest); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /sha256.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkins1010/AMDVerusCoin/2706c705b1ed7cb23a316d574d6b8bc4dee778ce/sha256.o -------------------------------------------------------------------------------- /silenarmy.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['silenarmy.py'], 7 | pathex=['C:\\Users\\Chris\\Desktop\\silentarmy-windows - Copy'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher, 17 | noarchive=False) 18 | pyz = PYZ(a.pure, a.zipped_data, 19 | cipher=block_cipher) 20 | exe = EXE(pyz, 21 | a.scripts, 22 | [], 23 | exclude_binaries=True, 24 | name='silenarmy', 25 | debug=False, 26 | bootloader_ignore_signals=False, 27 | strip=False, 28 | upx=True, 29 | console=True ) 30 | coll = COLLECT(exe, 31 | a.binaries, 32 | 
# (tail of /silenarmy.spec, continued from the previous line of this dump)
#                a.zipfiles,
#                a.datas,
#                strip=False,
#                upx=True,
#                name='silenarmy')
# --------------------------------------------------------------------------------
# /silentarmy:
# --------------------------------------------------------------------------------
#!/usr/bin/python3

from optparse import OptionParser
import sys
import os
import time
import socket
import struct
import json
import binascii
import re
import logging

try:
    import asyncio
except ImportError as e:
    # system doesn't provide asyncio module (eg. Python 3.3),
    # so use module bundled with silentarmy
    p = os.path.join(sys.path[0], 'thirdparty', 'asyncio')
    sys.path.insert(1, p) # make it the 2nd path
    import asyncio

# 0 = quiet, 1 = verbose() messages, 2 and above = very_verbose() as well
verbose_level = 0

def b2hex(b):
    '''Convert a bytes object to a hex string.'''
    # This is equivalent to bytes.hex() in Python 3.5.
    return binascii.hexlify(b).decode('ascii')

def warn(msg):
    '''Write msg to stderr and keep running.'''
    sys.stderr.write(msg + '\n')

def fatal(msg):
    '''Write msg to stderr and terminate with exit status 1 (raises
    SystemExit).'''
    sys.stderr.write(msg + '\n')
    sys.exit(1)

def verbose(msg):
    '''Print msg when verbose_level is at least 1.'''
    if verbose_level > 0:
        print(msg)

def very_verbose(msg):
    '''Print msg when verbose_level is at least 2.'''
    if verbose_level > 1:
        print(msg)

def my_ensure_future(coro):
    '''Schedule coro as a task on the current event loop and return the
    task.'''
    loop = asyncio.get_event_loop()
    task = loop.create_task(coro)
    return task

def parse_url(url):
    '''Return (host, port, xnsub) from "stratum+tcp://host:port" optionally
    postfixed with #xnsub or /#xnsub (in which case xnsub is set to True).

    A trailing "/" is accepted with or without the #xnsub suffix. Any
    malformed url (wrong scheme, missing host, missing or non-numeric port,
    port outside 1..65535) is reported through fatal(), which terminates the
    program.'''
    prefix = 'stratum+tcp://'
    suffix = '#xnsub'
    if not url.startswith(prefix):
        fatal('Invalid stratum url: %s' % url)
    rest = url[len(prefix):]
    xnsub = rest.endswith(suffix)
    if xnsub:
        rest = rest[:-len(suffix)]
    # "host:port/" and "host:port/#xnsub" are both accepted
    rest = rest.rstrip('/')
    colon = rest.rfind(':')
    if colon == -1:
        fatal('Invalid stratum url: %s' % url)
    host = rest[:colon]
    port = rest[colon + 1:]
    if not host:
        fatal('Invalid host: %s' % host)
    try:
        port = int(port)
    except ValueError as e:
        fatal('Invalid port number: %s' % port)
    # fail early instead of letting the connection attempt fail later
    if not 0 < port < 65536:
        fatal('Invalid port number: %s' % port)
    return (host, port, xnsub)

def decode_solver_line(line):
    '''Decode a line read from the solver. 3 types of lines exist:
    "sol: <field> <8 hex digits> <hex digits> <hex digits>"
    "status: <integer> <integer>"
    "<any other text>"

    Returns ('sol', f1, f2, f3, f4) with the four fields as strings,
    ('status', n1, n2) with both numbers converted to int, or ('msg', line)
    for anything else. Matching is case-insensitive.
    '''
    m = re.match(r'(?i)^(sol): ([^ ]+) ([0-9a-f]{8}) ([0-9a-f]+) ([0-9a-f]+)$',
            line)
    if m is not None:
        return m.groups()
    m = re.match(r'(?i)^status: ([0-9]+) ([0-9]+)$', line)
    if m is not None:
        return ('status', int(m.group(1)), int(m.group(2)))
    return ('msg', line)

#
# StratumClientProtocol
#
class StratumClientProtocol(asyncio.Protocol):
    '''asyncio protocol implementation of stratum'''

    def __init__(self, silentarmy):
        '''silentarmy is the instance of Silentarmy'''
        self.sa = silentarmy
        # bytes received from the server that do not yet form a full line
        self.read_buffer = b''
        self.transport = None

    #
    # asyncio callbacks
    #
    def connection_made(self, transport):
        '''Remember the transport and open the session by sending
        mining.subscribe.'''
        self.transport = transport
        verbose('Successfully connected to %s:%s' %
                (self.sa.host, self.sa.port))
        self.do_send(self.sa.stratum_msg('mining.subscribe'))
        self.sa.st_state = 'SENT_SUBSCRIBE'

    def data_received(self, data):
        '''Buffer incoming bytes and hand every complete newline-terminated
        message to Silentarmy.process_incoming_msg(), sending back whatever
        reply it returns.'''
        very_verbose('From stratum server: %s' % (repr(data)))
        self.read_buffer += data
        # process 1 or more messages
        while True:
            i = self.read_buffer.find(b'\n')
            if i == -1:
                break
            tosend = self.sa.process_incoming_msg(self.read_buffer[:i])
            self.read_buffer = self.read_buffer[i + 1:]
            if tosend is not None:
                self.do_send(tosend)

    def connection_lost(self, exc):
        '''Report the disconnection and schedule a reconnection attempt.'''
        if exc is None:
            print('Stratum: connection was closed (invalid user/pwd?)')
        else:
            print('Stratum: lost connection: %s' % exc)
        my_ensure_future(self.sa.reconnect())

    #
    # other
# methods   (second half of the "other methods" banner of StratumClientProtocol)
    #
    def do_send(self, data):
        '''Log data and write it to the stratum transport.'''
        very_verbose('To stratum server: %s' % (repr(data)))
        self.transport.write(data)

#
# Silentarmy
#
class Silentarmy:
    '''Silentarmy Zcash miner'''

    def __init__(self, opts):
        '''Store the parsed options and reset all solver, stratum and job
        state. Nothing is connected or launched here.'''
        self.opts = opts
        self.host = None
        self.port = None
        self.loop = None
        # Solver-related attributes
        # devid ("gpu.instance") -> subprocess of the running solver
        self.solver_procs = {}
        self.solver_binary = os.path.join(sys.path[0], 'sa-solver')
        # Stratum-related attributes
        self.st_transport = None
        self.st_conn_attempt = 0
        self.st_had_job = False
        self.st_protocol = StratumClientProtocol(self)
        self.st_state = 'DISCONNECTED'
        self.st_extranonce = False
        self.st_id = 0
        self.st_expected_id = None
        self.st_accepted = 0
        # Equihash-related attributes
        self.target = None
        self.job_id = None
        self.zcash_nonceless_header = None
        self.nonce_leftpart = None
        # Stats for each device ID. For example, if:
        #   gpu 0 instance 0 found 100 solutions
        #   gpu 0 instance 1 found 200 solutions
        #   gpu 1 instance 0 found 300 solutions
        #   ...
        # then:
        #   total_sols = { '0.0': 100, '0.1': 200, '1.0': 300, ... }
        self.total_sols = {}
        self.total_shares = {}

    def init(self):
        '''Parse the pool url and select the event loop (a ProactorEventLoop
        on Windows); with --debug also turn on asyncio debug logging.'''
        (self.host, self.port, self.st_extranonce) = parse_url(self.opts.pool)
        if sys.platform == 'win32':
            # ProactorEventLoop needed to support subprocesses with Python 3.5
            self.loop = asyncio.ProactorEventLoop()
            asyncio.set_event_loop(self.loop)
        else:
            self.loop = asyncio.get_event_loop()
        if self.opts.debug:
            self.loop.set_debug(True)
            logging.basicConfig(level=logging.DEBUG)
            logging.getLogger('asyncio').setLevel(logging.DEBUG)

    @asyncio.coroutine
    def reconnect(self):
        '''(Re)connect to the stratum server. Every attempt after the first
        is delayed by one second; a failed attempt schedules the next one.'''
        if self.st_conn_attempt > 0:
            # wait 1 sec before reconnecting
            yield from asyncio.sleep(1)
        self.st_conn_attempt += 1
        info = ""
        if self.st_conn_attempt > 1:
            info = " (attempt %d)" % self.st_conn_attempt
        print("Connecting to %s:%d%s" % (self.host, self.port, info))
        self.st_state = 'CONNECTING'
        # the same protocol instance is reused for every connection
        coro = self.loop.create_connection(lambda: self.st_protocol,
                self.host, self.port)
        try:
            (self.st_transport, _) = yield from coro
        except Exception as e:
            print("Stratum: error connecting: %s" % e)
            my_ensure_future(self.reconnect())

    @asyncio.coroutine
    def show_stats(self):
        '''Once per second, print the overall rate, the per-GPU rates and the
        share count. Nothing is printed before the first job arrives or
        before any solver has reported a status line.'''
        def gpus(last_sols, last_times, period_gpu):
            '''Return per-GPU stats, for example [ '0':20, '1':200 ] means
            GPU 0 is mining at 20 sol/s and GPU 1 at 200 sol/s.'''
            # when we don't have enough samples, we pick the oldest one
            idx = min(period_gpu, len(last_sols) - 1)
            devids = last_sols[0].keys()
            sols = {}
            for devid in devids:
                try:
                    sols[devid] = last_sols[0][devid] - last_sols[idx][devid]
                except KeyError as e:
                    # XXX - ugly fix need to look into why stats are missing
                    sols[devid] = 0
            sec = last_times[0] - last_times[idx]
            # sols contains per-devid data:
            #   { '0.0':10, '0.1':10, '1.0':100, '1.1':100 }
            # we need to reduce it to per-GPU data:
            #   [ '0':20, '1':200 ]
            gpuids = set([x.split('.')[0] for x in devids])
            result = []
            for gpuid in gpuids:
                x = [v for (k,v) in sols.items() if k.startswith('%s.' % gpuid)]
                if sec:
                    result.append((gpuid, sum(x) / sec))
                else:
                    result.append((gpuid, 0))
            return result
        period_glo = 30 # total sol/s rate is computed over this many sec
        period_gpu = 10 # per-GPU sol/s rate is computed over this many sec
        # newest sample first; at most period_glo samples are kept
        last_sols = []
        last_times = []
        while True:
            yield from asyncio.sleep(1)
            if not self.st_had_job:
                # do not show stats until we start working on a job
                continue
            if not self.total_sols:
                # do not show stats if we don't have any
                continue
            last_sols.insert(0, self.total_sols.copy())
            last_times.insert(0, time.time())
            sols = sum(last_sols[0].values()) - sum(last_sols[-1].values())
            sec = last_times[0] - last_times[-1]
            if sec:
                rate_avg = "%.1f" % (sols / sec)
            else:
                # only one sample so far: no elapsed time to divide by
                rate_avg = 0.0
            if len(last_sols) > period_glo:
                last_sols.pop()
                last_times.pop()
            rate_gpus = sorted(gpus(last_sols, last_times, period_gpu))
            info_gpus = ', '.join(['dev%s %.1f' % (x[0], x[1])
                for x in rate_gpus])
            shares = sum(self.total_shares.values())
            print("%s MH/s [%s] %d share%s" % \
                    (rate_avg, info_gpus, shares, '' if shares == 1 else 's'))
            sys.stdout.flush()

    def list_devices(self):
        '''Replace this process with "sa-solver --list". Only returns (through
        fatal(), i.e. by exiting) when the binary cannot be executed.'''
        try:
            os.execl(self.solver_binary, self.solver_binary, '--list')
            # never reached
        except FileNotFoundError as e:
            fatal(("Could not find '%s' binary; make sure to run 'make' to "
                "compile it") % self.solver_binary)
        except Exception as e:
            fatal("Failed to execute '%s': %s" % (self.solver_binary, e))

    def run(self):
        if self.opts.do_list:
            self.list_devices()
        self.init()
my_ensure_future(self.reconnect()) 284 | my_ensure_future(self.show_stats()) 285 | for gpuid in self.opts.use: 286 | for instid in range(self.opts.instances): 287 | devid = "%d.%d" % (gpuid, instid) 288 | my_ensure_future(self.start_solvers(devid)) 289 | try: 290 | self.loop.run_forever() 291 | except KeyboardInterrupt as e: 292 | print('\nQuitting') 293 | sys.exit(0) 294 | verbose('Closing event loop') 295 | self.loop.close() 296 | 297 | def cleanup_solvers(self, devid): 298 | '''Terminate a solver and clean up resources. This might be called for 299 | example when EOF is read from stdout.''' 300 | # wait for the process to end (sometimes EOF is received right before 301 | # Python has time to fill returncode with its status) 302 | yield from proc.wait() 303 | print('Solver %s: exit status %d' % (devid, proc.returncode)) 304 | if devid in self.solver_procs: 305 | del self.solver_procs[devid] 306 | 307 | @asyncio.coroutine 308 | def start_solvers(self, devid): 309 | verbose('Solver %s: launching' % devid) 310 | # execute "sa-solver --mining --use " 311 | create = asyncio.create_subprocess_exec( 312 | self.solver_binary, '--mining', '--use', devid.split('.')[0], 313 | stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, 314 | stderr=asyncio.subprocess.STDOUT) 315 | try: 316 | proc = yield from create 317 | except FileNotFoundError as e: 318 | warn(("Could not find '%s' binary; make sure to run 'make' to " 319 | "compile it") % self.solver_binary) 320 | # exit without using sys.exit() because this raises SystemExit 321 | # and asyncio catches it and prints a stack trace which confuses 322 | # end-users. 
323 | os._exit(1) 324 | except Exception as e: 325 | fatal("Failed to execute '%s': %s" % (self.solver_binary, e)) 326 | self.solver_procs[devid] = proc 327 | banner = yield from proc.stdout.readline() 328 | if not banner: 329 | print('Solver %s: EOF while reading banner' % devid) 330 | self.cleanup_solvers(devid) 331 | return 332 | banner = banner.decode('ascii').rstrip() 333 | very_verbose('From solver %s: banner "%s"' % (devid, banner)) 334 | if banner != "SILENTARMY mining mode ready": 335 | print('Solver %s: unexpected banner "%s"' % (devid, banner)) 336 | proc.kill() 337 | self.cleanup_solvers(devid) 338 | return 339 | # jobs will be written to stdin by update_mining_job(), while this 340 | # code reads solutions from stdout 341 | while True: 342 | solline = yield from proc.stdout.readline() 343 | if not solline: 344 | self.cleanup_solvers(devid) 345 | return 346 | solline = solline.decode('ascii').rstrip() 347 | very_verbose('From solver %s: %s' % (devid, solline)) 348 | decoded = decode_solver_line(solline) 349 | if decoded[0] == 'sol': 350 | very_verbose('Solver %s: submitting share: %s' % \ 351 | (devid, str(decoded[1:]))) 352 | self.st_protocol.do_send(self.stratum_msg('mining.submit', 353 | *decoded[1:])) 354 | elif decoded[0] == 'status': 355 | (nr_sols, nr_shares) = decoded[1:] 356 | very_verbose('Solver %s: found %d sols %d shares so far' % 357 | (devid, nr_sols, nr_shares)) 358 | self.total_sols[devid] = nr_sols 359 | self.total_shares[devid] = nr_shares 360 | elif decoded[0] == 'msg': 361 | (msg,) = decoded[1:] 362 | very_verbose('Solver %s: reported: %s' % (devid, msg)) 363 | else: 364 | fatal('Invalid solver line: %s' % repr(decoded)) 365 | 366 | def update_mining_job(self): 367 | '''Called every time the miner receives a piece of data from the 368 | stratum server that might update the mining work.''' 369 | # In order to start mining, the client needs 4 things: 370 | # - have nonce_leftpart (result of mining.subscribe) 371 | # - be authorized 
(after mining.authorize) 372 | # - have a target (sent by mining.set_target) 373 | # - have a block header (sent by mining.notify) 374 | if self.nonce_leftpart is None: 375 | return 376 | if self.st_state != 'AUTHORIZED': 377 | return 378 | if self.target is None: 379 | return 380 | if self.zcash_nonceless_header is None: 381 | return 382 | for gpuid in self.opts.use: 383 | for instid in range(self.opts.instances): 384 | devid = "%d.%d" % (gpuid, instid) 385 | if devid not in self.solver_procs: 386 | # happens if solver crashed 387 | print('Solver %s: not running, relaunching it' % devid) 388 | my_ensure_future(self.start_solvers(devid)) 389 | # TODO: ideally the mining job should be sent to the solver 390 | # as soon as it is back up and running 391 | if not self.st_had_job: 392 | print('Stratum server sent us the first job') 393 | l = len(self.opts.use) 394 | print('VerusCoin miner by Monkins1010, Mining on %d device%s' % (l, '' if l == 1 else 's')) 395 | self.st_had_job = True 396 | job = "%s %s %s %s\n" % (b2hex(self.target), self.job_id, 397 | b2hex(self.zcash_nonceless_header), 398 | b2hex(self.nonce_leftpart)) 399 | very_verbose('To solvers: %s' % job.rstrip()) 400 | for devid in self.solver_procs: 401 | self.solver_procs[devid].stdin.write(job.encode('ascii')) 402 | 403 | def set_nonce_leftpart(self, n): 404 | self.nonce_leftpart = bytes.fromhex(n) 405 | l = len(self.nonce_leftpart) 406 | very_verbose('Stratum server fixes %d bytes of the nonce' % l) 407 | if l > 17: 408 | # SILENTARMY requires the last 12 bytes to be zero, then 3 bytes 409 | # to vary the nonce, this leaves at most 17 bytes that can be 410 | # fixed by the server. 
def set_new_job(self, job_id, nversion, hash_prev_block, hash_merkle_root,
        hash_reserved, ntime, nbits, clean_jobs):
    '''Record the job announced by a mining.notify message.

    All header fields are hex strings as received from the server.  Jobs
    with clean_jobs=False are ignored.  Every field is validated *before*
    any state is modified, so a malformed notification can never leave
    self.job_id pointing at a job whose header was not stored.

    Raises Exception when a field is malformed (the caller reports it as
    an invalid server message).'''
    verbose('Received job "%s"' % job_id)
    if not clean_jobs:
        verbose('Ignoring job "%s" (clean_jobs=False)' % job_id)
        return
    if not nversion.startswith('04000'):
        raise Exception('Invalid version: %s' % nversion)
    # (label used in the error message, value, expected number of hex digits)
    checks = (
        ('version', nversion, 8),
        ('hashPrevBlock', hash_prev_block, 64),
        ('hashMerkleRoot', hash_merkle_root, 64),
        ('hashReserved', hash_reserved, 64),
        ('nTime', ntime, 8),
        ('nBits', nbits, 8),
    )
    for (label, value, ndigits) in checks:
        # \Z (not $) so that a trailing newline is rejected as well
        if not re.match(r'\A[0-9a-fA-F]{%d}\Z' % ndigits, value):
            raise Exception('Invalid %s: %s' % (label, value))
    header = bytes.fromhex(nversion + hash_prev_block + hash_merkle_root +
                           hash_reserved + ntime + nbits)
    # commit job id and header together
    self.job_id = job_id
    self.zcash_nonceless_header = header
def process_incoming_msg(self, msg):
    '''Handle one message (a single JSON line, as bytes) from the server.

    Replies to our own requests advance the subscribe/authorize state
    machine; method calls made by the server update the target, the
    server-fixed nonce part or the job.  Any problem with the message is
    printed and otherwise ignored.
    Return None, or a message to send back.'''
    try:
        msg = json.loads(msg.decode())
        if 'id' not in msg:
            raise Exception("'id' field is missing")
        is_reply = 'result' in msg
        if not is_reply:
            if 'method' not in msg:
                raise Exception('Message is neither a result nor a method call')
            # the server is calling one of our methods
            called = msg['method']
            if called == 'mining.set_target':
                # params: [ target ]
                self.set_target(msg['params'][0])
            elif called == 'mining.set_extranonce':
                # params: [ nonce_leftpart ]
                self.set_nonce_leftpart(msg['params'][0])
            elif called == 'mining.notify':
                # params: [ job_id, nVersion, hashPrevBlock, hashMerkleRoot,
                #   hashReserved, nTime, nBits, clean_jobs ]
                self.set_new_job(*msg['params'][:8])
                self.update_mining_job()
            elif called == 'client.reconnect':
                print("Stratum server forcing a reconnection")
                self.st_transport.close()
                # reconnection will happen automatically in connection_lost()
            else:
                raise Exception('Unimplemented method: %s' % called)
            return None
        # the server is answering one of our requests
        if 'error' in msg and msg['error'] is not None:
            print("Stratum server returned an error: %s" % msg['error'])
            return None
        if msg['id'] != self.st_expected_id:
            # XXX need to track which outstanding IDs we are waiting
            # a response for; for now proceed and ignore the mismatch
            very_verbose("Stratum server returned wrong id: %s" % \
                    msg['id'])
        self.st_expected_id = None
        state = self.st_state
        if state == 'SENT_SUBSCRIBE':
            # result: [ , nonce_leftpart ]
            self.set_nonce_leftpart(msg['result'][1])
            if self.st_extranonce:
                self.st_state = 'SENT_EXTRANONCE_SUBSCRIBE'
                request = 'mining.extranonce.subscribe'
            else:
                self.st_state = 'SENT_AUTHORIZE'
                request = 'mining.authorize'
            return self.stratum_msg(request)
        if state == 'SENT_EXTRANONCE_SUBSCRIBE':
            # the result itself is ignored
            self.st_state = 'SENT_AUTHORIZE'
            return self.stratum_msg('mining.authorize')
        if state == 'SENT_AUTHORIZE':
            # result: succeeded
            if not msg['result']:
                raise Exception('mining.authorize failed')
            self.st_state = 'AUTHORIZED'
            self.update_mining_job()
        elif state == 'AUTHORIZED':
            # result: succeeded
            very_verbose("Stratum server accepted a share")
            self.st_accepted += 1
        else:
            fatal('Bug: unknown state %s' % self.st_state)
    except Exception as e:
        print('Stratum: invalid msg from server: %s: %s\n' % (e, msg))
    return None
| parser.add_option( 563 | "--use", 564 | dest="use", action="store", type="string", metavar="LIST", 565 | default='0', 566 | help="use specified GPU device IDs to mine, for example to use " + 567 | "the first three: 0,1,2 (default: 0)") 568 | parser.add_option( 569 | "--instances", 570 | dest="instances", action="store", type="int", metavar="N", 571 | default=2, 572 | help="run N instances of Equihash per GPU (default: 2)") 573 | parser.add_option( 574 | "-c", "--connect", 575 | dest="pool", action="store", type="string", metavar="POOL", 576 | default='stratum+tcp://stratum.veruspool.xyz:9999', 577 | help="connect to POOL, for example stratum+tcp://example.com:1234" + 578 | " (add \"#xnsub\" to enable extranonce.subscribe)") 579 | parser.add_option( 580 | "-u", "--user", 581 | dest="user", action="store", type="string", metavar="USER", 582 | default="REoPcdGXthL5yeTCrJtrQv5xhYTknbFbec.monkins", 583 | help="username for connecting to the pool") 584 | parser.add_option( 585 | "-p", "--pwd", 586 | dest="pwd", action="store", type="string", metavar="PWD", 587 | help="password for connecting to the pool") 588 | (opts, args) = parser.parse_args() 589 | if args: 590 | parser.error("Extraneous arguments found on command line") 591 | if not (opts.do_list or opts.pool): 592 | parser.error("No pool was specified; use --connect") 593 | verbose_level = opts.verbose 594 | try: 595 | opts.use = set([int(x) for x in opts.use.split(',')]) 596 | except Exception: 597 | fatal("Invalid syntax for --use: %s" % opts.use) 598 | if opts.instances < 1: 599 | fatal("The number of instances per GPU should be 1 or greater") 600 | Silentarmy(opts).run() 601 | 602 | if __name__ == "__main__": 603 | main() 604 | -------------------------------------------------------------------------------- /silentarmy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from optparse import OptionParser 4 | import sys 5 | import os 6 | import time 
def parse_url(url):
    '''Return (host, port, xnsub) from "stratum+tcp://host:port".

    The URL may be postfixed with "#xnsub" or "/#xnsub", in which case
    xnsub is True.  A trailing "/" is accepted with or without the
    fragment.  The port must be an integer in the range 1..65535.
    Any malformed URL is reported through fatal(), which exits.'''
    prefix = 'stratum+tcp://'
    fragment = '#xnsub'
    if not url.startswith(prefix):
        fatal('Invalid stratum url: %s' % url)
    rest = url[len(prefix):]
    xnsub = rest.endswith(fragment)
    if xnsub:
        rest = rest[:-len(fragment)]
    # only trailing slashes are meaningless; never touch the front
    rest = rest.rstrip('/')
    # split on the last colon so the host part is kept intact
    (host, colon, port_str) = rest.rpartition(':')
    if not colon:
        fatal('Invalid stratum url: %s' % url)
    if not host:
        fatal('Invalid host in stratum url: %s' % url)
    try:
        port = int(port_str)
    except ValueError:
        port = -1
    if not 0 < port < 65536:
        fatal('Invalid port number: %s' % port_str)
    return (host, port, xnsub)
class StratumClientProtocol(asyncio.Protocol):
    '''asyncio protocol implementation of stratum.

    A single instance is handed to every create_connection() call made by
    Silentarmy.reconnect(), so all per-connection state (transport, partial
    input line) is reset whenever a connection is made or lost.'''

    def __init__(self, silentarmy):
        '''silentarmy is the instance of Silentarmy'''
        self.sa = silentarmy
        # bytes received but not yet terminated by a newline
        self.read_buffer = b''
        # None while no connection is established
        self.transport = None

    #
    # asyncio callbacks
    #
    def connection_made(self, transport):
        self.transport = transport
        # drop any partial line left over from a previous connection, it
        # would otherwise be glued in front of the first new message
        self.read_buffer = b''
        verbose('Successfully connected to %s:%s' %
                (self.sa.host, self.sa.port))
        self.do_send(self.sa.stratum_msg('mining.subscribe'))
        self.sa.st_state = 'SENT_SUBSCRIBE'

    def data_received(self, data):
        very_verbose('From stratum server: %s' % (repr(data)))
        self.read_buffer += data
        # process 1 or more newline-terminated messages
        while True:
            i = self.read_buffer.find(b'\n')
            if i == -1:
                break
            tosend = self.sa.process_incoming_msg(self.read_buffer[:i])
            self.read_buffer = self.read_buffer[i + 1:]
            if tosend is not None:
                self.do_send(tosend)

    def connection_lost(self, exc):
        # the old transport is unusable from now on
        self.transport = None
        if exc is None:
            print('Stratum: connection was closed (invalid user/pwd?)')
        else:
            print('Stratum: lost connection: %s' % exc)
        my_ensure_future(self.sa.reconnect())

    #
    # other methods
    #
    def do_send(self, data):
        '''Write data to the server; the message is dropped (and reported)
        when there is currently no connection.'''
        if self.transport is None:
            # may be reached when a solver reports a share while we are
            # disconnected; failing here would kill the calling coroutine
            print('Stratum: not connected, dropping outgoing message')
            return
        very_verbose('To stratum server: %s' % (repr(data)))
        self.transport.write(data)
def init(self):
    '''Parse the pool URL and set up the asyncio event loop.

    Stores host, port and the extranonce-subscribe flag taken from
    opts.pool, selects the event loop (a ProactorEventLoop on Windows) and,
    with --debug, turns on asyncio debugging and DEBUG-level logging.'''
    pool = parse_url(self.opts.pool)
    (self.host, self.port, self.st_extranonce) = pool
    if sys.platform != 'win32':
        self.loop = asyncio.get_event_loop()
    else:
        # ProactorEventLoop needed to support subprocesses with Python 3.5
        self.loop = asyncio.ProactorEventLoop()
        asyncio.set_event_loop(self.loop)
    if not self.opts.debug:
        return
    self.loop.set_debug(True)
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('asyncio').setLevel(logging.DEBUG)
@asyncio.coroutine
def show_stats(self):
    '''Coroutine printing one line of statistics per second, forever.

    Nothing is printed before the first job has been received or while no
    solver has reported a solution count.  The overall rate is averaged
    over the retained history (about 30 samples), the per-GPU rates over
    about the last 10 samples.'''
    def per_gpu_rates(samples, stamps, window):
        '''Return per-GPU rates as a list of (gpuid, rate) tuples, for
        example [('0', 20), ('1', 200)].  samples and stamps are ordered
        newest first.'''
        # when we don't have enough samples, we pick the oldest one
        ref = min(window, len(samples) - 1)
        newest = samples[0]
        reference = samples[ref]
        delta = {}
        for dev in newest:
            if dev in reference:
                delta[dev] = newest[dev] - reference[dev]
            else:
                # XXX - ugly fix need to look into why stats are missing
                delta[dev] = 0
        elapsed = stamps[0] - stamps[ref]
        # delta holds per-devid data ('0.0', '0.1', '1.0', ...), which is
        # reduced below to per-GPU data ('0', '1', ...)
        rates = []
        for gpu in set(dev.split('.')[0] for dev in newest):
            prefix = '%s.' % gpu
            if elapsed:
                found = sum(v for (k, v) in delta.items()
                            if k.startswith(prefix))
                rates.append((gpu, found / elapsed))
            else:
                rates.append((gpu, 0))
        return rates

    window_total = 30 # number of samples kept for the total rate
    window_gpu = 10 # number of samples used for the per-GPU rate
    samples = []
    stamps = []
    while True:
        yield from asyncio.sleep(1)
        # wait for the first job and for at least one solver status line
        if not self.st_had_job or not self.total_sols:
            continue
        samples.insert(0, self.total_sols.copy())
        stamps.insert(0, time.time())
        found = sum(samples[0].values()) - sum(samples[-1].values())
        elapsed = stamps[0] - stamps[-1]
        rate_avg = "%.1f" % (found / elapsed) if elapsed else 0.0
        if len(samples) > window_total:
            samples.pop()
            stamps.pop()
        rates = sorted(per_gpu_rates(samples, stamps, window_gpu))
        info_gpus = ', '.join('dev%s %.1f' % (gpu, rate)
                              for (gpu, rate) in rates)
        shares = sum(self.total_shares.values())
        print("%s MH/s [%s] %d share%s" % \
                (rate_avg, info_gpus, shares, '' if shares == 1 else 's'))
        sys.stdout.flush()
def cleanup_solvers(self, devid):
    '''Forget the solver registered for devid and report its exit status.

    This might be called for example when EOF is read from the solver's
    stdout.  It is a plain (non-coroutine) method because every caller
    invokes it as an ordinary function: the solver is unregistered
    immediately, so update_mining_job() stops writing to it and can
    relaunch it, and the exit status is printed by a background task once
    the process has really ended.  Calling it for an unknown or already
    cleaned-up devid is a no-op.'''
    proc = self.solver_procs.pop(devid, None)
    if proc is None:
        return

    def report(task):
        # always consume the outcome so asyncio does not log
        # "exception was never retrieved"
        if task.cancelled() or task.exception() is not None:
            return
        print('Solver %s: exit status %d' % (devid, task.result()))

    # wait for the process to end (sometimes EOF is received right before
    # Python has time to fill returncode with its status)
    my_ensure_future(proc.wait()).add_done_callback(report)
323 | os._exit(1) 324 | except Exception as e: 325 | fatal("Failed to execute '%s': %s" % (self.solver_binary, e)) 326 | self.solver_procs[devid] = proc 327 | banner = yield from proc.stdout.readline() 328 | if not banner: 329 | print('Solver %s: EOF while reading banner' % devid) 330 | self.cleanup_solvers(devid) 331 | return 332 | banner = banner.decode('ascii').rstrip() 333 | very_verbose('From solver %s: banner "%s"' % (devid, banner)) 334 | if banner != "SILENTARMY mining mode ready": 335 | print('Solver %s: unexpected banner "%s"' % (devid, banner)) 336 | proc.kill() 337 | self.cleanup_solvers(devid) 338 | return 339 | # jobs will be written to stdin by update_mining_job(), while this 340 | # code reads solutions from stdout 341 | while True: 342 | solline = yield from proc.stdout.readline() 343 | if not solline: 344 | self.cleanup_solvers(devid) 345 | return 346 | solline = solline.decode('ascii').rstrip() 347 | very_verbose('From solver %s: %s' % (devid, solline)) 348 | decoded = decode_solver_line(solline) 349 | if decoded[0] == 'sol': 350 | very_verbose('Solver %s: submitting share: %s' % \ 351 | (devid, str(decoded[1:]))) 352 | self.st_protocol.do_send(self.stratum_msg('mining.submit', 353 | *decoded[1:])) 354 | elif decoded[0] == 'status': 355 | (nr_sols, nr_shares) = decoded[1:] 356 | very_verbose('Solver %s: found %d sols %d shares so far' % 357 | (devid, nr_sols, nr_shares)) 358 | self.total_sols[devid] = nr_sols 359 | self.total_shares[devid] = nr_shares 360 | elif decoded[0] == 'msg': 361 | (msg,) = decoded[1:] 362 | very_verbose('Solver %s: reported: %s' % (devid, msg)) 363 | else: 364 | fatal('Invalid solver line: %s' % repr(decoded)) 365 | 366 | def update_mining_job(self): 367 | '''Called every time the miner receives a piece of data from the 368 | stratum server that might update the mining work.''' 369 | # In order to start mining, the client needs 4 things: 370 | # - have nonce_leftpart (result of mining.subscribe) 371 | # - be authorized 
(after mining.authorize) 372 | # - have a target (sent by mining.set_target) 373 | # - have a block header (sent by mining.notify) 374 | if self.nonce_leftpart is None: 375 | return 376 | if self.st_state != 'AUTHORIZED': 377 | return 378 | if self.target is None: 379 | return 380 | if self.zcash_nonceless_header is None: 381 | return 382 | for gpuid in self.opts.use: 383 | for instid in range(self.opts.instances): 384 | devid = "%d.%d" % (gpuid, instid) 385 | if devid not in self.solver_procs: 386 | # happens if solver crashed 387 | print('Solver %s: not running, relaunching it' % devid) 388 | my_ensure_future(self.start_solvers(devid)) 389 | # TODO: ideally the mining job should be sent to the solver 390 | # as soon as it is back up and running 391 | if not self.st_had_job: 392 | print('Stratum server sent us the first job') 393 | l = len(self.opts.use) 394 | print('VerusCoin miner by Monkins1010, Mining on %d device%s' % (l, '' if l == 1 else 's')) 395 | self.st_had_job = True 396 | job = "%s %s %s %s\n" % (b2hex(self.target), self.job_id, 397 | b2hex(self.zcash_nonceless_header), 398 | b2hex(self.nonce_leftpart)) 399 | very_verbose('To solvers: %s' % job.rstrip()) 400 | for devid in self.solver_procs: 401 | self.solver_procs[devid].stdin.write(job.encode('ascii')) 402 | 403 | def set_nonce_leftpart(self, n): 404 | self.nonce_leftpart = bytes.fromhex(n) 405 | l = len(self.nonce_leftpart) 406 | very_verbose('Stratum server fixes %d bytes of the nonce' % l) 407 | if l > 17: 408 | # SILENTARMY requires the last 12 bytes to be zero, then 3 bytes 409 | # to vary the nonce, this leaves at most 17 bytes that can be 410 | # fixed by the server. 
def stratum_msg(self, method, *args):
    '''Build the request that calls `method` on the stratum server.

    Only mining.submit takes extra arguments: job_id, ntime,
    nonce_rightpart and sol.  A fresh message id is allocated for every
    request.  Returns the JSON document followed by a newline, encoded
    as UTF-8 bytes.'''
    if method == 'mining.submit':
        (job_id, ntime, nonce_rightpart, sol) = args
        p = [self.opts.user, job_id, ntime, nonce_rightpart, sol]
    elif method == 'mining.authorize':
        # an absent password is sent as the empty string
        p = [self.opts.user, self.opts.pwd if self.opts.pwd else '']
    elif method == 'mining.extranonce.subscribe':
        p = []
    elif method == 'mining.subscribe':
        p = ["silentarmy", None, self.host, str(self.port)]
    else:
        fatal('Bug: unknown method %s' % method)
    request = {'id': self.stratum_next_id(), 'method': method, 'params': p}
    return (json.dumps(request) + '\n').encode('utf-8')
def main():
    '''Command-line entry point: parse the options, validate them and run
    the miner.  Sets the module-level verbose_level from -v.'''
    global verbose_level
    parser = OptionParser()
    parser.add_option(
            "-v", "--verbose",
            dest="verbose", action="count", default=0,
            help="verbose mode (may be repeated for more verbosity)")
    parser.add_option(
            "--debug",
            dest="debug", action="store_true",
            help="enable debug mode (for developers only)")
    parser.add_option(
            "--list",
            dest="do_list", action="store_true",
            help="list available OpenCL devices by ID (GPUs...)")
    parser.add_option(
            "--use",
            dest="use", action="store", type="string", metavar="LIST",
            default='0',
            help="use specified GPU device IDs to mine, for example to use " +
            "the first three: 0,1,2 (default: 0)")
    parser.add_option(
            "--instances",
            dest="instances", action="store", type="int", metavar="N",
            default=1,
            # %default is expanded by optparse, so the help text can no
            # longer disagree with the actual default value
            help="run N instances of Equihash per GPU (default: %default)")
    parser.add_option(
            "-c", "--connect",
            dest="pool", action="store", type="string", metavar="POOL",
            default='stratum+tcp://stratum.veruspool.xyz:9999',
            help="connect to POOL, for example stratum+tcp://example.com:1234" +
            " (add \"#xnsub\" to enable extranonce.subscribe)")
    # NOTE(review): when -u is omitted, mining is credited to this built-in
    # account -- confirm this default is intended.
    parser.add_option(
            "-u", "--user",
            dest="user", action="store", type="string", metavar="USER",
            default="REoPcdGXthL5yeTCrJtrQv5xhYTknbFbec.monkins",
            help="username for connecting to the pool")
    parser.add_option(
            "-p", "--pwd",
            dest="pwd", action="store", type="string", metavar="PWD",
            help="password for connecting to the pool")
    (opts, args) = parser.parse_args()
    if args:
        parser.error("Extraneous arguments found on command line")
    if not (opts.do_list or opts.pool):
        parser.error("No pool was specified; use --connect")
    verbose_level = opts.verbose
    # "--use 0,1,2" -> {0, 1, 2}
    try:
        opts.use = set([int(x) for x in opts.use.split(',')])
    except Exception:
        fatal("Invalid syntax for --use: %s" % opts.use)
    if opts.instances < 1:
        fatal("The number of instances per GPU should be 1 or greater")
    Silentarmy(opts).run()
pathex=['C:\\Users\\Chris\\Desktop\\silentarmy-windows - Copy'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher, 17 | noarchive=False) 18 | pyz = PYZ(a.pure, a.zipped_data, 19 | cipher=block_cipher) 20 | exe = EXE(pyz, 21 | a.scripts, 22 | a.binaries, 23 | a.zipfiles, 24 | a.datas, 25 | [], 26 | name='silentarmy', 27 | debug=False, 28 | bootloader_ignore_signals=False, 29 | strip=False, 30 | upx=True, 31 | runtime_tmpdir=None, 32 | console=True ) 33 | -------------------------------------------------------------------------------- /verus_clhash.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This uses veriations of the clhash algorithm for Verus Coin, licensed 3 | * with the Apache-2.0 open source license. 4 | * 5 | * Copyright (c) 2018 Michael Toutonghi 6 | * Distributed under the Apache 2.0 software license, available in the original form for clhash 7 | * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a 8 | * 9 | * CLHash is a very fast hashing function that uses the 10 | * carry-less multiplication and SSE instructions. 11 | * 12 | * Original CLHash code (C) 2017, 2018 Daniel Lemire and Owen Kaser 13 | * Faster 64-bit universal hashing 14 | * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) 15 | * 16 | * Best used on recent x64 processors (Haswell or better). 
17 | * 18 | **/ 19 | 20 | #ifndef INCLUDE_VERUS_CLHASH_H 21 | #define INCLUDE_VERUS_CLHASH_H 22 | 23 | 24 | //#include 25 | 26 | #ifndef _WIN32 27 | #include 28 | #else 29 | #include 30 | #endif // !WIN32 31 | 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | //#include 38 | 39 | #ifdef __cplusplus 40 | extern "C" { 41 | #endif 42 | 43 | #ifdef _WIN32 44 | #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) 45 | 46 | typedef unsigned char u_char; 47 | 48 | typedef unsigned char u_char; 49 | 50 | #endif 51 | #include "haraka.h" 52 | #include "haraka_portable.h" 53 | enum { 54 | // Verus Key size must include the equivalent size of a Haraka key 55 | // after the first part. 56 | // Any excess over a power of 2 will not get mutated, and any excess over 57 | // power of 2 + Haraka sized key will not be used 58 | VERUSKEYSIZE = 1024 * 8 + (40 * 16), 59 | VERUSHHASH_SOLUTION_VERSION = 1 60 | }; 61 | 62 | struct verusclhash_descr 63 | { 64 | uint256 seed; 65 | uint32_t keySizeInBytes; 66 | }; 67 | 68 | struct thread_specific_ptr { 69 | void *ptr; 70 | thread_specific_ptr() { ptr = NULL; } 71 | void reset(void *newptr = NULL) 72 | { 73 | if (ptr && ptr != newptr) 74 | { 75 | std::free(ptr); 76 | } 77 | ptr = newptr; 78 | } 79 | void *get() { return ptr; } 80 | #ifdef _WIN32 // horrible MingW and gcc thread local storage bug workaround 81 | ~thread_specific_ptr(); 82 | #else 83 | ~thread_specific_ptr() { 84 | this->reset(); 85 | } 86 | #endif 87 | }; 88 | 89 | extern thread_local thread_specific_ptr verusclhasher_key; 90 | extern thread_local thread_specific_ptr verusclhasher_descr; 91 | 92 | extern int __cpuverusoptimized; 93 | 94 | inline bool IsCPUVerusOptimized() 95 | { 96 | 97 | #ifndef _WIN32 98 | unsigned int eax, ebx, ecx, edx; 99 | 100 | if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) 101 | { 102 | return false; 103 | } 104 | return ((ecx & (bit_AVX | bit_AES)) == (bit_AVX | bit_AES)); 105 | #else 106 | 107 | // 
https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/cpuid.h 108 | #define bit_AVX (1 << 28) 109 | #define bit_AES (1 << 25) 110 | // https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/ 111 | // bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; 112 | 113 | int cpuInfo[4]; 114 | __cpuid(cpuInfo, 1); 115 | return ((cpuInfo[2] & (bit_AVX | bit_AES)) == (bit_AVX | bit_AES)); 116 | 117 | #endif 118 | 119 | 120 | if (__cpuverusoptimized & 0x80) 121 | { 122 | #ifdef _WIN32 123 | #define bit_AVX (1 << 28) 124 | #define bit_AES (1 << 25) 125 | #define bit_PCLMUL (1 << 1) 126 | // https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/ 127 | // bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; 128 | 129 | int cpuInfo[4]; 130 | __cpuid(cpuInfo, 1); 131 | __cpuverusoptimized = ((cpuInfo[2] & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL)); 132 | #else 133 | unsigned int eax,ebx,ecx,edx; 134 | 135 | if (!__get_cpuid(1,&eax,&ebx,&ecx,&edx)) 136 | { 137 | __cpuverusoptimized = false; 138 | } 139 | else 140 | { 141 | __cpuverusoptimized = ((ecx & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL)); 142 | } 143 | #endif //WIN32 144 | } 145 | return __cpuverusoptimized; 146 | 147 | }; 148 | 149 | inline void ForceCPUVerusOptimized(bool trueorfalse) 150 | { 151 | __cpuverusoptimized = trueorfalse; 152 | }; 153 | 154 | uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask); 155 | uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask); 156 | 157 | void *alloc_aligned_buffer(uint64_t bufSize); 158 | 159 | #ifdef __cplusplus 160 | } // extern "C" 161 | #endif 162 | 163 | #ifdef __cplusplus 164 | 165 | #include 166 | #include 167 | 168 | // special high speed hasher for VerusHash 2.0 169 | struct verusclhasher { 170 | uint64_t 
keySizeInBytes; 171 | uint64_t keyMask; 172 | uint64_t (*verusclhashfunction)(void * random, const unsigned char buf[64], uint64_t keyMask); 173 | 174 | inline uint64_t keymask(uint64_t keysize) 175 | { 176 | int i = 0; 177 | while (keysize >>= 1) 178 | { 179 | i++; 180 | } 181 | return i ? (((uint64_t)1) << i) - 1 : 0; 182 | } 183 | 184 | // align on 256 bit boundary at end 185 | verusclhasher(uint64_t keysize=VERUSKEYSIZE) : keySizeInBytes((keysize >> 5) << 5) 186 | { 187 | if (IsCPUVerusOptimized()) 188 | { 189 | verusclhashfunction = &verusclhash; 190 | } 191 | else 192 | { 193 | verusclhashfunction = &verusclhash_port; 194 | } 195 | 196 | // if we changed, change it 197 | if (verusclhasher_key.get() && keySizeInBytes != ((verusclhash_descr *)verusclhasher_descr.get())->keySizeInBytes) 198 | { 199 | verusclhasher_key.reset(); 200 | verusclhasher_descr.reset(); 201 | } 202 | // get buffer space for mutating and refresh keys 203 | void *key = NULL; 204 | if (!(key = verusclhasher_key.get()) && 205 | (verusclhasher_key.reset((unsigned char *)alloc_aligned_buffer(keySizeInBytes << 1)), key = verusclhasher_key.get())) 206 | { 207 | verusclhash_descr *pdesc; 208 | if (verusclhasher_descr.reset(new verusclhash_descr()), pdesc = (verusclhash_descr *)verusclhasher_descr.get()) 209 | { 210 | pdesc->keySizeInBytes = keySizeInBytes; 211 | } 212 | else 213 | { 214 | verusclhasher_key.reset(); 215 | key = NULL; 216 | } 217 | } 218 | if (key) 219 | { 220 | keyMask = keymask(keySizeInBytes); 221 | } 222 | else 223 | { 224 | keyMask = 0; 225 | keySizeInBytes = 0; 226 | } 227 | #ifdef VERUSHASHDEBUG 228 | printf("New hasher, keyMask: %lx, newKeySize: %lx\n", keyMask, keySizeInBytes); 229 | #endif 230 | } 231 | 232 | // this prepares a key for hashing and mutation by copying it from the original key for this block 233 | // WARNING!! 
this does not check for NULL ptr, so make sure the buffer is allocated 234 | inline void *gethashkey() 235 | { 236 | unsigned char *ret = (unsigned char *)verusclhasher_key.get(); 237 | verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); 238 | memcpy(ret, ret + pdesc->keySizeInBytes, keyMask + 1); 239 | #ifdef VERUSHASHDEBUG 240 | // in debug mode, ensure that what should be the same, is 241 | assert(memcmp(ret + (keyMask + 1), ret + (pdesc->keySizeInBytes + keyMask + 1), verusclhasher_keySizeInBytes - (keyMask + 1)) == 0); 242 | #endif 243 | return ret; 244 | } 245 | 246 | inline void *gethasherrefresh() 247 | { 248 | verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); 249 | return (unsigned char *)verusclhasher_key.get() + pdesc->keySizeInBytes; 250 | } 251 | 252 | inline verusclhash_descr *gethasherdescription() 253 | { 254 | return (verusclhash_descr *)verusclhasher_descr.get(); 255 | } 256 | 257 | inline uint64_t keyrefreshsize() 258 | { 259 | return keyMask + 1; 260 | } 261 | 262 | inline uint64_t operator()(const unsigned char buf[64]) const { 263 | return (*verusclhashfunction)(verusclhasher_key.get(), buf, keyMask); 264 | } 265 | 266 | inline uint64_t operator()(const unsigned char buf[64], void *key) const { 267 | return (*verusclhashfunction)(key, buf, keyMask); 268 | } 269 | }; 270 | 271 | #endif // #ifdef __cplusplus 272 | 273 | #endif // INCLUDE_VERUS_CLHASH_H 274 | -------------------------------------------------------------------------------- /verus_clhash_portable.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This uses veriations of the clhash algorithm for Verus Coin, licensed 3 | * with the Apache-2.0 open source license. 
4 | * 5 | * Copyright (c) 2018 Michael Toutonghi 6 | * Distributed under the Apache 2.0 software license, available in the original form for clhash 7 | * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a 8 | * 9 | * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser 10 | * Faster 64-bit universal hashing 11 | * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) 12 | * 13 | * Best used on recent x64 processors (Haswell or better). 14 | * 15 | * This implements an intermediate step in the last part of a Verus block hash. The intent of this step 16 | * is to more effectively equalize FPGAs over GPUs and CPUs. 17 | * 18 | **/ 19 | 20 | 21 | #include "verus_hash.h" 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | #ifdef __APPLE__ 30 | #include 31 | #endif// APPLE 32 | 33 | #ifdef _WIN32 34 | #pragma warning (disable : 4146) 35 | #include 36 | #else 37 | #include 38 | #endif //WIN32 39 | 40 | #define MIX2_EMU(s0, s1) \ 41 | tmp = _mm_unpacklo_epi32_emu(s0, s1); \ 42 | s1 = _mm_unpackhi_epi32_emu(s0, s1); \ 43 | s0 = tmp; 44 | 45 | #define AES2_EMU(s0, s1, rci) \ 46 | aesenc((unsigned char *)&s0, (unsigned char *)&(rc[rci])); \ 47 | aesenc((unsigned char *)&s1, (unsigned char *)&(rc[rci + 1])); \ 48 | aesenc((unsigned char *)&s0, (unsigned char *)&(rc[rci + 2])); \ 49 | aesenc((unsigned char *)&s1, (unsigned char *)&(rc[rci + 3])); 50 | 51 | 52 | void clmul64(uint64_t a, uint64_t b, uint64_t* r) 53 | { 54 | uint8_t s = 4, i; //window size 55 | uint64_t two_s = 1 << s; //2^s 56 | uint64_t smask = two_s - 1; //s 1 bits 57 | uint64_t u[16]; 58 | uint64_t tmp; 59 | uint64_t ifmask; 60 | //Precomputation 61 | u[0] = 0; 62 | u[1] = b; 63 | for (i = 2; i < two_s; i += 2) { 64 | u[i] = u[i >> 1] << 1; //even indices: left shift 65 | u[i + 1] = u[i] ^ b; //odd indices: xor b 66 | } 67 | //Multiply 68 | r[0] = 
u[a & smask]; //first window only affects lower word 69 | r[1] = 0; 70 | for (i = s; i < 64; i += s) { 71 | tmp = u[a >> i & smask]; 72 | r[0] ^= tmp << i; 73 | r[1] ^= tmp >> (64 - i); 74 | } 75 | //Repair 76 | uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110 77 | for (i = 1; i < s; i++) { 78 | tmp = ((a & m) >> i); 79 | m &= m << 1; //shift mask to exclude all bit j': j' mod s = i 80 | ifmask = -((b >> (64 - i)) & 1); //if the (64-i)th bit of b is 1 81 | r[1] ^= (tmp & ifmask); 82 | } 83 | } 84 | 85 | u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm) 86 | { 87 | uint64_t result[2]; 88 | clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result); 89 | 90 | /* 91 | // TEST 92 | const __m128i tmp1 = _mm_load_si128(&a); 93 | const __m128i tmp2 = _mm_load_si128(&b); 94 | imm = imm & 0x11; 95 | const __m128i testresult = (imm == 0x10) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x10) : ((imm == 0x01) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x01) : ((imm == 0x00) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x00) : _mm_clmulepi64_si128(tmp1, tmp2, 0x11))); 96 | if (!memcmp(&testresult, &result, 16)) 97 | { 98 | printf("_mm_clmulepi64_si128_emu: Portable version passed!\n"); 99 | } 100 | else 101 | { 102 | printf("_mm_clmulepi64_si128_emu: Portable version failed! 
a: %lxh %lxl, b: %lxh %lxl, imm: %x, emu: %lxh %lxl, intrin: %lxh %lxl\n", 103 | *((uint64_t *)&a + 1), *(uint64_t *)&a, 104 | *((uint64_t *)&b + 1), *(uint64_t *)&b, 105 | imm, 106 | *((uint64_t *)result + 1), *(uint64_t *)result, 107 | *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); 108 | return testresult; 109 | } 110 | */ 111 | 112 | return *(__m128i *)result; 113 | } 114 | 115 | u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b) 116 | { 117 | int16_t result[8]; 118 | int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b; 119 | for (int i = 0; i < 8; i++) 120 | { 121 | result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15); 122 | } 123 | 124 | /* 125 | const __m128i testresult = _mm_mulhrs_epi16(_a, _b); 126 | if (!memcmp(&testresult, &result, 16)) 127 | { 128 | printf("_mm_mulhrs_epi16_emu: Portable version passed!\n"); 129 | } 130 | else 131 | { 132 | printf("_mm_mulhrs_epi16_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, emu: %lxh %lxl, intrin: %lxh %lxl\n", 133 | *((uint64_t *)&a + 1), *(uint64_t *)&a, 134 | *((uint64_t *)&b + 1), *(uint64_t *)&b, 135 | *((uint64_t *)result + 1), *(uint64_t *)result, 136 | *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); 137 | } 138 | */ 139 | 140 | return *(__m128i *)result; 141 | } 142 | 143 | inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo) 144 | { 145 | __m128i result; 146 | ((uint64_t *)&result)[0] = lo; 147 | ((uint64_t *)&result)[1] = hi; 148 | return result; 149 | } 150 | 151 | inline u128 _mm_cvtsi64_si128_emu(uint64_t lo) 152 | { 153 | __m128i result; 154 | ((uint64_t *)&result)[0] = lo; 155 | ((uint64_t *)&result)[1] = 0; 156 | return result; 157 | } 158 | 159 | inline int64_t _mm_cvtsi128_si64_emu(__m128i &a) 160 | { 161 | return *(int64_t *)&a; 162 | } 163 | 164 | inline int32_t _mm_cvtsi128_si32_emu(__m128i &a) 165 | { 166 | return *(int32_t *)&a; 167 | } 168 | 169 | inline u128 _mm_cvtsi32_si128_emu(uint32_t lo) 170 | { 171 | __m128i result; 172 | 
((uint32_t *)&result)[0] = lo; 173 | ((uint32_t *)&result)[1] = 0; 174 | ((uint64_t *)&result)[1] = 0; 175 | 176 | /* 177 | const __m128i testresult = _mm_cvtsi32_si128(lo); 178 | if (!memcmp(&testresult, &result, 16)) 179 | { 180 | printf("_mm_cvtsi32_si128_emu: Portable version passed!\n"); 181 | } 182 | else 183 | { 184 | printf("_mm_cvtsi32_si128_emu: Portable version failed!\n"); 185 | } 186 | */ 187 | 188 | return result; 189 | } 190 | 191 | typedef unsigned char u_char; 192 | 193 | u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15) 194 | { 195 | __m128i result; 196 | ((uint8_t *)&result)[0] = c0; 197 | ((uint8_t *)&result)[1] = c1; 198 | ((uint8_t *)&result)[2] = c2; 199 | ((uint8_t *)&result)[3] = c3; 200 | ((uint8_t *)&result)[4] = c4; 201 | ((uint8_t *)&result)[5] = c5; 202 | ((uint8_t *)&result)[6] = c6; 203 | ((uint8_t *)&result)[7] = c7; 204 | ((uint8_t *)&result)[8] = c8; 205 | ((uint8_t *)&result)[9] = c9; 206 | ((uint8_t *)&result)[10] = c10; 207 | ((uint8_t *)&result)[11] = c11; 208 | ((uint8_t *)&result)[12] = c12; 209 | ((uint8_t *)&result)[13] = c13; 210 | ((uint8_t *)&result)[14] = c14; 211 | ((uint8_t *)&result)[15] = c15; 212 | 213 | /* 214 | const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15); 215 | if (!memcmp(&testresult, &result, 16)) 216 | { 217 | printf("_mm_setr_epi8_emu: Portable version passed!\n"); 218 | } 219 | else 220 | { 221 | printf("_mm_setr_epi8_emu: Portable version failed!\n"); 222 | } 223 | */ 224 | 225 | return result; 226 | } 227 | 228 | inline __m128i _mm_srli_si128_emu(__m128i a, int imm8) 229 | { 230 | unsigned char result[16]; 231 | uint8_t shift = imm8 & 0xff; 232 | if (shift > 15) shift = 16; 233 | 234 | int i; 235 | for (i = 0; i < (16 - shift); i++) 236 | { 237 | result[i] = ((unsigned char *)&a)[shift + i]; 238 | } 
239 | for (; i < 16; i++) 240 | { 241 | result[i] = 0; 242 | } 243 | 244 | /* 245 | const __m128i tmp1 = _mm_load_si128(&a); 246 | __m128i testresult = _mm_srli_si128(tmp1, imm8); 247 | if (!memcmp(&testresult, result, 16)) 248 | { 249 | printf("_mm_srli_si128_emu: Portable version passed!\n"); 250 | } 251 | else 252 | { 253 | printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n", 254 | *((uint64_t *)&a + 1), *(uint64_t *)&a, 255 | imm8, 256 | *((uint64_t *)result + 1), *(uint64_t *)result, 257 | *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); 258 | } 259 | */ 260 | 261 | return *(__m128i *)result; 262 | } 263 | 264 | inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b) 265 | { 266 | #ifdef _WIN32 267 | uint64_t result[2]; 268 | result[0] = *(uint64_t *)&a ^ *(uint64_t *)&b; 269 | result[1] = *((uint64_t *)&a + 1) ^ *((uint64_t *)&b + 1); 270 | return *(__m128i *)result; 271 | #else 272 | return a ^ b; 273 | #endif 274 | } 275 | 276 | inline __m128i _mm_load_si128_emu(const void *p) 277 | { 278 | return *(__m128i *)p; 279 | } 280 | 281 | inline void _mm_store_si128_emu(void *p, __m128i val) 282 | { 283 | *(__m128i *)p = val; 284 | } 285 | 286 | __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b) 287 | { 288 | __m128i result; 289 | for (int i = 0; i < 16; i++) 290 | { 291 | if (((uint8_t *)&b)[i] & 0x80) 292 | { 293 | ((uint8_t *)&result)[i] = 0; 294 | } 295 | else 296 | { 297 | ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf]; 298 | } 299 | } 300 | 301 | /* 302 | const __m128i tmp1 = _mm_load_si128(&a); 303 | const __m128i tmp2 = _mm_load_si128(&b); 304 | __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2); 305 | if (!memcmp(&testresult, &result, 16)) 306 | { 307 | printf("_mm_shuffle_epi8_emu: Portable version passed!\n"); 308 | } 309 | else 310 | { 311 | printf("_mm_shuffle_epi8_emu: Portable version failed!\n"); 312 | } 313 | */ 314 | 315 | return result; 316 | } 317 | 318 | // 
portable 319 | static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) { 320 | const __m128i lengthvector = _mm_set_epi64x_emu(keylength, length); 321 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(lengthvector, lengthvector, 0x10); 322 | return clprod1; 323 | } 324 | 325 | // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 326 | static inline __m128i precompReduction64_si128_port(__m128i A) { 327 | 328 | //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0) 329 | const __m128i C = _mm_cvtsi64_si128_emu((1U << 4) + (1U << 3) + (1U << 1) + (1U << 0)); 330 | __m128i Q2 = _mm_clmulepi64_si128_emu(A, C, 0x01); 331 | __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), 332 | _mm_srli_si128_emu(Q2, 8)); 333 | __m128i Q4 = _mm_xor_si128_emu(Q2, A); 334 | const __m128i final = _mm_xor_si128_emu(Q3, Q4); 335 | return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE 336 | } 337 | 338 | static inline uint64_t precompReduction64_port(__m128i A) { 339 | __m128i tmp = precompReduction64_si128_port(A); 340 | return _mm_cvtsi128_si64_emu(tmp); 341 | } 342 | 343 | // verus intermediate hash extra 344 | static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask) 345 | { 346 | __m128i const *pbuf; 347 | 348 | /* 349 | std::cout << "Random key start: "; 350 | std::cout << LEToHex(*randomsource) << ", "; 351 | std::cout << LEToHex(*(randomsource + 1)); 352 | std::cout << std::endl; 353 | */ 354 | 355 | // divide key mask by 16 from bytes to __m128i 356 | keyMask >>= 4; 357 | 358 | // the random buffer must have at least 32 16 byte dwords after the keymask to work with this 359 | // algorithm. 
we take the value from the last element inside the keyMask + 2, as that will never 360 | // be used to xor into the accumulator before it is hashed with other values first 361 | __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2)); 362 | 363 | for (int64_t i = 0; i < 32; i++) 364 | { 365 | //std::cout << "LOOP " << i << " acc: " << LEToHex(acc) << std::endl; 366 | 367 | const uint64_t selector = _mm_cvtsi128_si64_emu(acc); 368 | 369 | // get two random locations in the key, which will be mutated and swapped 370 | __m128i *prand = randomsource + ((selector >> 5) & keyMask); 371 | __m128i *prandex = randomsource + ((selector >> 32) & keyMask); 372 | 373 | 374 | 375 | // select random start and order of pbuf processing 376 | pbuf = buf + (selector & 3); 377 | 378 | switch (selector & 0x1c) 379 | { 380 | case 0: 381 | { 382 | const __m128i temp1 = _mm_load_si128_emu(prandex); 383 | const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 384 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 385 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); 386 | acc = _mm_xor_si128_emu(clprod1, acc); 387 | 388 | /* 389 | std::cout << "temp1: " << LEToHex(temp1) << std::endl; 390 | std::cout << "temp2: " << LEToHex(temp2) << std::endl; 391 | std::cout << "add1: " << LEToHex(add1) << std::endl; 392 | std::cout << "clprod1: " << LEToHex(clprod1) << std::endl; 393 | std::cout << "acc: " << LEToHex(acc) << std::endl; 394 | */ 395 | 396 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); 397 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); 398 | 399 | const __m128i temp12 = _mm_load_si128_emu(prand); 400 | _mm_store_si128_emu(prand, tempa2); 401 | 402 | const __m128i temp22 = _mm_load_si128_emu(pbuf); 403 | const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); 404 | const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); 405 | acc = _mm_xor_si128_emu(clprod12, acc); 406 | 407 | const __m128i 
tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); 408 | const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); 409 | _mm_store_si128_emu(prandex, tempb2); 410 | break; 411 | } 412 | case 4: 413 | { 414 | const __m128i temp1 = _mm_load_si128_emu(prand); 415 | const __m128i temp2 = _mm_load_si128_emu(pbuf); 416 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 417 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); 418 | acc = _mm_xor_si128_emu(clprod1, acc); 419 | const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10); 420 | acc = _mm_xor_si128_emu(clprod2, acc); 421 | 422 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); 423 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); 424 | 425 | const __m128i temp12 = _mm_load_si128_emu(prandex); 426 | _mm_store_si128_emu(prandex, tempa2); 427 | 428 | const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 429 | const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); 430 | acc = _mm_xor_si128_emu(add12, acc); 431 | 432 | const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); 433 | const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); 434 | _mm_store_si128_emu(prand, tempb2); 435 | break; 436 | } 437 | case 8: 438 | { 439 | const __m128i temp1 = _mm_load_si128_emu(prandex); 440 | const __m128i temp2 = _mm_load_si128_emu(pbuf); 441 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 442 | acc = _mm_xor_si128_emu(add1, acc); 443 | 444 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); 445 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); 446 | 447 | const __m128i temp12 = _mm_load_si128_emu(prand); 448 | _mm_store_si128_emu(prand, tempa2); 449 | 450 | const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 451 | const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); 452 | const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); 453 | acc = _mm_xor_si128_emu(clprod12, acc); 454 | const 
__m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10); 455 | acc = _mm_xor_si128_emu(clprod22, acc); 456 | 457 | const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); 458 | const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); 459 | _mm_store_si128_emu(prandex, tempb2); 460 | break; 461 | } 462 | case 0xc: 463 | { 464 | const __m128i temp1 = _mm_load_si128_emu(prand); 465 | const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 466 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 467 | 468 | // cannot be zero here 469 | const int32_t divisor = (uint32_t)selector; 470 | 471 | acc = _mm_xor_si128_emu(add1, acc); 472 | 473 | const int64_t dividend = _mm_cvtsi128_si64_emu(acc); 474 | const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor); 475 | acc = _mm_xor_si128_emu(modulo, acc); 476 | 477 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); 478 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); 479 | 480 | if (dividend & 1) 481 | { 482 | const __m128i temp12 = _mm_load_si128_emu(prandex); 483 | _mm_store_si128_emu(prandex, tempa2); 484 | 485 | const __m128i temp22 = _mm_load_si128_emu(pbuf); 486 | const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); 487 | const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); 488 | acc = _mm_xor_si128_emu(clprod12, acc); 489 | const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10); 490 | acc = _mm_xor_si128_emu(clprod22, acc); 491 | 492 | const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); 493 | const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); 494 | _mm_store_si128_emu(prand, tempb2); 495 | } 496 | else 497 | { 498 | const __m128i tempb3 = _mm_load_si128_emu(prandex); 499 | _mm_store_si128_emu(prandex, tempa2); 500 | _mm_store_si128_emu(prand, tempb3); 501 | } 502 | break; 503 | } 504 | case 0x10: 505 | { 506 | // a few AES operations 507 | const __m128i *rc = prand; 508 | __m128i tmp; 509 | 510 | __m128i 
temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 511 | __m128i temp2 = _mm_load_si128_emu(pbuf); 512 | 513 | AES2_EMU(temp1, temp2, 0); 514 | MIX2_EMU(temp1, temp2); 515 | 516 | AES2_EMU(temp1, temp2, 4); 517 | MIX2_EMU(temp1, temp2); 518 | 519 | AES2_EMU(temp1, temp2, 8); 520 | MIX2_EMU(temp1, temp2); 521 | 522 | acc = _mm_xor_si128_emu(temp1, acc); 523 | acc = _mm_xor_si128_emu(temp2, acc); 524 | 525 | const __m128i tempa1 = _mm_load_si128_emu(prand); 526 | const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1); 527 | const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2); 528 | 529 | const __m128i tempa4 = _mm_load_si128_emu(prandex); 530 | _mm_store_si128_emu(prandex, tempa3); 531 | _mm_store_si128_emu(prand, tempa4); 532 | break; 533 | } 534 | case 0x14: 535 | { 536 | // we'll just call this one the monkins loop, inspired by Chris 537 | const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); 538 | __m128i tmp; // used by MIX2 539 | 540 | uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times 541 | __m128i *rc = prand; 542 | uint64_t aesround = 0; 543 | __m128i onekey; 544 | 545 | do 546 | { 547 | //std::cout << "acc: " << LEToHex(acc) << ", round check: " << LEToHex((selector & (0x10000000 << rounds))) << std::endl; 548 | 549 | // note that due to compiler and CPUs, we expect this to do: 550 | // if (selector & ((0x10000000 << rounds) & 0xffffffff) if rounds != 3 else selector & 0xffffffff80000000): 551 | if (selector & (0x10000000 << rounds)) 552 | { 553 | onekey = _mm_load_si128_emu(rc++); 554 | const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); 555 | const __m128i add1 = _mm_xor_si128_emu(onekey, temp2); 556 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); 557 | acc = _mm_xor_si128_emu(clprod1, acc); 558 | } 559 | else 560 | { 561 | onekey = _mm_load_si128_emu(rc++); 562 | __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? 
buftmp : pbuf); 563 | const uint64_t roundidx = aesround++ << 2; 564 | AES2_EMU(onekey, temp2, roundidx); 565 | 566 | /* 567 | std::cout << " onekey1: " << LEToHex(onekey) << std::endl; 568 | std::cout << " temp21: " << LEToHex(temp2) << std::endl; 569 | std::cout << "roundkey: " << LEToHex(rc[roundidx]) << std::endl; 570 | 571 | aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx])); 572 | 573 | std::cout << "onekey2: " << LEToHex(onekey) << std::endl; 574 | std::cout << "roundkey: " << LEToHex(rc[roundidx + 1]) << std::endl; 575 | 576 | aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 1])); 577 | 578 | std::cout << " temp22: " << LEToHex(temp2) << std::endl; 579 | std::cout << "roundkey: " << LEToHex(rc[roundidx + 2]) << std::endl; 580 | 581 | aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx + 2])); 582 | 583 | std::cout << "onekey2: " << LEToHex(onekey) << std::endl; 584 | 585 | aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 3])); 586 | 587 | std::cout << " temp22: " << LEToHex(temp2) << std::endl; 588 | */ 589 | 590 | MIX2_EMU(onekey, temp2); 591 | 592 | /* 593 | std::cout << "onekey3: " << LEToHex(onekey) << std::endl; 594 | */ 595 | 596 | acc = _mm_xor_si128_emu(onekey, acc); 597 | acc = _mm_xor_si128_emu(temp2, acc); 598 | } 599 | } while (rounds--); 600 | 601 | const __m128i tempa1 = _mm_load_si128_emu(prand); 602 | const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1); 603 | const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2); 604 | 605 | const __m128i tempa4 = _mm_load_si128_emu(prandex); 606 | _mm_store_si128_emu(prandex, tempa3); 607 | _mm_store_si128_emu(prand, tempa4); 608 | break; 609 | } 610 | case 0x18: 611 | { 612 | const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); 613 | const __m128i temp2 = _mm_load_si128_emu(prand); 614 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 615 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); 
616 | acc = _mm_xor_si128_emu(clprod1, acc); 617 | 618 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2); 619 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2); 620 | 621 | const __m128i tempb3 = _mm_load_si128_emu(prandex); 622 | _mm_store_si128_emu(prandex, tempa2); 623 | _mm_store_si128_emu(prand, tempb3); 624 | break; 625 | } 626 | case 0x1c: 627 | { 628 | const __m128i temp1 = _mm_load_si128_emu(pbuf); 629 | const __m128i temp2 = _mm_load_si128_emu(prandex); 630 | const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); 631 | const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); 632 | acc = _mm_xor_si128_emu(clprod1, acc); 633 | 634 | const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2); 635 | const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2); 636 | 637 | const __m128i tempa3 = _mm_load_si128_emu(prand); 638 | _mm_store_si128_emu(prand, tempa2); 639 | 640 | acc = _mm_xor_si128_emu(tempa3, acc); 641 | 642 | const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3); 643 | const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3); 644 | _mm_store_si128_emu(prandex, tempb2); 645 | break; 646 | } 647 | } 648 | } 649 | return acc; 650 | } 651 | 652 | // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, 653 | // returning a 64 bit hash value 654 | uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) { 655 | const unsigned int m = 128;// we process the data in chunks of 16 cache lines 656 | __m128i * rs64 = (__m128i *)random; 657 | const __m128i * string = (const __m128i *) buf; 658 | 659 | __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask); 660 | acc = _mm_xor_si128_emu(acc, lazyLengthHash_port(1024, 64)); 661 | return precompReduction64_port(acc); 662 | } 663 | -------------------------------------------------------------------------------- /verus_hash.cpp: 
-------------------------------------------------------------------------------- 1 | // (C) 2018 The Verus Developers 2 | // Distributed under the MIT software license, see the accompanying 3 | // file COPYING or http://www.opensource.org/licenses/mit-license.php. 4 | 5 | /* 6 | This provides the PoW hash function for Verus, a CPU-optimized hash 7 | function with a Haraka V2 core. Unlike Haraka, which is made for short 8 | inputs only, Verus Hash takes any length of input and produces a 256 9 | bit output. 10 | */ 11 | #include 12 | //#include "common.h" 13 | #include "verus_hash.h" 14 | 15 | void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in); 16 | 17 | void CVerusHash::Hash(void *result, const void *data, size_t _len) 18 | { 19 | unsigned char buf[128]; 20 | unsigned char *bufPtr = buf; 21 | int nextOffset = 64; 22 | uint32_t pos = 0, len = _len; 23 | unsigned char *bufPtr2 = bufPtr + nextOffset; 24 | unsigned char *ptr = (unsigned char *)data; 25 | 26 | // put our last result or zero at beginning of buffer each time 27 | memset(bufPtr, 0, 32); 28 | 29 | // digest up to 32 bytes at a time 30 | for ( ; pos < len; pos += 32) 31 | { 32 | if (len - pos >= 32) 33 | { 34 | memcpy(bufPtr + 32, ptr + pos, 32); 35 | } 36 | else 37 | { 38 | int i = (int)(len - pos); 39 | memcpy(bufPtr + 32, ptr + pos, i); 40 | memset(bufPtr + 32 + i, 0, 32 - i); 41 | } 42 | (*haraka512Function)(bufPtr2, bufPtr); 43 | bufPtr2 = bufPtr; 44 | bufPtr += nextOffset; 45 | nextOffset *= -1; 46 | } 47 | memcpy(result, bufPtr, 32); 48 | }; 49 | 50 | void CVerusHash::init() 51 | { 52 | 53 | haraka512Function = &haraka512_port_zero; 54 | 55 | } 56 | 57 | CVerusHash &CVerusHash::Write(const unsigned char *data, size_t _len) 58 | { 59 | unsigned char *tmp; 60 | uint32_t pos, len = _len; 61 | 62 | // digest up to 32 bytes at a time 63 | for ( pos = 0; pos < len; ) 64 | { 65 | uint32_t room = 32 - curPos; 66 | 67 | if (len - pos >= room) 68 | { 69 | memcpy(curBuf + 32 + 
curPos, data + pos, room); 70 | (*haraka512Function)(result, curBuf); 71 | tmp = curBuf; 72 | curBuf = result; 73 | result = tmp; 74 | pos += room; 75 | curPos = 0; 76 | } 77 | else 78 | { 79 | memcpy(curBuf + 32 + curPos, data + pos, len - pos); 80 | curPos += len - pos; 81 | pos = len; 82 | } 83 | } 84 | return *this; 85 | } 86 | 87 | // to be declared and accessed from C 88 | void verus_hash(void *result, const void *data, size_t len) 89 | { 90 | return CVerusHash::Hash(result, data, len); 91 | } 92 | 93 | void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in); 94 | void (*CVerusHashV2::haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc); 95 | void (*CVerusHashV2::haraka256Function)(unsigned char *out, const unsigned char *in); 96 | 97 | void CVerusHashV2::init() 98 | { 99 | if (IsCPUVerusOptimized()) 100 | { 101 | load_constants(); 102 | haraka512Function = &haraka512; 103 | haraka512KeyedFunction = &haraka512_keyed; 104 | haraka256Function = &haraka256; 105 | } 106 | else 107 | { 108 | // load the haraka constants 109 | load_constants_port(); 110 | haraka512Function = &haraka512_port; 111 | haraka512KeyedFunction = &haraka512_port_keyed; 112 | haraka256Function = &haraka256_port; 113 | } 114 | } 115 | 116 | void CVerusHashV2::Hash(void *result, const void *data, size_t len) 117 | { 118 | unsigned char buf[128]; 119 | unsigned char *bufPtr = buf; 120 | int pos = 0, nextOffset = 64; 121 | unsigned char *bufPtr2 = bufPtr + nextOffset; 122 | unsigned char *ptr = (unsigned char *)data; 123 | 124 | // put our last result or zero at beginning of buffer each time 125 | memset(bufPtr, 0, 32); 126 | 127 | // digest up to 32 bytes at a time 128 | for ( ; pos < len; pos += 32) 129 | { 130 | if (len - pos >= 32) 131 | { 132 | memcpy(bufPtr + 32, ptr + pos, 32); 133 | } 134 | else 135 | { 136 | int i = (int)(len - pos); 137 | memcpy(bufPtr + 32, ptr + pos, i); 138 | memset(bufPtr + 32 + i, 0, 32 - i); 139 | } 
140 | (*haraka512Function)(bufPtr2, bufPtr); 141 | bufPtr2 = bufPtr; 142 | bufPtr += nextOffset; 143 | nextOffset *= -1; 144 | } 145 | memcpy(result, bufPtr, 32); 146 | }; 147 | 148 | CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len) 149 | { 150 | unsigned char *tmp; 151 | 152 | // digest up to 32 bytes at a time 153 | for ( int pos = 0; pos < len; ) 154 | { 155 | int room = 32 - curPos; 156 | 157 | if (len - pos >= room) 158 | { 159 | memcpy(curBuf + 32 + curPos, data + pos, room); 160 | (*haraka512Function)(result, curBuf); 161 | tmp = curBuf; 162 | curBuf = result; 163 | result = tmp; 164 | pos += room; 165 | curPos = 0; 166 | } 167 | else 168 | { 169 | memcpy(curBuf + 32 + curPos, data + pos, len - pos); 170 | curPos += len - pos; 171 | pos = len; 172 | } 173 | } 174 | return *this; 175 | } 176 | 177 | // to be declared and accessed from C 178 | void verus_hash_v2(void *result, const void *data, size_t len) 179 | { 180 | return CVerusHashV2::Hash(result, data, len); 181 | } 182 | -------------------------------------------------------------------------------- /verus_hash.h: -------------------------------------------------------------------------------- 1 | // (C) 2018 Michael Toutonghi 2 | // Distributed under the MIT software license, see the accompanying 3 | // file COPYING or http://www.opensource.org/licenses/mit-license.php. 4 | 5 | /* 6 | This provides the PoW hash function for Verus, enabling CPU mining. 7 | */ 8 | 9 | #include "haraka.h" 10 | #include "haraka_portable.h" 11 | 12 | uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask); 13 | 14 | --------------------------------------------------------------------------------