├── .gitattributes
├── Makefile
├── .gitignore
├── src
    ├── c
    │   ├── noncegen_128_avx.h
    │   ├── noncegen_128_sse2.h
    │   ├── noncegen_256_avx2.h
    │   ├── noncegen_512_avx512f.h
    │   ├── common.c
    │   ├── common.h
    │   ├── .clang-format
    │   ├── sph_shabal.h
    │   ├── mshabal_128_avx.h
    │   ├── mshabal_128_sse2.h
    │   ├── mshabal_256_avx2.h
    │   ├── mshabal_512_avx512f.h
    │   ├── noncegen_128_avx.c
    │   ├── noncegen_128_sse2.c
    │   ├── noncegen_256_avx2.c
    │   └── noncegen_512_avx512f.c
    ├── buffer.rs
    ├── gpu_hasher.rs
    ├── poc_hashing.rs
    ├── writer.rs
    ├── cpu_hasher.rs
    ├── main.rs
    ├── scheduler.rs
    ├── shabal256.rs
    ├── utils.rs
    ├── plotter.rs
    └── ocl
    │   └── kernel.cl
├── .travis.yml
├── LICENSE
├── Cargo.toml
├── README.md
└── Cargo.lock


/.gitattributes:
--------------------------------------------------------------------------------
1 | src/c/* linguist-vendored
2 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | format-c:
2 | 	cd src/c && clang-format -i *
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | /target/
 4 | /.vs/
 5 | /bin/
 6 | /obj/
 7 | /packages/
 8 | 
 9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 | 
12 | .cquery_cached_index
13 | *.bat
14 | *.exe


--------------------------------------------------------------------------------
/src/c/noncegen_128_avx.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdint.h>
 4 | #include <stdlib.h>
 5 | 
 6 | void init_shabal_avx();
 7 | void noncegen_avx(char *cache, const size_t cache_size, const size_t chunk_offset,
 8 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 9 |                    const uint64_t local_nonces);
10 | 


--------------------------------------------------------------------------------
/src/c/noncegen_128_sse2.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdint.h>
 4 | #include <stdlib.h>
 5 | 
 6 | void init_shabal_sse2();
 7 | void noncegen_sse2(char *cache, const size_t cache_size, const size_t chunk_offset,
 8 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 9 |                    const uint64_t local_nonces);
10 | 


--------------------------------------------------------------------------------
/src/c/noncegen_256_avx2.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdint.h>
 4 | #include <stdlib.h>
 5 | 
 6 | void init_shabal_avx2();
 7 | void noncegen_avx2(char *cache, const size_t cache_size, const size_t chunk_offset,
 8 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 9 |                    const uint64_t local_nonces);
10 | 


--------------------------------------------------------------------------------
/src/c/noncegen_512_avx512f.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdint.h>
 4 | #include <stdlib.h>
 5 | 
 6 | void init_shabal_avx512f();
 7 | void noncegen_avx512(char *cache, const size_t cache_size, const size_t chunk_offset,
 8 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 9 |                    const uint64_t local_nonces);
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: rust
 2 | rust:
 3 |   - stable
 4 | 
 5 | matrix:
 6 |   include:
 7 |     - os: linux
 8 |       dist: trusty
 9 |       sudo: required
10 |       addons:
11 |          apt:
12 |            sources:
13 |              - ubuntu-toolchain-r-test
14 |            packages:
15 |             - g++-4.9
16 |       env:
17 |          - CC=gcc-4.9      
18 |     - os: osx
19 |       
20 | fast_finish: true
21 | 
22 | 


--------------------------------------------------------------------------------
/src/c/common.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include <string.h>
 3 | 
 4 | void write_seed(char seed[32], uint64_t numeric_id) {
 5 |     // Result: bytes 0-7 = byte-swapped numeric_id, byte 16 = 0x80, every other byte = 0.
 6 |     const uint64_t swapped_id = bswap_64(numeric_id);
 7 |     memset(seed, 0, 32);
 8 |     memcpy(seed, &swapped_id, sizeof(swapped_id));
 9 |     seed[16] = -128;  // shabal message termination bit
10 | }
11 | 
12 | void write_term(char term[32]) {
13 |     memset(term, 0, 32);  // clear the whole 32-byte block first
14 |     term[0] = -128;       // shabal message termination bit
15 | }
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 PoC Consortium
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "engraver"
 3 | version = "2.5.0"
 4 | license = "GPL-3.0"  # NOTE(review): the LICENSE file and the README badge both say MIT - reconcile
 5 | authors = ["PoC Consortium <bots@cryptoguru.org>"]
 6 | description = """
 7 | Engraver - a PoC2 plotter written in Rust
 8 | """
 9 | repository = "https://github.com/PoC-Consortium/engraver"
10 | documentation = "https://github.com/PoC-Consortium/engraver"
11 | keywords = ["poc2", "plotter", "rust","cryptocurrency"]
12 | readme = "README.md"
13 | edition = "2018"
14 | 
15 | [features]
16 | opencl = ["ocl-core"]
17 | simd=[]
18 | 
19 | [dependencies]
20 | crossbeam-channel = "0.3.6"
21 | ocl-core = { version = "0.11.1", optional = true } 
22 | clap = "2.32.0"
23 | raw-cpuid = "6.1.0"
24 | sys-info = "0.5.6"
25 | cfg-if = "0.1.6"
26 | pbr = "1.0.1"
27 | humanize-rs = "0.1.5"
28 | libc = "0.2.46"
29 | rayon = "1.0.3"
30 | core_affinity = "0.5.9"
31 | stopwatch = "0.0.7"
32 | fs2 = "0.4.3"
33 | page_size = "0.4.1"
34 | aligned_alloc = "0.1.3"
35 | 
36 | [target.'cfg(target_os = "linux")'.dependencies]  # bare `cfg(linux)` is never set, so it never matched
37 | thread-priority = "0.1.0"
38 | 
39 | [target.'cfg(windows)'.dependencies]
40 | winapi = { version = "0.3", features = ["std","fileapi","securitybaseapi"] }
41 | 
42 | [build-dependencies]
43 | cc = "1.0"
44 | 
45 | [dev-dependencies]
46 | rust-crypto = "0.2.36"
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img align="right" src="https://i.imgur.com/PJsPNSG.png" height="200">
 2 |  
 3 |  [![Build Status](https://travis-ci.org/PoC-Consortium/engraver.svg?branch=master)](https://travis-ci.org/PoC-Consortium/engraver) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 4 | 
 5 | # Engraver - PoC2 plotter in Rust
 6 | 
 7 | ### Features
 8 | - windows, linux, unix & macOS
 9 | - x86 32&64bit 
10 | - direct and async i/o
11 | - SIMD support: sse2, avx, avx2, avx512f
12 | - gpu support
13 | - fastest plotter there is
14 | 
15 | ### Requirements
16 | - new version of rust [stable toolchain]
17 | 
18 | ### Compile, test, ...
19 | 
20 | Binaries are in **target/debug** or **target/release** depending on optimization.
21 | 
22 | ``` shell
23 | # build debug and run directly
24 | cargo run [--features=opencl]
25 | 
26 | # build debug (unoptimized)
27 | cargo build [--features=opencl]
28 | 
29 | # build release (optimized)
30 | cargo build --release [--features=opencl]
31 | ```
32 | 
33 | ### Run
34 | 
35 | ```shell
36 | engraver --help
37 | ```
38 | 
39 | ### Donate 
40 | * JohnnyDeluxe: BURST-S338-R6VC-LTFA-2GC6G
41 |   - shabal optimizations
42 |   - windows support
43 | * bold: BURST-8V9Y-58B4-RVWP-8HQAV
44 |   - architecture
45 |   - linux support
46 | 
47 | 


--------------------------------------------------------------------------------
/src/buffer.rs:
--------------------------------------------------------------------------------
 1 | use aligned_alloc::{aligned_alloc, aligned_free};
 2 | use std::sync::{Arc, Mutex};
 3 | 
 4 | pub struct PageAlignedByteBuffer {
 5 |     data: Option<Arc<Mutex<Vec<u8>>>>, // shared view of the allocation; `Option` so that `drop` can `take()` it
 6 |     pointer: *mut (), // raw allocation from `aligned_alloc`, released with `aligned_free` in `drop`
 7 | }
 8 | 
 9 | impl PageAlignedByteBuffer {
10 |     /// Allocates `buffer_size` bytes aligned to the system page size and exposes them as a
11 |     /// shared `Vec<u8>`. Panics if the allocation fails instead of wrapping a null pointer.
12 |     pub fn new(buffer_size: usize) -> Self {
13 |         let pointer = aligned_alloc(buffer_size, page_size::get());
14 |         assert!(!pointer.is_null(), "failed to allocate {} page-aligned bytes", buffer_size);
15 |         // SAFETY: `pointer` is non-null and spans `buffer_size` bytes; `Drop` never runs the
16 |         // `Vec` destructor, so the memory is only ever released through `aligned_free`.
17 |         let data = unsafe { Vec::from_raw_parts(pointer as *mut u8, buffer_size, buffer_size) };
18 |         PageAlignedByteBuffer { data: Some(Arc::new(Mutex::new(data))), pointer }
19 |     }
20 | 
21 |     /// Returns another handle to the buffer; handles must not be used after `self` is dropped.
22 |     pub fn get_buffer(&self) -> Arc<Mutex<Vec<u8>>> {
23 |         self.data.as_ref().unwrap().clone()
24 |     }
25 | }
26 | 
27 | impl Drop for PageAlignedByteBuffer {
28 |     fn drop(&mut self) {
29 |         // Never drop the `Vec` (`aligned_free` owns its memory); unwrapping first releases the
30 |         // `Arc`/`Mutex` allocations. With live clones `try_unwrap` fails and the `Arc` is forgotten.
31 |         std::mem::forget(Arc::try_unwrap(self.data.take().unwrap()).map(Mutex::into_inner));
32 |         unsafe { aligned_free(self.pointer) };
33 |     }
34 | }
35 | 
36 | unsafe impl Send for PageAlignedByteBuffer {} // the raw `pointer` field blocks the automatic `Send` impl
37 | 
38 | #[cfg(test)]
39 | mod buffer_tests {
40 |     use super::PageAlignedByteBuffer;
41 | 
42 |     /// A new buffer reports exactly the requested length, and dropping it must not crash.
43 |     #[test]
44 |     fn buffer_creation_destruction_test() {
45 |         let buffer = PageAlignedByteBuffer::new(1024 * 1024);
46 |         assert_eq!(buffer.get_buffer().lock().unwrap().len(), 1024 * 1024);
47 |         drop(buffer);
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/c/common.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | 
 3 | #pragma once
 4 | 
 5 | #ifdef _MSC_VER
 6 | 
 7 | #include <stdlib.h>
 8 | #define bswap_32(x) _byteswap_ulong(x)
 9 | #define bswap_64(x) _byteswap_uint64(x)
10 | 
11 | #elif defined(__APPLE__)
12 | 
13 | // Mac OS X / Darwin features
14 | #include <libkern/OSByteOrder.h>
15 | #define bswap_32(x) OSSwapInt32(x)
16 | #define bswap_64(x) OSSwapInt64(x)
17 | 
18 | #elif defined(__sun) || defined(sun)
19 | 
20 | #include <sys/byteorder.h>
21 | #define bswap_32(x) BSWAP_32(x)
22 | #define bswap_64(x) BSWAP_64(x)
23 | 
24 | #elif defined(__FreeBSD__)
25 | 
26 | #include <sys/endian.h>
27 | #define bswap_32(x) bswap32(x)
28 | #define bswap_64(x) bswap64(x)
29 | 
30 | #elif defined(__OpenBSD__)
31 | 
32 | #include <sys/types.h>
33 | #define bswap_32(x) swap32(x)
34 | #define bswap_64(x) swap64(x)
35 | 
36 | #elif defined(__NetBSD__)
37 | 
38 | #include <machine/bswap.h>
39 | #include <sys/types.h>
40 | #if defined(__BSWAP_RENAME) && !defined(__bswap_32)
41 | #define bswap_32(x) bswap32(x)
42 | #define bswap_64(x) bswap64(x)
43 | #endif
44 | 
45 | #else
46 | 
47 | #include <byteswap.h>
48 | 
49 | #endif
50 | 
51 | #define HASH_SIZE 32
52 | #define HASH_CAP 4096
53 | #define NUM_SCOOPS 4096
54 | #define SCOOP_SIZE 64
55 | #define NONCE_SIZE (HASH_CAP * SCOOP_SIZE)  // 4096*64
56 | 
57 | void write_seed(char seed[32], uint64_t numeric_id);
58 | 
59 | void write_term(char term[32]);
60 | 
61 | #define SET_BEST_DEADLINE(d, o) /* records (d, o) when d is below the current best */ \
62 |     if ((d) < *best_deadline) { /* best_deadline/best_offset: pointers taken from the caller's scope */ \
63 |         *best_deadline = (d);   /* note: d is evaluated a second time here */ \
64 |         *best_offset = (o);     \
65 |     } /* expands to a bare if statement: brace the call site if an else may follow */
66 | 


--------------------------------------------------------------------------------
/src/gpu_hasher.rs:
--------------------------------------------------------------------------------
 1 | use crate::cpu_hasher::SafePointer;
 2 | use crate::ocl::{gpu_hash, gpu_hash_and_transfer_to_host, gpu_transfer_to_host, GpuContext};
 3 | use crossbeam_channel::Receiver;
 4 | use std::sync::mpsc::Sender;
 5 | use std::sync::{Arc, Mutex};
 6 | 
 7 | pub struct GpuTask {
 8 |     pub cache: SafePointer, // pointer wrapper; presumably the host buffer that receives the results - confirm
 9 |     pub cache_size: u64, // size of that buffer; the unit is not visible here - TODO confirm
10 |     pub chunk_offset: u64, // position inside the buffer; the unit is not visible here - TODO confirm
11 |     pub numeric_id: u64, // numeric account id
12 |     pub local_startnonce: u64, // nonce to start generation at
13 |     pub local_nonces: u64, // number of nonces to generate; 0 marks the closing "just transfer" task
14 | }
15 | 
16 | /// Builds the worker closure for GPU `gpu_id`. It processes `GpuTask`s from `rx_hasher_task`
17 | /// until it receives `None` or the channel disconnects; every regular call hashes the new
18 | /// task and transfers the previous one, alternating between buffer ids 0 and 1.
19 | /// The scheduler gets `(gpu_id, 1, 0)` over `tx` once a task has been handed to the GPU and
20 | /// `(gpu_id, 0, n)` once the previous task (`n` = its `local_nonces`) has been transferred.
21 | /// A task with `local_nonces == 0` only flushes the transfer that is still outstanding.
22 | pub fn create_gpu_hasher_thread(
23 |     gpu_id: u8,
24 |     gpu_context: Arc<Mutex<GpuContext>>,
25 |     tx: Sender<(u8, u8, u64)>,
26 |     rx_hasher_task: Receiver<Option<GpuTask>>,
27 | ) -> impl FnOnce() {
28 |     move || {
29 |         let notify = |kind: u8, nonces: u64| {
30 |             tx.send((gpu_id, kind, nonces))
31 |                 .expect("GPU task can't communicate with scheduler thread.")
32 |         };
33 |         let mut pipeline_empty = true;
34 |         let mut buffer_id = 0u8;
35 |         // placeholder only: always overwritten before a transfer reads it
36 |         let mut previous = GpuTask {
37 |             cache: SafePointer { ptr: &mut 0u8 },
38 |             cache_size: 0,
39 |             chunk_offset: 0,
40 |             numeric_id: 0,
41 |             local_startnonce: 0,
42 |             local_nonces: 0,
43 |         };
44 |         while let Ok(Some(task)) = rx_hasher_task.recv() {
45 |             match (pipeline_empty, task.local_nonces) {
46 |                 (true, 0) => {} // nothing outstanding and nothing to hash
47 |                 // first run - just hash
48 |                 (true, _) => {
49 |                     pipeline_empty = false;
50 |                     gpu_hash(&gpu_context, &task);
51 |                     buffer_id = 1 - buffer_id;
52 |                     previous = task;
53 |                     notify(1, 0);
54 |                 }
55 |                 // last run - just transfer
56 |                 (false, 0) => {
57 |                     gpu_transfer_to_host(&gpu_context, buffer_id, &previous);
58 |                     pipeline_empty = true;
59 |                     buffer_id = 0;
60 |                     notify(0, previous.local_nonces);
61 |                 }
62 |                 // normal run - hash and transfer async
63 |                 (false, _) => {
64 |                     gpu_hash_and_transfer_to_host(&gpu_context, buffer_id, &task, &previous);
65 |                     buffer_id = 1 - buffer_id;
66 |                     notify(0, previous.local_nonces);
67 |                     previous = task;
68 |                     notify(1, 0);
69 |                 }
70 |             }
71 |         }
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/src/c/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language:        Cpp
  3 | # BasedOnStyle:  Google
  4 | AccessModifierOffset: -1
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlines: Left
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: false
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: All
 15 | AllowShortIfStatementsOnASingleLine: true
 16 | AllowShortLoopsOnASingleLine: true
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: true
 20 | AlwaysBreakTemplateDeclarations: true
 21 | BinPackArguments: true
 22 | BinPackParameters: true
 23 | BraceWrapping:
 24 |   AfterClass:      false
 25 |   AfterControlStatement: false
 26 |   AfterEnum:       false
 27 |   AfterFunction:   false
 28 |   AfterNamespace:  false
 29 |   AfterObjCDeclaration: false
 30 |   AfterStruct:     false
 31 |   AfterUnion:      false
 32 |   AfterExternBlock: false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 |   SplitEmptyFunction: true
 37 |   SplitEmptyRecord: true
 38 |   SplitEmptyNamespace: true
 39 | BreakBeforeBinaryOperators: None
 40 | BreakBeforeBraces: Attach
 41 | BreakBeforeInheritanceComma: false
 42 | BreakBeforeTernaryOperators: true
 43 | BreakConstructorInitializersBeforeComma: false
 44 | BreakConstructorInitializers: BeforeColon
 45 | BreakAfterJavaFieldAnnotations: false
 46 | BreakStringLiterals: true
 47 | ColumnLimit:     100
 48 | CommentPragmas:  '^ IWYU pragma:'
 49 | CompactNamespaces: false
 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 51 | ConstructorInitializerIndentWidth: 4
 52 | ContinuationIndentWidth: 4
 53 | Cpp11BracedListStyle: true
 54 | DerivePointerAlignment: true
 55 | DisableFormat:   false
 56 | ExperimentalAutoDetectBinPacking: false
 57 | FixNamespaceComments: true
 58 | ForEachMacros:
 59 |   - foreach
 60 |   - Q_FOREACH
 61 |   - BOOST_FOREACH
 62 | IncludeBlocks:   Preserve
 63 | IncludeCategories:
 64 |   - Regex:           '^<ext/.*\.h>'
 65 |     Priority:        2
 66 |   - Regex:           '^<.*\.h>'
 67 |     Priority:        1
 68 |   - Regex:           '^<.*'
 69 |     Priority:        2
 70 |   - Regex:           '.*'
 71 |     Priority:        3
 72 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 73 | IndentCaseLabels: true
 74 | IndentPPDirectives: None
 75 | IndentWidth:     4
 76 | IndentWrappedFunctionNames: false
 77 | JavaScriptQuotes: Leave
 78 | JavaScriptWrapImports: true
 79 | KeepEmptyLinesAtTheStartOfBlocks: false
 80 | MacroBlockBegin: ''
 81 | MacroBlockEnd:   ''
 82 | MaxEmptyLinesToKeep: 1
 83 | NamespaceIndentation: None
 84 | ObjCBlockIndentWidth: 2
 85 | ObjCSpaceAfterProperty: false
 86 | ObjCSpaceBeforeProtocolList: false
 87 | PenaltyBreakAssignment: 2
 88 | PenaltyBreakBeforeFirstCallParameter: 1
 89 | PenaltyBreakComment: 300
 90 | PenaltyBreakFirstLessLess: 120
 91 | PenaltyBreakString: 1000
 92 | PenaltyExcessCharacter: 1000000
 93 | PenaltyReturnTypeOnItsOwnLine: 200
 94 | PointerAlignment: Left
 95 | RawStringFormats:
 96 |   - Delimiter:       pb
 97 |     Language:        TextProto
 98 |     BasedOnStyle:    google
 99 | ReflowComments:  true
100 | SortIncludes:    true
101 | SortUsingDeclarations: true
102 | SpaceAfterCStyleCast: false
103 | SpaceAfterTemplateKeyword: true
104 | SpaceBeforeAssignmentOperators: true
105 | SpaceBeforeParens: ControlStatements
106 | SpaceInEmptyParentheses: false
107 | SpacesBeforeTrailingComments: 2
108 | SpacesInAngles:  false
109 | SpacesInContainerLiterals: true
110 | SpacesInCStyleCastParentheses: false
111 | SpacesInParentheses: false
112 | SpacesInSquareBrackets: false
113 | Standard:        Auto
114 | TabWidth:        8
115 | UseTab:          Never
116 | ...
117 | 


--------------------------------------------------------------------------------
/src/poc_hashing.rs:
--------------------------------------------------------------------------------
  1 | use crate::shabal256::shabal256_fast;
  2 | 
  3 | const HASH_SIZE: usize = 32;
  4 | const HASH_CAP: usize = 4096;
  5 | const NUM_SCOOPS: usize = 4096;
  6 | const SCOOP_SIZE: usize = 64;
  7 | const NONCE_SIZE: usize = NUM_SCOOPS * SCOOP_SIZE;
  8 | const MESSAGE_SIZE: usize = 16;
  9 | 
 10 | /// Pure-Rust nonce generator: creates `local_nonces` nonces, the first one being nonce
 11 | /// `local_startnonce`, and stores them in `cache` in PoC2 order.
 12 | /// Hashes are computed by `shabal256_fast` from a data slice plus a pre-built termination
 13 | /// string.
 14 | ///
 15 | /// * `cache`            - buffer to save to, laid out as `NUM_SCOOPS` rows that each hold one
 16 | ///                        `SCOOP_SIZE` slot per nonce (`cache.len() / NONCE_SIZE` slots per row)
 17 | /// * `cache_offset`     - slot index (within a row) used for the first generated nonce
 18 | /// * `numeric_id`       - numeric account id
 19 | /// * `local_startnonce` - nonce to start generation at
 20 | /// * `local_nonces`     - number of nonces to generate
 21 | ///
 22 | /// Panics if one of the generated slots lies outside of `cache`.
 23 | pub fn noncegen_rust(
 24 |     cache: &mut [u8],
 25 |     cache_offset: usize,
 26 |     numeric_id: u64,
 27 |     local_startnonce: u64,
 28 |     local_nonces: u64,
 29 | ) {
 30 |     // account id as two words in big-endian byte order
 31 |     let id_words: [u32; 2] = unsafe { std::mem::transmute(numeric_id.to_be()) };
 32 | 
 33 |     let mut plot = [0u8; NONCE_SIZE];
 34 | 
 35 |     // prepare termination strings; 0x80 is the message termination bit
 36 |     // term_seed: id + nonce
 37 |     let mut term_seed = [0u32; MESSAGE_SIZE];
 38 |     term_seed[0..2].copy_from_slice(&id_words);
 39 |     term_seed[4] = 0x80;
 40 | 
 41 |     // term_hash_seed: 256 bit data + id + nonce
 42 |     let mut term_hash_seed = [0u32; MESSAGE_SIZE];
 43 |     term_hash_seed[8..10].copy_from_slice(&id_words);
 44 |     term_hash_seed[12] = 0x80;
 45 | 
 46 |     // term_empty: termination bit only
 47 |     let mut term_empty = [0u32; MESSAGE_SIZE];
 48 |     term_empty[0] = 0x80;
 49 | 
 50 |     // destination geometry, identical for every nonce
 51 |     let cache_size = cache.len() / NONCE_SIZE;
 52 |     let row_bytes = cache_size * SCOOP_SIZE;
 53 | 
 54 |     for n in 0..local_nonces {
 55 |         // generate nonce number & change endianness
 56 |         let nonce_words: [u32; 2] =
 57 |             unsafe { std::mem::transmute((local_startnonce + n).to_be()) };
 58 | 
 59 |         // store nonce number in the termination strings that carry the seed
 60 |         term_seed[2..4].copy_from_slice(&nonce_words);
 61 |         term_hash_seed[10..12].copy_from_slice(&nonce_words);
 62 | 
 63 |         // The nonce is filled back to front, one hash per round.
 64 |         // round 1: the hash of the seed alone goes into the last slot
 65 |         let seed_hash = shabal256_fast(&[], &term_seed);
 66 |         plot[NONCE_SIZE - HASH_SIZE..].copy_from_slice(&seed_hash);
 67 | 
 68 |         // that first hash is also the data part of the "256 bit data + seed" termination
 69 |         let seed_hash_words = unsafe { std::mem::transmute::<[u8; 32], [u32; 8]>(seed_hash) };
 70 |         term_hash_seed[0..8].copy_from_slice(&seed_hash_words);
 71 | 
 72 |         // round 2 - 128: hash everything generated so far, followed by the seed
 73 |         let early_rounds = NONCE_SIZE - HASH_CAP + HASH_SIZE..=NONCE_SIZE - HASH_SIZE;
 74 |         for pos in early_rounds.rev().step_by(HASH_SIZE) {
 75 |             // if the data divides into 512 bit packages without a remainder the last msg is
 76 |             // seed + termination, otherwise it is 256 bit data + seed + termination
 77 |             let termination = if pos % 64 == 0 { &term_seed } else { &term_hash_seed };
 78 |             let hash = shabal256_fast(&plot[pos..NONCE_SIZE], termination);
 79 |             plot[pos - HASH_SIZE..pos].copy_from_slice(&hash);
 80 |         }
 81 | 
 82 |         // remaining rounds: hash only the HASH_CAP bytes that follow, without the seed
 83 |         for pos in (HASH_SIZE..=NONCE_SIZE - HASH_CAP).rev().step_by(HASH_SIZE) {
 84 |             let hash = shabal256_fast(&plot[pos..pos + HASH_CAP], &term_empty);
 85 |             plot[pos - HASH_SIZE..pos].copy_from_slice(&hash);
 86 |         }
 87 | 
 88 |         // generate final hash over the whole nonce and XOR it onto every hash
 89 |         let final_hash = shabal256_fast(&plot[0..NONCE_SIZE], &term_seed);
 90 |         for block in plot.chunks_mut(HASH_SIZE) {
 91 |             for (byte, mask) in block.iter_mut().zip(final_hash.iter()) {
 92 |                 *byte ^= *mask;
 93 |             }
 94 |         }
 95 | 
 96 |         // PoC2 shuffle: the first half of scoop `i` stays in row `i`, the second half is
 97 |         // stored in the mirrored row `NUM_SCOOPS - 1 - i`
 98 |         let column = (n as usize + cache_offset) * SCOOP_SIZE;
 99 |         for (scoop, data) in plot.chunks(SCOOP_SIZE).enumerate() {
100 |             let (first_half, second_half) = data.split_at(HASH_SIZE);
101 |             let offset = scoop * row_bytes + column;
102 |             cache[offset..offset + HASH_SIZE].copy_from_slice(first_half);
103 |             let mirror_offset = (NUM_SCOOPS - 1 - scoop) * row_bytes + column + HASH_SIZE;
104 |             cache[mirror_offset..mirror_offset + HASH_SIZE].copy_from_slice(second_half);
105 |         }
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/src/c/sph_shabal.h:
--------------------------------------------------------------------------------
  1 | /* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
  2 | /**
  3 |  * Shabal interface. Shabal is a family of functions which differ by
  4 |  * their output size; this implementation defines Shabal for output
  5 |  * sizes 192, 224, 256, 384 and 512 bits.
  6 |  *
  7 |  * ==========================(LICENSE BEGIN)============================
  8 |  *
  9 |  * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 10 |  *
 11 |  * Permission is hereby granted, free of charge, to any person obtaining
 12 |  * a copy of this software and associated documentation files (the
 13 |  * "Software"), to deal in the Software without restriction, including
 14 |  * without limitation the rights to use, copy, modify, merge, publish,
 15 |  * distribute, sublicense, and/or sell copies of the Software, and to
 16 |  * permit persons to whom the Software is furnished to do so, subject to
 17 |  * the following conditions:
 18 |  *
 19 |  * The above copyright notice and this permission notice shall be
 20 |  * included in all copies or substantial portions of the Software.
 21 |  *
 22 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 23 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 24 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 25 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 26 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 27 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 28 |  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 29 |  *
 30 |  * ===========================(LICENSE END)=============================
 31 |  *
 32 |  * @file     sph_shabal.h
 33 |  * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 34 |  */
 35 | 
 36 | #ifndef SPH_SHABAL_H__
 37 | #define SPH_SHABAL_H__
 38 | 
 39 | #include <stddef.h>
 40 | #include "sph_types.h"
 41 | 
 42 | #ifdef __cplusplus
 43 | extern "C" {
 44 | #endif
 45 | 
 46 | /**
 47 |  * Output size (in bits) for Shabal-256.
 48 |  */
 49 | #define SPH_SIZE_shabal256 256
 50 | 
 51 | /**
 52 |  * This structure is a context for Shabal computations: it contains the
 53 |  * intermediate values and some data from the last entered block. Once
 54 |  * a Shabal computation has been performed, the context can be reused for
 55 |  * another computation.
 56 |  *
 57 |  * The contents of this structure are private. A running Shabal computation
 58 |  * can be cloned by copying the context (e.g. with a simple
 59 |  * <code>memcpy()</code>).
 60 |  */
 61 | typedef struct {
 62 | #ifndef DOXYGEN_IGNORE
 63 |     unsigned char buf[64]; /* first field, for alignment */
 64 |     size_t ptr; /* presumably the number of bytes currently held in buf - confirm in sph_shabal.c */
 65 |     sph_u32 A[12], B[16], C[16]; /* intermediate values of the running computation */
 66 |     sph_u32 Whigh, Wlow; /* presumably the high and low halves of one counter - confirm in sph_shabal.c */
 67 | #endif
 68 | } sph_shabal_context;
 69 | 
 70 | /**
 71 |  * Type for a Shabal-256 context (identical to the common context).
 72 |  */
 73 | typedef sph_shabal_context sph_shabal256_context;
 74 | 
 75 | /**
 76 |  * Initialize a Shabal-256 context. This process performs no memory
 77 |  * allocation.
 78 |  *
 79 |  * @param cc   the Shabal-256 context (pointer to a
 80 |  *             <code>sph_shabal256_context</code>)
 81 |  */
 82 | void sph_shabal256_init(sph_shabal_context* cc);
 83 | 
 84 | /**
 85 |  * Process some data bytes. It is acceptable that <code>len</code> is zero
 86 |  * (in which case this function does nothing).
 87 |  *
 88 |  * @param cc     the Shabal-256 context
 89 |  * @param data   the input data
 90 |  * @param len    the input data length (in bytes)
 91 |  */
 92 | void sph_shabal256(void* cc, const unsigned char* data, size_t len);
 93 | 
 94 | /**
 95 |  * Terminate the current Shabal-256 computation and output the result into
 96 |  * the provided buffer. The destination buffer must be wide enough to
 97 |  * accommodate the result (32 bytes). The context is automatically
 98 |  * reinitialized.
 99 |  *
100 |  * @param cc    the Shabal-256 context
101 |  * @param dst   the destination buffer
102 |  */
103 | void sph_shabal256_close(void* cc, void* dst);
104 | 
105 | /**
106 |  * Add a few additional bits (0 to 7) to the current computation, then
107 |  * terminate it and output the result in the provided buffer, which must
108 |  * be wide enough to accommodate the result (32 bytes). If bit number i
109 |  * in <code>ub</code> has value 2^i, then the extra bits are those
110 |  * numbered 7 downto 8-n (this is the big-endian convention at the byte
111 |  * level). The context is automatically reinitialized.
112 |  *
113 |  * @param cc    the Shabal-256 context
114 |  * @param ub    the extra bits
115 |  * @param n     the number of extra bits (0 to 7)
116 |  * @param dst   the destination buffer
117 |  */
118 | void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst);
119 | 
120 | /*
121 |  * optimised Shabal routine for PoC plotting and hashing
122 |  */
123 | void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num);
124 | 
125 | /*
126 |  * optimised Shabal routine for PoC mining
127 |  */
128 | void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst);
129 | 
130 | #ifdef __cplusplus
131 | }
132 | #endif
133 | #endif
134 | 


--------------------------------------------------------------------------------
/src/writer.rs:
--------------------------------------------------------------------------------
  1 | use crate::plotter::{PlotterTask, NONCE_SIZE, SCOOP_SIZE};
  2 | use crate::buffer::PageAlignedByteBuffer;
  3 | use crate::utils::{open, open_r, open_using_direct_io};
  4 | use crossbeam_channel::{Receiver, Sender};
  5 | use std::cmp::min;
  6 | use std::io::{Read, Seek, SeekFrom, Write, Error, ErrorKind};
  7 | use std::path::Path;
  8 | use std::sync::Arc;
  9 | 
 10 | const TASK_SIZE: u64 = 16384;
 11 | /// Builds the writer-thread closure: drains filled buffers, writes them to the plot file, and hands each buffer back.
 12 | pub fn create_writer_thread(
 13 |     task: Arc<PlotterTask>, // plot parameters read below: output_path, numeric_id, start_nonce, nonces, direct_io, benchmark
 14 |     mut nonces_written: u64, // running count of nonces written; the initial value is supplied by the caller
 15 |     mut pb: Option<pbr::ProgressBar<pbr::Pipe>>, // optional progress bar
 16 |     rx_buffers_to_writer: Receiver<PageAlignedByteBuffer>, // filled buffers waiting to be written
 17 |     tx_empty_buffers: Sender<PageAlignedByteBuffer>, // buffers are sent back here once processed
 18 | ) -> impl FnOnce() {
 19 |     move || {
 20 |         for buffer in rx_buffers_to_writer {
 21 |             let mut_bs = &buffer.get_buffer();
 22 |             let bs = mut_bs.lock().unwrap();
 23 |             let buffer_size = (*bs).len() as u64;
 24 |             let nonces_to_write = min(buffer_size / NONCE_SIZE, task.nonces - nonces_written);
 25 |             // target file: <output_path>/<numeric_id>_<start_nonce>_<nonces>
 26 |             let filename = Path::new(&task.output_path).join(format!(
 27 |                 "{}_{}_{}",
 28 |                 task.numeric_id, task.start_nonce, task.nonces
 29 |             ));
 30 |             if !task.benchmark {
 31 |                 let file = if task.direct_io {
 32 |                     open_using_direct_io(&filename)
 33 |                 } else {
 34 |                     open(&filename)
 35 |                 };
 36 |                 // a failed open panics the writer thread (unwrap below)
 37 |                 let mut file = file.unwrap();
 38 |                 // the buffer is written out one scoop at a time, 4096 scoops in total
 39 |                 for scoop in 0..4096 {
 40 |                     let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE;
 41 |                     seek_addr += nonces_written as u64 * SCOOP_SIZE;
 42 |                     // seek_addr = start of this scoop's region in the file + the nonces already written
 43 |                     file.seek(SeekFrom::Start(seek_addr)).unwrap();
 44 |                     // local_addr = where this scoop's data starts inside the buffer
 45 |                     let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE;
 46 |                     for _ in 0..nonces_to_write / TASK_SIZE {
 47 |                         file.write_all(
 48 |                             &bs[local_addr as usize
 49 |                                 ..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize],
 50 |                         )
 51 |                         .unwrap();
 52 | 
 53 |                         local_addr += TASK_SIZE * SCOOP_SIZE;
 54 |                     }
 55 | 
 56 |                     // write remainder
 57 |                     if nonces_to_write % TASK_SIZE > 0 {
 58 |                         file.write_all(
 59 |                             &bs[local_addr as usize
 60 |                                 ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE)
 61 |                                     as usize],
 62 |                         )
 63 |                         .unwrap();
 64 |                     }
 65 |                     // advance the progress bar once every 128 scoops
 66 |                     if (scoop + 1) % 128 == 0 {
 67 |                         match &mut pb {
 68 |                             Some(pb) => {
 69 |                                 pb.add(nonces_to_write * SCOOP_SIZE * 128);
 70 |                             }
 71 |                             None => (),
 72 |                         }
 73 |                     }
 74 |                 }
 75 |             }
 76 |             nonces_written += nonces_to_write;
 77 | 
 78 |             // thread end
 79 |             if task.nonces == nonces_written {
 80 |                 match &mut pb {
 81 |                     Some(pb) => {
 82 |                         pb.finish_print("Writer done.");
 83 |                     }
 84 |                     None => (),
 85 |                 }
 86 |                 tx_empty_buffers.send(buffer).unwrap();
 87 |                 break;
 88 |             }
 89 |             // not finished yet: record the progress so far (skipped in benchmark mode), then recycle the buffer
 90 |             if !task.benchmark {
 91 |                 if write_resume_info(&filename, nonces_written).is_err() {
 92 |                     println!("Error: couldn't write resume info");
 93 |                 }
 94 |             }
 95 |             tx_empty_buffers.send(buffer).unwrap();
 96 |         }
 97 |     }
 98 | }
 99 | 
100 | /// Reads the resume record kept in the last 8 bytes of a plot file.
101 | ///
102 | /// The record is a little-endian u32 progress counter followed by the marker
103 | /// bytes AF FE AF FE. Returns the counter widened to u64; a missing marker
104 | /// yields an `ErrorKind::Other` error and I/O errors are propagated as-is.
105 | pub fn read_resume_info(file: &Path) -> Result<u64, Error> {
106 |     let mut plot_file = open_r(&file)?;
107 |     plot_file.seek(SeekFrom::End(-8))?;
108 |     let mut counter = [0u8; 4];
109 |     let mut marker = [0u8; 4];
110 |     plot_file.read_exact(&mut counter)?;
111 |     plot_file.read_exact(&mut marker)?;
112 |     if marker != [0xAF, 0xFE, 0xAF, 0xFE] {
113 |         return Err(Error::new(ErrorKind::Other, "End marker not found"));
114 |     }
115 |     Ok(u64::from(as_u32_le(counter)))
116 | }
117 | 
118 | /// Stores the resume record (little-endian u32 nonce count + marker AF FE AF FE) in the file's last 8 bytes.
119 | /// Returns `ErrorKind::InvalidInput` when `nonces_written` does not fit in u32, rather than storing a truncated count.
120 | pub fn write_resume_info(file: &Path, nonces_written: u64) -> Result<(), Error> {
121 |     if nonces_written > u64::from(u32::max_value()) {
122 |         return Err(Error::new(ErrorKind::InvalidInput, "nonce count exceeds resume record range"));
123 |     }
124 |     let mut file = open(&file)?;
125 |     file.seek(SeekFrom::End(-8))?;
126 |     file.write_all(&as_u8_le(nonces_written as u32))?;
127 |     file.write_all(&[0xAF, 0xFE, 0xAF, 0xFE])
128 | }
129 | 
130 | /// Decodes four little-endian bytes into a u32
131 | /// (array[0] is the least significant byte).
132 | fn as_u32_le(array: [u8; 4]) -> u32 {
133 |     // byte order is fixed here, independent of the host's endianness
134 |     u32::from_le_bytes(array)
135 | }
136 | 
137 | /// Encodes a u32 as four little-endian bytes
138 | /// (element 0 holds the least significant byte).
139 | fn as_u8_le(x: u32) -> [u8; 4] {
140 |     // to_le_bytes yields the same byte order on every host,
141 |     // so the encoded record does not depend on the machine
142 |     x.to_le_bytes()
143 | }
144 | 


--------------------------------------------------------------------------------
/src/c/mshabal_128_avx.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * A parallel implementation of Shabal, for platforms with AVX.
  3 |  *
  4 |  * This is the header file for an implementation of the Shabal family
  5 |  * of hash functions, designed for maximum parallel speed. It processes
  6 |  * up to four instances of Shabal in parallel, using the AVX unit.
  7 |  * Total bandwidth appears to be up to twice that of a plain 32-bit
  8 |  * Shabal implementation.
  9 |  *
 10 |  * A computation uses a mshabal_context structure. That structure is
 11 |  * supposed to be allocated and released by the caller, e.g. as a
 12 |  * local or global variable, or on the heap. The structure contents
 13 |  * are initialized with mshabal_init(). Once the structure has been
 14 |  * initialized, data is input as chunks, with the mshabal() functions.
 15 |  * Chunks for the four parallel instances are provided simultaneously
 16 |  * and must have the same length. It is allowed not to use some of the
 17 |  * instances; the corresponding parameters in mshabal() are then NULL.
 18 |  * However, using NULL as a chunk for one of the instances effectively
 19 |  * deactivates that instance; this cannot be used to "skip" a chunk
 20 |  * for one instance.
 21 |  *
 22 |  * The computation is finalized with mshabal_close(). Some extra message
 23 |  * bits (0 to 7) can be input. The outputs of the four parallel instances
 24 |  * are written in the provided buffers. There again, NULL can be
 25 |  * provided as parameter if the output of one of the instances is not
 26 |  * needed.
 27 |  *
 28 |  * A mshabal_context instance is self-contained and holds no pointer.
 29 |  * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as
 30 |  * proper alignment is maintained). This implementation uses no state
 31 |  * variable beyond the context instance; thus, it is thread-safe and
 32 |  * reentrant.
 33 |  *
 34 |  * The Shabal specification defines Shabal with output sizes of 192,
 35 |  * 224, 256, 384 and 512 bits. This code accepts all those sizes, as
 36 |  * well as any output size which is multiple of 32, between 32 and
 37 |  * 512 (inclusive).
 38 |  *
 39 |  * Parameters are not validated. Thus, undefined behaviour occurs if
 40 |  * any of the "shall" or "must" clauses in this documentation is
 41 |  * violated.
 42 |  *
 43 |  *
 44 |  * (c) 2010 SAPHIR project. This software is provided 'as-is', without
 45 |  * any epxress or implied warranty. In no event will the authors be held
 46 |  * liable for any damages arising from the use of this software.
 47 |  *
 48 |  * Permission is granted to anyone to use this software for any purpose,
 49 |  * including commercial applications, and to alter it and redistribute it
 50 |  * freely, subject to no restriction.
 51 |  *
 52 |  * Technical remarks and questions can be addressed to:
 53 |  * <thomas.pornin@cryptolog.com>
 54 |  */
 55 | 
 56 | #ifndef MSHABAL_H__
 57 | #define MSHABAL_H__
 58 | 
 59 | #include <limits.h>
 60 | 
 61 | #ifdef __cplusplus
 62 | extern "C" {
 63 | #endif
 64 | 
 65 | /*
 66 |  * We need an integer type with width 32-bit or more (preferably, with
 67 |  * a width of exactly 32 bits).
 68 |  */
 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L
 70 | #include <stdint.h>
 71 | #ifdef UINT32_MAX
 72 | typedef uint32_t mshabal_u32;
 73 | #else
 74 | typedef uint_fast32_t mshabal_u32;
 75 | #endif
 76 | #else
 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF
 78 | typedef unsigned int mshabal_u32;
 79 | #else
 80 | typedef unsigned long mshabal_u32;
 81 | #endif
 82 | #endif
 83 | 
 84 | #define MSHABAL128_VECTOR_SIZE 4
 85 | 
 86 | /*
 87 |  * The context structure for a Shabal computation. Contents are
 88 |  * private. Such a structure should be allocated and released by
 89 |  * the caller, in any memory area.
 90 |  */
 91 | typedef struct {
 92 |     unsigned char buf0[64];
 93 |     unsigned char buf1[64];
 94 |     unsigned char buf2[64];
 95 |     unsigned char buf3[64];
 96 |     size_t ptr;
 97 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE];
 98 |     mshabal_u32 Whigh, Wlow;
 99 |     unsigned out_size;
100 | } mshabal128_context;
101 | 
102 | #pragma pack(1) /* 1-byte packing for the struct below: no padding between members */
103 | typedef struct { /* reduced context taken by the *_fast_avx routines: same state, Whigh/Wlow and out_size members as mshabal128_context, without buf0..buf3 and ptr */
104 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE];
105 |     mshabal_u32 Whigh, Wlow;
106 |     unsigned out_size;
107 | } mshabal128_context_fast;
108 | #pragma pack() /* restore the default packing */
109 | 
110 | /*
111 |  * Initialize a context structure. The output size must be a multiple
112 |  * of 32, between 32 and 512 (inclusive). The output size is expressed
113 |  * in bits.
114 |  */
115 | void mshabal_init_avx(mshabal128_context *sc, unsigned out_size);
116 | 
117 | /*
118 |  * Process some more data bytes; four chunks of data, pointed to by
119 |  * data0, data1, data2 and data3, are processed. The four chunks have
120 |  * the same length of "len" bytes. For efficiency, it is best if data is
121 |  * processed by medium-sized chunks, e.g. a few kilobytes at a time.
122 |  *
123 |  * The "len" data bytes shall all be accessible. If "len" is zero, this
124 |  * function does nothing and ignores the data* arguments.
125 |  * Otherwise, if one of the data* argument is NULL, then the
126 |  * corresponding instance is deactivated (the final value obtained from
127 |  * that instance is undefined).
128 |  */
129 | void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2,
130 |                 const void *data3, size_t len);
131 | 
132 | /*
133 |  * Terminate the Shabal computation incarnated by the provided context
134 |  * structure. "n" shall be a value between 0 and 7 (inclusive): this is
135 |  * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and
136 |  * append at the end of the input message for each of the four parallel
137 |  * instances. Bits in "ub*" are taken in big-endian format: first bit is
138 |  * the one of numerical value 128, second bit has numerical value 64,
139 |  * and so on. Other bits in "ub*" are ignored. For most applications,
140 |  * input messages will consist in sequence of bytes, and the "ub*" and
141 |  * "n" parameters will be zero.
142 |  *
143 |  * The Shabal output for each of the parallel instances is written out
144 |  * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3.
145 |  * These areas shall be wide enough to accommodate the result (result
146 |  * size was specified as parameter to mshabal_init()). It is acceptable
147 |  * to use NULL for any of those pointers, if the result from the
148 |  * corresponding instance is not needed.
149 |  *
150 |  * After this call, the context structure is invalid. The caller shall
151 |  * release it, or reinitialize it with mshabal_init(). The mshabal_close()
152 |  * function does NOT imply a hidden call to mshabal_init().
153 |  */
154 | void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2,
155 |                       unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, 
156 |                       void *dst3);
157 | 
158 | /*
159 |  * optimised Shabal routine for PoC mining
160 |  */
161 | void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0,
162 |                                     void *dst1, void *dst2, void *dst3);
163 | 
164 | /*
165 |  * optimised Shabal routine for PoC plotting and hashing
166 |  */
167 | void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination,
168 |                                void *dst, unsigned num);
169 | 
170 | #ifdef __cplusplus
171 | }
172 | #endif
173 | 
174 | #endif
175 | 


--------------------------------------------------------------------------------
/src/c/mshabal_128_sse2.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * A parallel implementation of Shabal, for platforms with SSE2.
  3 |  *
  4 |  * This is the header file for an implementation of the Shabal family
  5 |  * of hash functions, designed for maximum parallel speed. It processes
  6 |  * up to four instances of Shabal in parallel, using the SSE2 unit.
  7 |  * Total bandwidth appears to be up to twice that of a plain 32-bit
  8 |  * Shabal implementation.
  9 |  *
 10 |  * A computation uses a mshabal_context structure. That structure is
 11 |  * supposed to be allocated and released by the caller, e.g. as a
 12 |  * local or global variable, or on the heap. The structure contents
 13 |  * are initialized with mshabal_init(). Once the structure has been
 14 |  * initialized, data is input as chunks, with the mshabal() functions.
 15 |  * Chunks for the four parallel instances are provided simultaneously
 16 |  * and must have the same length. It is allowed not to use some of the
 17 |  * instances; the corresponding parameters in mshabal() are then NULL.
 18 |  * However, using NULL as a chunk for one of the instances effectively
 19 |  * deactivates that instance; this cannot be used to "skip" a chunk
 20 |  * for one instance.
 21 |  *
 22 |  * The computation is finalized with mshabal_close(). Some extra message
 23 |  * bits (0 to 7) can be input. The outputs of the four parallel instances
 24 |  * are written in the provided buffers. There again, NULL can be
 25 |  * provided as parameter if the output of one of the instances is not
 26 |  * needed.
 27 |  *
 28 |  * A mshabal_context instance is self-contained and holds no pointer.
 29 |  * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as
 30 |  * proper alignment is maintained). This implementation uses no state
 31 |  * variable beyond the context instance; thus, it is thread-safe and
 32 |  * reentrant.
 33 |  *
 34 |  * The Shabal specification defines Shabal with output sizes of 192,
 35 |  * 224, 256, 384 and 512 bits. This code accepts all those sizes, as
 36 |  * well as any output size which is multiple of 32, between 32 and
 37 |  * 512 (inclusive).
 38 |  *
 39 |  * Parameters are not validated. Thus, undefined behaviour occurs if
 40 |  * any of the "shall" or "must" clauses in this documentation is
 41 |  * violated.
 42 |  *
 43 |  *
 44 |  * (c) 2010 SAPHIR project. This software is provided 'as-is', without
 45 |  * any epxress or implied warranty. In no event will the authors be held
 46 |  * liable for any damages arising from the use of this software.
 47 |  *
 48 |  * Permission is granted to anyone to use this software for any purpose,
 49 |  * including commercial applications, and to alter it and redistribute it
 50 |  * freely, subject to no restriction.
 51 |  *
 52 |  * Technical remarks and questions can be addressed to:
 53 |  * <thomas.pornin@cryptolog.com>
 54 |  */
 55 | 
 56 | #ifndef MSHABAL_H__
 57 | #define MSHABAL_H__
 58 | 
 59 | #include <limits.h>
 60 | 
 61 | #ifdef __cplusplus
 62 | extern "C" {
 63 | #endif
 64 | 
 65 | /*
 66 |  * We need an integer type with width 32-bit or more (preferably, with
 67 |  * a width of exactly 32 bits).
 68 |  */
 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L
 70 | #include <stdint.h>
 71 | #ifdef UINT32_MAX
 72 | typedef uint32_t mshabal_u32;
 73 | #else
 74 | typedef uint_fast32_t mshabal_u32;
 75 | #endif
 76 | #else
 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF
 78 | typedef unsigned int mshabal_u32;
 79 | #else
 80 | typedef unsigned long mshabal_u32;
 81 | #endif
 82 | #endif
 83 | 
 84 | #define MSHABAL128_VECTOR_SIZE 4
 85 | 
 86 | /*
 87 |  * The context structure for a Shabal computation. Contents are
 88 |  * private. Such a structure should be allocated and released by
 89 |  * the caller, in any memory area.
 90 |  */
 91 | typedef struct {
 92 |     unsigned char buf0[64];
 93 |     unsigned char buf1[64];
 94 |     unsigned char buf2[64];
 95 |     unsigned char buf3[64];
 96 |     size_t ptr;
 97 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE];
 98 |     mshabal_u32 Whigh, Wlow;
 99 |     unsigned out_size;
100 | } mshabal128_context;
101 | 
102 | #pragma pack(1) /* 1-byte packing for the struct below: no padding between members */
103 | typedef struct { /* reduced context taken by the *_fast_sse2 routines: same state, Whigh/Wlow and out_size members as mshabal128_context, without buf0..buf3 and ptr */
104 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE];
105 |     mshabal_u32 Whigh, Wlow;
106 |     unsigned out_size;
107 | } mshabal128_context_fast;
108 | #pragma pack() /* restore the default packing */
109 | 
110 | /*
111 |  * Initialize a context structure. The output size must be a multiple
112 |  * of 32, between 32 and 512 (inclusive). The output size is expressed
113 |  * in bits.
114 |  */
115 | void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size);
116 | 
117 | /*
118 |  * Process some more data bytes; four chunks of data, pointed to by
119 |  * data0, data1, data2 and data3, are processed. The four chunks have
120 |  * the same length of "len" bytes. For efficiency, it is best if data is
121 |  * processed by medium-sized chunks, e.g. a few kilobytes at a time.
122 |  *
123 |  * The "len" data bytes shall all be accessible. If "len" is zero, this
124 |  * function does nothing and ignores the data* arguments.
125 |  * Otherwise, if one of the data* argument is NULL, then the
126 |  * corresponding instance is deactivated (the final value obtained from
127 |  * that instance is undefined).
128 |  */
129 | void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2,
130 |                 const void *data3, size_t len);
131 | 
132 | /*
133 |  * Terminate the Shabal computation incarnated by the provided context
134 |  * structure. "n" shall be a value between 0 and 7 (inclusive): this is
135 |  * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and
136 |  * append at the end of the input message for each of the four parallel
137 |  * instances. Bits in "ub*" are taken in big-endian format: first bit is
138 |  * the one of numerical value 128, second bit has numerical value 64,
139 |  * and so on. Other bits in "ub*" are ignored. For most applications,
140 |  * input messages will consist in sequence of bytes, and the "ub*" and
141 |  * "n" parameters will be zero.
142 |  *
143 |  * The Shabal output for each of the parallel instances is written out
144 |  * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3.
145 |  * These areas shall be wide enough to accommodate the result (result
146 |  * size was specified as parameter to mshabal_init()). It is acceptable
147 |  * to use NULL for any of those pointers, if the result from the
148 |  * corresponding instance is not needed.
149 |  *
150 |  * After this call, the context structure is invalid. The caller shall
151 |  * release it, or reinitialize it with mshabal_init(). The mshabal_close()
152 |  * function does NOT imply a hidden call to mshabal_init().
153 |  */
154 | void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2,
155 |                       unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2,
156 |                       void *dst3);
157 | 
158 | /*
159 |  * optimised Shabal routine for PoC plotting and hashing
160 |  */
161 | void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination,
162 |                                void *dst, unsigned num);
163 | 
164 | /*
165 |  * optimised Shabal routine for PoC mining
166 |  */
167 | void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0,
168 |                                     void *dst1, void *dst2, void *dst3);
169 | 
170 | #ifdef __cplusplus
171 | }
172 | #endif
173 | 
174 | #endif
175 | 


--------------------------------------------------------------------------------
/src/c/mshabal_256_avx2.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * A parallel implementation of Shabal, for platforms with AVX2.
  3 |  *
  4 |  * This is the header file for an implementation of the Shabal family
  5 |  * of hash functions, designed for maximum parallel speed. It processes
  6 |  * up to eight instances of Shabal in parallel, using the AVX2 unit.
  7 |  * Total bandwidth appears to be up to twice that of a plain 32-bit
  8 |  * Shabal implementation.
  9 |  *
 10 |  * A computation uses a mshabal_context structure. That structure is
 11 |  * supposed to be allocated and released by the caller, e.g. as a
 12 |  * local or global variable, or on the heap. The structure contents
 13 |  * are initialized with mshabal_init(). Once the structure has been
 14 |  * initialized, data is input as chunks, with the mshabal() functions.
 15 |  * Chunks for the eight parallel instances are provided simultaneously
 16 |  * and must have the same length. It is allowed not to use some of the
 17 |  * instances; the corresponding parameters in mshabal() are then NULL.
 18 |  * However, using NULL as a chunk for one of the instances effectively
 19 |  * deactivates that instance; this cannot be used to "skip" a chunk
 20 |  * for one instance.
 21 |  *
 22 |  * The computation is finalized with mshabal_close(). Some extra message
 23 |  * bits (0 to 7) can be input. The outputs of the eight parallel instances
 24 |  * are written in the provided buffers. There again, NULL can be
 25 |  * provided as parameter if the output of one of the instances is not
 26 |  * needed.
 27 |  *
 28 |  * A mshabal_context instance is self-contained and holds no pointer.
 29 |  * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as
 30 |  * proper alignment is maintained). This implementation uses no state
 31 |  * variable beyond the context instance; thus, it is thread-safe and
 32 |  * reentrant.
 33 |  *
 34 |  * The Shabal specification defines Shabal with output sizes of 192,
 35 |  * 224, 256, 384 and 512 bits. This code accepts all those sizes, as
 36 |  * well as any output size which is multiple of 32, between 32 and
 37 |  * 512 (inclusive).
 38 |  *
 39 |  * Parameters are not validated. Thus, undefined behaviour occurs if
 40 |  * any of the "shall" or "must" clauses in this documentation is
 41 |  * violated.
 42 |  *
 43 |  *
 44 |  * (c) 2010 SAPHIR project. This software is provided 'as-is', without
 45 |  * any epxress or implied warranty. In no event will the authors be held
 46 |  * liable for any damages arising from the use of this software.
 47 |  *
 48 |  * Permission is granted to anyone to use this software for any purpose,
 49 |  * including commercial applications, and to alter it and redistribute it
 50 |  * freely, subject to no restriction.
 51 |  *
 52 |  * Technical remarks and questions can be addressed to:
 53 |  * <thomas.pornin@cryptolog.com>
 54 |  */
 55 | 
 56 | #ifndef MSHABAL_H__
 57 | #define MSHABAL_H__
 58 | 
 59 | #include <limits.h>
 60 | 
 61 | #ifdef __cplusplus
 62 | extern "C" {
 63 | #endif
 64 | 
 65 | /*
 66 |  * We need an integer type with width 32-bit or more (preferably, with
 67 |  * a width of exactly 32 bits).
 68 |  */
 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L
 70 | #include <stdint.h>
 71 | #ifdef UINT32_MAX
 72 | typedef uint32_t mshabal_u32;
 73 | #else
 74 | typedef uint_fast32_t mshabal_u32;
 75 | #endif
 76 | #else
 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF
 78 | typedef unsigned int mshabal_u32;
 79 | #else
 80 | typedef unsigned long mshabal_u32;
 81 | #endif
 82 | #endif
 83 | 
 84 | #define MSHABAL256_VECTOR_SIZE 8
 85 | 
 86 | /*
 87 |  * The context structure for a Shabal computation. Contents are
 88 |  * private. Such a structure should be allocated and released by
 89 |  * the caller, in any memory area.
 90 |  */
 91 | typedef struct {
 92 |     unsigned char buf0[64];
 93 |     unsigned char buf1[64];
 94 |     unsigned char buf2[64];
 95 |     unsigned char buf3[64];
 96 |     unsigned char buf4[64];
 97 |     unsigned char buf5[64];
 98 |     unsigned char buf6[64];
 99 |     unsigned char buf7[64];
100 |     size_t ptr;
101 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE];
102 |     mshabal_u32 Whigh, Wlow;
103 |     unsigned out_size;
104 | } mshabal256_context;
105 | 
106 | #pragma pack(1) /* 1-byte packing for the struct below: no padding between members */
107 | typedef struct { /* reduced context taken by the *_fast_avx2 routines: same state, Whigh/Wlow and out_size members as mshabal256_context, without buf0..buf7 and ptr */
108 |     mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE];
109 |     mshabal_u32 Whigh, Wlow;
110 |     unsigned out_size;
111 | } mshabal256_context_fast;
112 | #pragma pack() /* restore the default packing */
113 | 
114 | /*
115 |  * Initialize a context structure. The output size must be a multiple
116 |  * of 32, between 32 and 512 (inclusive). The output size is expressed
117 |  * in bits.
118 |  */
119 | void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size);
120 | 
121 | /*
122 |  * Process some more data bytes; eight chunks of data, pointed to by
123 |  * data0 through data7, are processed. The eight chunks have
124 |  * the same length of "len" bytes. For efficiency, it is best if data is
125 |  * processed by medium-sized chunks, e.g. a few kilobytes at a time.
126 |  *
127 |  * The "len" data bytes shall all be accessible. If "len" is zero, this
128 |  * function does nothing and ignores the data* arguments.
129 |  * Otherwise, if one of the data* argument is NULL, then the
130 |  * corresponding instance is deactivated (the final value obtained from
131 |  * that instance is undefined).
132 |  */
133 | void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, 
134 |                 const void *data4, const void *data5, const void *data6, const void *data7, size_t len);
135 | 
136 | /*
137 |  * Terminate the Shabal computation incarnated by the provided context
138 |  * structure. "n" shall be a value between 0 and 7 (inclusive): this is
139 |  * the number of extra bits to extract from ub0 through ub7, and
140 |  * append at the end of the input message for each of the eight parallel
141 |  * instances. Bits in "ub*" are taken in big-endian format: first bit is
142 |  * the one of numerical value 128, second bit has numerical value 64,
143 |  * and so on. Other bits in "ub*" are ignored. For most applications,
144 |  * input messages will consist in sequence of bytes, and the "ub*" and
145 |  * "n" parameters will be zero.
146 |  *
147 |  * The Shabal output for each of the parallel instances is written out
148 |  * in the areas pointed to by, respectively, dst0 through dst7.
149 |  * These areas shall be wide enough to accommodate the result (result
150 |  * size was specified as parameter to mshabal_init()). It is acceptable
151 |  * to use NULL for any of those pointers, if the result from the
152 |  * corresponding instance is not needed.
153 |  *
154 |  * After this call, the context structure is invalid. The caller shall
155 |  * release it, or reinitialize it with mshabal_init(). The mshabal_close()
156 |  * function does NOT imply a hidden call to mshabal_init().
157 |  */
158 | void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2,
159 |                       unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7,
160 |                       unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4,
161 |                       void *dst5, void *dst6, void *dst7);
162 | 
163 | /*
164 |  * optimised Shabal routine for PoC plotting and hashing
165 |  */
166 | void mshabal_hash_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination,
167 |                                void *dst, unsigned len);
168 | 
169 | /*
170 |  * optimised Shabal routine for PoC mining
171 |  */
172 | void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0,
173 |                                     void *dst1, void *dst2, void *dst3, void *dst4, void *dst5,
174 |                                     void *dst6, void *dst7);
175 | #ifdef __cplusplus
176 | }
177 | #endif
178 | 
179 | #endif
180 | 


--------------------------------------------------------------------------------
/src/cpu_hasher.rs:
--------------------------------------------------------------------------------
  1 | use crate::poc_hashing::noncegen_rust;
  2 | use libc::{c_void, size_t};
  3 | use std::slice::from_raw_parts_mut;
  4 | use std::sync::mpsc::Sender;
  5 | 
  6 | const NUM_SCOOPS: usize = 4096;
  7 | const SCOOP_SIZE: usize = 64;
  8 | const NONCE_SIZE: usize = NUM_SCOOPS * SCOOP_SIZE;
  9 | 
 10 | extern "C" { // C SIMD back ends (sources under src/c): init routines plus one nonce generator per extension
 11 |     pub fn init_shabal_sse2() -> ();
 12 |     pub fn init_shabal_avx() -> ();
 13 |     pub fn init_shabal_avx2() -> ();
 14 |     pub fn init_shabal_avx512f() -> ();
 15 |     pub fn noncegen_sse2( // all four noncegen_* declarations share this parameter list
 16 |         cache: *mut c_void,
 17 |         cache_size: size_t, // NOTE(review): hash_cpu's Rust fallback treats this as a nonce count, not bytes; confirm the C side agrees
 18 |         chunk_offset: size_t,
 19 |         numeric_ID: u64,
 20 |         local_startnonce: u64,
 21 |         local_nonces: u64,
 22 |     );
 23 |     pub fn noncegen_avx(
 24 |         cache: *mut c_void,
 25 |         cache_size: size_t,
 26 |         chunk_offset: size_t,
 27 |         numeric_ID: u64,
 28 |         local_startnonce: u64,
 29 |         local_nonces: u64,
 30 |     );
 31 |     pub fn noncegen_avx2(
 32 |         cache: *mut c_void,
 33 |         cache_size: size_t,
 34 |         chunk_offset: size_t,
 35 |         numeric_ID: u64,
 36 |         local_startnonce: u64,
 37 |         local_nonces: u64,
 38 |     );
 39 |     pub fn noncegen_avx512( // called for SimdExtension::AVX512f, after init_shabal_avx512f
 40 |         cache: *mut c_void,
 41 |         cache_size: size_t,
 42 |         chunk_offset: size_t,
 43 |         numeric_ID: u64,
 44 |         local_startnonce: u64,
 45 |         local_nonces: u64,
 46 |     );
 47 | }
 48 | pub struct SafePointer { // wrapper around a raw byte pointer, marked Send + Sync below
 49 |     pub ptr: *mut u8,
 50 | }
 51 | unsafe impl Send for SafePointer {} // unchecked promise: raw pointers are not Send by default
 52 | unsafe impl Sync for SafePointer {} // unchecked promise: users must avoid conflicting access themselves
 53 | 
 54 | pub struct CpuTask { // arguments for one hash_cpu job; the fields are forwarded to noncegen_* / noncegen_rust
 55 |     pub cache: SafePointer, // raw pointer to the buffer the generators work on
 56 |     pub cache_size: usize, // hash_cpu maps cache_size * NONCE_SIZE bytes for the Rust fallback, i.e. a size in nonces
 57 |     pub chunk_offset: usize, // passed through unchanged; presumably an offset inside cache (TODO confirm unit)
 58 |     pub numeric_id: u64,
 59 |     pub local_startnonce: u64, // presumably the first nonce of this job (TODO confirm)
 60 |     pub local_nonces: u64, // presumably the number of nonces to generate (TODO confirm)
 61 | }
 62 | 
 63 | #[derive(Debug, Clone)]
 64 | pub enum SimdExtension { // hashing back end chosen by init_simd and dispatched on in hash_cpu
 65 |     AVX512f, // hash_cpu calls noncegen_avx512
 66 |     AVX2, // hash_cpu calls noncegen_avx2
 67 |     AVX, // hash_cpu calls noncegen_avx
 68 |     SSE2, // hash_cpu calls noncegen_sse2
 69 |     None, // no SIMD detected: hash_cpu falls back to noncegen_rust
 70 | }
 71 | 
 72 | /// Picks the first SIMD extension the running CPU reports and calls the
 73 | /// matching C-side `init_shabal_*` routine before returning it.
 74 | ///
 75 | /// Probe order: AVX512F, AVX2, AVX, SSE2. When none is detected, no init
 76 | /// routine is called and `SimdExtension::None` is returned.
 77 | pub fn init_simd() -> SimdExtension {
 78 |     if is_x86_feature_detected!("avx512f") {
 79 |         unsafe { init_shabal_avx512f() };
 80 |         return SimdExtension::AVX512f;
 81 |     }
 82 |     if is_x86_feature_detected!("avx2") {
 83 |         unsafe { init_shabal_avx2() };
 84 |         return SimdExtension::AVX2;
 85 |     }
 86 |     if is_x86_feature_detected!("avx") {
 87 |         unsafe { init_shabal_avx() };
 88 |         return SimdExtension::AVX;
 89 |     }
 90 |     if is_x86_feature_detected!("sse2") {
 91 |         unsafe { init_shabal_sse2() };
 92 |         return SimdExtension::SSE2;
 93 |     }
 94 |     // none of the probed extensions is available
 95 |     SimdExtension::None
 96 | }
 97 | 
 98 | pub fn hash_cpu(
 99 |     tx: Sender<(u8, u8, u64)>,
100 |     hasher_task: CpuTask,
101 |     simd_ext: SimdExtension,
102 | ) -> impl FnOnce() {
103 |     move || {
104 |         unsafe {
105 |             match simd_ext {
106 |                 SimdExtension::AVX512f => noncegen_avx512(
107 |                     hasher_task.cache.ptr as *mut c_void,
108 |                     hasher_task.cache_size,
109 |                     hasher_task.chunk_offset,
110 |                     hasher_task.numeric_id,
111 |                     hasher_task.local_startnonce,
112 |                     hasher_task.local_nonces,
113 |                 ),
114 |                 SimdExtension::AVX2 => noncegen_avx2(
115 |                     hasher_task.cache.ptr as *mut c_void,
116 |                     hasher_task.cache_size,
117 |                     hasher_task.chunk_offset,
118 |                     hasher_task.numeric_id,
119 |                     hasher_task.local_startnonce,
120 |                     hasher_task.local_nonces,
121 |                 ),
122 |                 SimdExtension::AVX => noncegen_avx(
123 |                     hasher_task.cache.ptr as *mut c_void,
124 |                     hasher_task.cache_size,
125 |                     hasher_task.chunk_offset,
126 |                     hasher_task.numeric_id,
127 |                     hasher_task.local_startnonce,
128 |                     hasher_task.local_nonces,
129 |                 ),
130 |                 SimdExtension::SSE2 => noncegen_sse2(
131 |                     hasher_task.cache.ptr as *mut c_void,
132 |                     hasher_task.cache_size,
133 |                     hasher_task.chunk_offset,
134 |                     hasher_task.numeric_id,
135 |                     hasher_task.local_startnonce,
136 |                     hasher_task.local_nonces,
137 |                 ),
138 |                 _ => {
139 |                     let data = from_raw_parts_mut(
140 |                         hasher_task.cache.ptr,
141 |                         hasher_task.cache_size * NONCE_SIZE,
142 |                     );
143 |                     noncegen_rust(
144 |                         data,
145 |                         hasher_task.chunk_offset,
146 |                         hasher_task.numeric_id,
147 |                         hasher_task.local_startnonce,
148 |                         hasher_task.local_nonces,
149 |                     )
150 |                 }
151 |             }
152 |         }
153 |         // report hashing done
154 |         tx.send((0u8, 1u8, 0))
155 |             .expect("CPU task can't communicate with scheduler thread.");
156 |         // report data in hostmem
157 |         tx.send((0u8, 0u8, hasher_task.local_nonces))
158 |             .expect("CPU task can't communicate with scheduler thread.");
159 |     }
160 | }
161 | 
#[cfg(test)]
mod test {
    extern crate crypto;
    use self::crypto::digest::Digest;
    use self::crypto::sha2::Sha256;
    use super::*;
    use crate::plotter;

    /// Generates the same 32 nonces with every nonce generator the CPU
    /// supports (plus the pure-Rust one) and checks each result buffer against
    /// a known SHA-256 digest.
    #[test]
    fn test_noncegen() {
        let numeric_id = 7900104405094198526;
        let start_nonce = 1337;
        let exp_result_hash = "eebdf7dce694cbea9539f71efc362d4b72f8792def335d7157dadb09bb6d9e5f";

        let check_result = |buf: &Vec<u8>| {
            let mut hasher = Sha256::new();
            hasher.input(buf);
            assert_eq!(hasher.result_str(), exp_result_hash);
        };

        // One SIMD case: skipped when the CPU lacks the feature, otherwise the
        // native implementation is initialised, run and verified.
        macro_rules! check_simd {
            ($feature:tt, $init:ident, $noncegen:ident) => {
                if is_x86_feature_detected!($feature) {
                    let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize];
                    unsafe {
                        $init();
                        $noncegen(
                            buf.as_mut_ptr() as *mut c_void,
                            32,
                            0,
                            numeric_id,
                            start_nonce,
                            32,
                        );
                    }
                    check_result(&buf);
                }
            };
        }

        check_simd!("avx512f", init_shabal_avx512f, noncegen_avx512);
        check_simd!("avx2", init_shabal_avx2, noncegen_avx2);
        check_simd!("avx", init_shabal_avx, noncegen_avx);
        check_simd!("sse2", init_shabal_sse2, noncegen_sse2);

        // the pure-Rust generator is always exercised
        let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize];
        noncegen_rust(&mut buf, 0, numeric_id, start_nonce, 32);
        check_result(&buf);
    }
}
251 | 


--------------------------------------------------------------------------------
/src/c/mshabal_512_avx512f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * A parallel implementation of Shabal, for platforms with AVX512F.
  3 |  *
  4 |  * This is the header file for an implementation of the Shabal family
  5 |  * of hash functions, designed for maximum parallel speed. It processes
  6 |  * up to sixteen instances of Shabal in parallel, using the AVX512F unit.
  7 |  * Total bandwidth appears to be up to twice that of a plain 32-bit
  8 |  * Shabal implementation.
  9 |  *
 10 |  * A computation uses a mshabal_context structure. That structure is
 11 |  * supposed to be allocated and released by the caller, e.g. as a
 12 |  * local or global variable, or on the heap. The structure contents
 13 |  * are initialized with mshabal_init(). Once the structure has been
 14 |  * initialized, data is input as chunks, with the mshabal() functions.
 15 |  * Chunks for the sixteen parallel instances are provided simultaneously
 16 |  * and must have the same length. It is allowed not to use some of the
 17 |  * instances; the corresponding parameters in mshabal() are then NULL.
 18 |  * However, using NULL as a chunk for one of the instances effectively
 19 |  * deactivates that instance; this cannot be used to "skip" a chunk
 20 |  * for one instance.
 21 |  *
 22 |  * The computation is finalized with mshabal_close(). Some extra message
 23 |  * bits (0 to 7) can be input. The outputs of the sixteen parallel instances
 24 |  * are written in the provided buffers. There again, NULL can be
 25 |  * provided as parameter if the output of one of the instances is not
 26 |  * needed.
 27 |  *
 28 |  * A mshabal_context instance is self-contained and holds no pointer.
 29 |  * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as
 30 |  * proper alignment is maintained). This implementation uses no state
 31 |  * variable beyond the context instance; thus, it is thread-safe and
 32 |  * reentrant.
 33 |  *
 34 |  * The Shabal specification defines Shabal with output sizes of 192,
 35 |  * 224, 256, 384 and 512 bits. This code accepts all those sizes, as
 36 |  * well as any output size which is multiple of 32, between 32 and
 37 |  * 512 (inclusive).
 38 |  *
 39 |  * Parameters are not validated. Thus, undefined behaviour occurs if
 40 |  * any of the "shall" or "must" clauses in this documentation is
 41 |  * violated.
 42 |  *
 43 |  *
 44 |  * (c) 2010 SAPHIR project. This software is provided 'as-is', without
 45 |  * any express or implied warranty. In no event will the authors be held
 46 |  * liable for any damages arising from the use of this software.
 47 |  *
 48 |  * Permission is granted to anyone to use this software for any purpose,
 49 |  * including commercial applications, and to alter it and redistribute it
 50 |  * freely, subject to no restriction.
 51 |  *
 52 |  * Technical remarks and questions can be addressed to:
 53 |  * <thomas.pornin@cryptolog.com>
 54 |  */
 55 | 
 56 | #ifndef MSHABAL_H__
 57 | #define MSHABAL_H__
 58 | 
#include <limits.h>
#include <stddef.h>
 60 | 
 61 | #ifdef __cplusplus
 62 | extern "C" {
 63 | #endif
 64 | 
/*
 * We need an integer type with width 32-bit or more (preferably, with
 * a width of exactly 32 bits).
 */
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/* C99 or later: take the type from <stdint.h>. */
#include <stdint.h>
#ifdef UINT32_MAX
/* An exact-width 32-bit type is available. */
typedef uint32_t mshabal_u32;
#else
/* No exact-width type: use the fastest type with at least 32 bits. */
typedef uint_fast32_t mshabal_u32;
#endif
#else
/*
 * No C99 <stdint.h> assumed: use unsigned int when UINT_MAX shows it has at
 * least 32 bits, otherwise unsigned long.
 */
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int mshabal_u32;
#else
typedef unsigned long mshabal_u32;
#endif
#endif
 83 | 
/*
 * Number of Shabal instances processed in parallel by this implementation
 * (one buf*, data* and dst* per instance).
 */
#define MSHABAL512_VECTOR_SIZE 16

/*
 * The context structure for a Shabal computation. Contents are
 * private. Such a structure should be allocated and released by
 * the caller, in any memory area.
 */
typedef struct {
    /* One 64-byte buffer per parallel instance (presumably holding input that
     * does not yet fill a complete block - confirm in the implementation). */
    unsigned char buf0[64];
    unsigned char buf1[64];
    unsigned char buf2[64];
    unsigned char buf3[64];
    unsigned char buf4[64];
    unsigned char buf5[64];
    unsigned char buf6[64];
    unsigned char buf7[64];
    unsigned char buf8[64];
    unsigned char buf9[64];
    unsigned char buf10[64];
    unsigned char buf11[64];
    unsigned char buf12[64];
    unsigned char buf13[64];
    unsigned char buf14[64];
    unsigned char buf15[64];
    /* Presumably the number of bytes currently held in each buf* - confirm. */
    size_t ptr;
    /* 12 + 16 + 16 state words for each of the parallel instances. */
    mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE];
    /* Presumably the high and low halves of the block counter - confirm. */
    mshabal_u32 Whigh, Wlow;
    /* Output size in bits, as passed to mshabal_init_avx512f(). */
    unsigned out_size;
} mshabal512_context;
113 | 
/*
 * Reduced context taken by the *_fast routines: it carries the same state,
 * counter and output-size members as mshabal512_context but none of the
 * per-instance input buffers. The structure is declared with 1-byte packing;
 * the trailing "#pragma pack()" restores the default packing.
 */
#pragma pack(1)
typedef struct {
    mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE];
    mshabal_u32 Whigh, Wlow;
    unsigned out_size;
} mshabal512_context_fast;
#pragma pack()
121 | 
/*
 * Initialize a context structure. The output size must be a multiple
 * of 32, between 32 and 512 (inclusive). The output size is expressed
 * in bits.
 *
 * sc:       context to initialize (allocated by the caller)
 * out_size: output size in bits
 */
void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size);
128 | 
/*
 * Process some more data bytes; sixteen chunks of data, pointed to by
 * data0 through data15, are processed. The sixteen chunks have
 * the same length of "len" bytes. For efficiency, it is best if data is
 * processed by medium-sized chunks, e.g. a few kilobytes at a time.
 *
 * The "len" data bytes shall all be accessible. If "len" is zero,
 * this function does nothing and ignores the data* arguments.
 * Otherwise, if one of the data* argument is NULL, then the
 * corresponding instance is deactivated (the final value obtained from
 * that instance is undefined).
 */
void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, const void *data3,
                const void *data4, const void *data5, const void *data6, const void *data7, const void *data8, const void *data9,
                const void *data10, const void *data11, const void *data12, const void *data13, const void *data14,
                const void *data15, size_t len);
145 | 
/*
 * Terminate the Shabal computation incarnated by the provided context
 * structure. "n" shall be a value between 0 and 7 (inclusive): this is
 * the number of extra bits to extract from ub0 through ub15, and
 * append at the end of the input message for each of the sixteen parallel
 * instances. Bits in "ub*" are taken in big-endian format: first bit is
 * the one of numerical value 128, second bit has numerical value 64,
 * and so on. Other bits in "ub*" are ignored. For most applications,
 * input messages will consist in sequence of bytes, and the "ub*" and
 * "n" parameters will be zero.
 *
 * The Shabal output for each of the parallel instances is written out
 * in the areas pointed to by, respectively, dst0 through dst15.
 * These areas shall be wide enough to accommodate the result (result
 * size was specified as parameter to mshabal_init_avx512f()). It is
 * acceptable to use NULL for any of those pointers, if the result from
 * the corresponding instance is not needed.
 *
 * After this call, the context structure is invalid. The caller shall
 * release it, or reinitialize it with mshabal_init_avx512f(). The
 * mshabal_close_avx512f() function does NOT imply a hidden call to
 * mshabal_init_avx512f().
 */
void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2,
                      unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, 
                      unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12,
                      unsigned ub13, unsigned ub14, unsigned ub15, unsigned n, void *dst0,
                      void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6,
                      void *dst7, void *dst8, void *dst9, void *dst10, void *dst11,
                      void *dst12, void *dst13, void *dst14, void *dst15);
175 | 
/*
 * optimised Shabal routine for PoC plotting and hashing
 *
 * sc:          reduced context holding the Shabal state
 * message:     input data
 * termination: pre-built final message block
 * dst:         receives the resulting hashes
 * len:         NOTE(review): the AVX nonce generator passes a byte count
 *              shifted right by 6 to its sibling routine, i.e. a number of
 *              64-byte blocks - presumably the same unit applies here; confirm.
 */
void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination,
                               void *dst, unsigned len);
181 | 
/*
 * optimised Shabal routine for PoC mining
 *
 * sc:            reduced context holding the Shabal state
 * message:       input data
 * termination:   pre-built final message block
 * dst0 - dst15:  one output area per parallel instance
 */
void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0,
                                    void *dst1, void *dst2, void *dst3, void *dst4, void *dst5,
                                    void *dst6, void *dst7, void *dst8, void *dst9, void *dst10,
                                    void *dst11, void *dst12, void *dst13, void *dst14,
                                    void *dst15);
190 |                                     
191 | #ifdef __cplusplus
192 | }
193 | #endif
194 | 
195 | #endif
196 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
  1 | #[macro_use]
  2 | extern crate clap;
  3 | #[macro_use]
  4 | extern crate cfg_if;
  5 | 
  6 | mod cpu_hasher;
  7 | #[cfg(feature = "opencl")]
  8 | mod gpu_hasher;
  9 | #[cfg(feature = "opencl")]
 10 | mod ocl;
 11 | mod plotter;
 12 | mod poc_hashing;
 13 | mod scheduler;
 14 | mod shabal256;
 15 | mod utils;
 16 | mod writer;
 17 | mod buffer;
 18 | 
 19 | use crate::plotter::{Plotter, PlotterTask};
 20 | use crate::utils::set_low_prio;
 21 | use clap::AppSettings::{ArgRequiredElseHelp, DeriveDisplayOrder, VersionlessSubcommands};
 22 | #[cfg(feature = "opencl")]
 23 | use clap::ArgGroup;
 24 | use clap::{App, Arg};
 25 | use std::cmp::min;
 26 | 
 27 | fn main() {
 28 |     let arg = App::new("Engraver")
 29 |         .version(crate_version!())
 30 |         .author(crate_authors!())
 31 |         .about(crate_description!())
 32 |         /*
 33 |         .setting(SubcommandRequiredElseHelp)
 34 |         */
 35 |         .setting(ArgRequiredElseHelp)
 36 |         .setting(DeriveDisplayOrder)
 37 |         .setting(VersionlessSubcommands)
 38 |         .arg(
 39 |             Arg::with_name("disable direct i/o")
 40 |                 .short("d")
 41 |                 .long("ddio")
 42 |                 .help("Disables direct i/o")
 43 |                 .global(true),
 44 |         ).arg(
 45 |             Arg::with_name("disable async i/o")
 46 |                 .short("a")
 47 |                 .long("daio")
 48 |                 .help("Disables async writing (single RAM buffer mode)")
 49 |                 .global(true),
 50 |         ).arg(
 51 |             Arg::with_name("low priority")
 52 |                 .short("l")
 53 |                 .long("prio")
 54 |                 .help("Runs engraver with low priority")
 55 |                 .global(true),
 56 |         ).arg(
 57 |             Arg::with_name("non-verbosity")
 58 |                 .short("q")
 59 |                 .long("quiet")
 60 |                 .help("Runs engraver in non-verbose mode")
 61 |                 .global(true),
 62 |         ).arg(
 63 |             Arg::with_name("benchmark")
 64 |                 .short("b")
 65 |                 .long("bench")
 66 |                 .help("Runs engraver in xPU benchmark mode")
 67 |                 .global(true),
 68 |         )
 69 |         /*
 70 |         .subcommand(
 71 |             SubCommand::with_name("plot")
 72 |                 .about("Plots a PoC2 file for your account ID")
 73 |                 .setting(ArgRequiredElseHelp)
 74 |                 .setting(DeriveDisplayOrder)
 75 |                 */.arg(
 76 |                     Arg::with_name("numeric id")
 77 |                         .short("i")
 78 |                         .long("id")
 79 |                         .value_name("numeric_ID")
 80 |                         .help("your numeric Account ID")
 81 |                         .takes_value(true)
 82 |                         .required_unless("ocl-devices"),
 83 |                 ).arg(
 84 |                     Arg::with_name("start nonce")
 85 |                         .short("s")
 86 |                         .long("sn")
 87 |                         .value_name("start_nonce")
 88 |                         .help("where you want to start plotting")
 89 |                         .takes_value(true)
 90 |                         .required_unless("ocl-devices"),
 91 |                 ).arg(
 92 |                     Arg::with_name("nonces")
 93 |                         .short("n")
 94 |                         .long("n")
 95 |                         .value_name("nonces")
 96 |                         .help("how many nonces you want to plot")
 97 |                         .takes_value(true)
 98 |                         .required_unless("ocl-devices"),
 99 |                 ).arg(
100 |                     Arg::with_name("path")
101 |                         .short("p")
102 |                         .long("path")
103 |                         .value_name("path")
104 |                         .help("target path for plotfile (optional)")
105 |                         .takes_value(true)
106 |                         .required(false),
107 |                 ).arg(
108 |                     Arg::with_name("memory")
109 |                         .short("m")
110 |                         .long("mem")
111 |                         .value_name("memory")
112 |                         .help("maximum memory usage (optional)")
113 |                         .takes_value(true)
114 |                         .required(false),
115 |                 ).args(&[
116 |                     Arg::with_name("cpu")
117 |                         .short("c")
118 |                         .long("cpu")
119 |                         .value_name("threads")
120 |                         .help("maximum cpu threads you want to use (optional)")
121 |                         .required(false)
122 |                         .takes_value(true),
123 |                     #[cfg(feature = "opencl")]
124 |                     Arg::with_name("gpu")
125 |                         .short("g")
126 |                         .long("gpu")
127 |                         .value_name("platform_id:device_id:cores")
128 |                         .help("GPU(s) you want to use for plotting (optional)")
129 |                         .multiple(true)
130 |                         .takes_value(true),
131 |                 ]).groups(&[#[cfg(feature = "opencl")]
132 |                 ArgGroup::with_name("processing")
133 |                     .args(&["cpu", "gpu"])
134 |                     .multiple(true)])
135 |                     /*
136 |                     .arg(
137 |                     Arg::with_name("ssd buffer")
138 |                         .short("b")
139 |                         .long("ssd_cache")
140 |                         .value_name("ssd_cache")
141 |                         .help("*path to ssd cache for staging (optional)")
142 |                         .takes_value(true)
143 |                         .required(false),
144 |                         
145 |                 ),
146 |                 
147 |         ).subcommand(
148 |             SubCommand::with_name("encode")
149 |                 .about("*Individualizes a PoC3 reference file for your account ID")
150 |                 .display_order(2)
151 |                 .arg(
152 |                     Arg::with_name("numeric id")
153 |                         .short("i")
154 |                         .long("numeric_ID")
155 |                         .value_name("numeric ID")
156 |                         .help("numeric Account ID")
157 |                         .takes_value(true),
158 |                 ),
159 |         ).subcommand(
160 |             SubCommand::with_name("decode")
161 |                 .about("*Restores a PoC3 reference file from an individualized file")
162 |                 .display_order(3)
163 |                 .arg(
164 |                     Arg::with_name("numeric id")
165 |                         .short("i")
166 |                         .long("numeric_ID")
167 |                         .value_name("numeric ID")
168 |                         .help("numeric Account ID")
169 |                         .takes_value(true)
170 |                         .required(true),
171 |                 ),
172 |                 
173 |         )*/;
174 | 
175 |     #[cfg(feature = "opencl")]
176 |     let arg = arg
177 |         .arg(
178 |             Arg::with_name("ocl-devices")
179 |                 .short("o")
180 |                 .long("opencl")
181 |                 .help("Display OpenCL platforms and devices")
182 |                 .global(true),
183 |         )
184 |         .arg(
185 |             Arg::with_name("zero-copy")
186 |                 .short("z")
187 |                 .long("zcb")
188 |                 .help("Enables zero copy buffers for shared mem (integrated) gpus")
189 |                 .global(true),
190 |         );
191 |     let matches = &arg.get_matches();
192 | 
193 |     if matches.is_present("low priority") {
194 |         set_low_prio();
195 |     }
196 | 
197 |     if matches.is_present("ocl-devices") {
198 |         #[cfg(feature = "opencl")]
199 |         ocl::platform_info();
200 |         return;
201 |     }
202 | 
203 |     // plotting
204 |     /* subcommand
205 |     if let Some(matches) = matches.subcommand_matches("plot") {
206 |     */
207 |     let numeric_id = value_t!(matches, "numeric id", u64).unwrap_or_else(|e| e.exit());
208 |     let start_nonce = value_t!(matches, "start nonce", u64).unwrap_or_else(|e| e.exit());
209 |     let nonces = value_t!(matches, "nonces", u64).unwrap_or_else(|e| e.exit());
210 |     let output_path = value_t!(matches, "path", String).unwrap_or_else(|_| {
211 |         std::env::current_dir()
212 |             .unwrap()
213 |             .into_os_string()
214 |             .into_string()
215 |             .unwrap()
216 |     });
217 |     let mem = value_t!(matches, "memory", String).unwrap_or_else(|_| "0B".to_owned());
218 |     let cpu_threads = value_t!(matches, "cpu", u8).unwrap_or(0u8);
219 | 
220 |     let gpus = if matches.occurrences_of("gpu") > 0 {
221 |         let gpu = values_t!(matches, "gpu", String);
222 |         Some(gpu.unwrap())
223 |     } else {
224 |         None
225 |     };
226 | 
227 |     // work out number of cpu threads to use
228 |     let cores = sys_info::cpu_num().unwrap() as u8;
229 |     let cpu_threads = if cpu_threads == 0 {
230 |         cores
231 |     } else {
232 |         min(2 * cores, cpu_threads)
233 |     };
234 | 
235 |     // special case: dont use cpu if only a gpu is defined
236 |     #[cfg(feature = "opencl")]
237 |     let cpu_threads = if matches.occurrences_of("gpu") > 0 && matches.occurrences_of("cpu") == 0 {
238 |         0u8
239 |     } else {
240 |         cpu_threads
241 |     };
242 | 
243 |     let p = Plotter::new();
244 |     p.run(PlotterTask {
245 |         numeric_id,
246 |         start_nonce,
247 |         nonces,
248 |         output_path,
249 |         mem,
250 |         cpu_threads,
251 |         gpus,
252 |         direct_io: !matches.is_present("disable direct i/o"),
253 |         async_io: !matches.is_present("disable async i/o"),
254 |         quiet: matches.is_present("non-verbosity"),
255 |         benchmark: matches.is_present("benchmark"),
256 |         zcb: matches.is_present("zero-copy"),
257 |     });
258 | }
259 | 


--------------------------------------------------------------------------------
/src/c/noncegen_128_avx.c:
--------------------------------------------------------------------------------
  1 | #include "noncegen_128_avx.h"
  2 | #include <immintrin.h>
  3 | #include <string.h>
  4 | #include "common.h"
  5 | #include "mshabal_128_avx.h"
  6 | #include "sph_shabal.h"
  7 | 
  8 | sph_shabal_context global_32;
  9 | mshabal128_context global_128;
 10 | mshabal128_context_fast global_128_fast;
 11 | 
 12 | void init_shabal_avx() {
 13 |     sph_shabal256_init(&global_32);
 14 |     mshabal_init_avx(&global_128, 256);
 15 |     global_128_fast.out_size = global_128.out_size;
 16 |     for (int i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i];
 17 |     global_128_fast.Whigh = global_128.Whigh;
 18 |     global_128_fast.Wlow = global_128.Wlow;
 19 | }
 20 | 
 21 | // cache:			cache to save to
 22 | // local_num:		thread number
 23 | // numeric_id:		numeric account id
 24 | // loc_startnonce	nonce to start generation at
 25 | // local_nonces: 	number of nonces to generate
 26 | void noncegen_avx(char *cache, const size_t cache_size, const size_t chunk_offset,
 27 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 28 |                    const uint64_t local_nonces) {
 29 |     sph_shabal_context local_32;
 30 |     uint64_t nonce;
 31 |     size_t len;
 32 | 
 33 |     mshabal128_context_fast local_128_fast;
 34 |     uint64_t nonce1, nonce2, nonce3, nonce4;
 35 | 
 36 |     char seed[32];  // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero
 37 |     char term[32];  // 1bit 1, 255bit of zeros
 38 |     char zero[32];  // 256bit of zeros
 39 | 
 40 |     write_seed(seed, numeric_id);
 41 |     write_term(term);
 42 |     memset(&zero[0], 0, 32);
 43 | 
 44 |     //vars shared
 45 |     uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * NONCE_SIZE);
 46 |     uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * HASH_SIZE);
 47 | 
 48 |     // prepare smart SIMD aligned termination strings
 49 |     // creation could further be optimized, but not much in it as it only runs once per work package
 50 |     // creation could also be moved to plotter start
 51 |     union {
 52 |         mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE];
 53 |         __m128i data[16];
 54 |     } t1, t2, t3;
 55 | 
 56 |     for (int j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) {
 57 |         size_t o = j;
 58 |         // t1
 59 |         t1.words[j + 0] = *(mshabal_u32 *)(seed + o);
 60 |         t1.words[j + 1] = *(mshabal_u32 *)(seed + o);
 61 |         t1.words[j + 2] = *(mshabal_u32 *)(seed + o);
 62 |         t1.words[j + 3] = *(mshabal_u32 *)(seed + o);
 63 |         t1.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o);
 64 |         t1.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o);
 65 |         t1.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o);
 66 |         t1.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o);
 67 |         // t2
 68 |         // (first 256bit skipped, will later be filled with data)
 69 |         t2.words[j + 0 + 32] = *(mshabal_u32 *)(seed + o);
 70 |         t2.words[j + 1 + 32] = *(mshabal_u32 *)(seed + o);
 71 |         t2.words[j + 2 + 32] = *(mshabal_u32 *)(seed + o);
 72 |         t2.words[j + 3 + 32] = *(mshabal_u32 *)(seed + o);
 73 |         // t3
 74 |         t3.words[j + 0] = *(mshabal_u32 *)(term + o);
 75 |         t3.words[j + 1] = *(mshabal_u32 *)(term + o);
 76 |         t3.words[j + 2] = *(mshabal_u32 *)(term + o);
 77 |         t3.words[j + 3] = *(mshabal_u32 *)(term + o);
 78 |         t3.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o);
 79 |         t3.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o);
 80 |         t3.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o);
 81 |         t3.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o);
 82 |     }
 83 | 
 84 |        for (uint64_t n = 0; n < local_nonces;) {
 85 |         // iterate nonces (4 per cycle - avx)
 86 |         // min 4 nonces left for avx processing, otherwise SISD
 87 |         if (n + 4 <= local_nonces) {
 88 |             // generate nonce numbers & change endianness
 89 |             nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0));
 90 |             nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1));
 91 |             nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2));
 92 |             nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3));
 93 | 
 94 |             // store nonce numbers in relevant termination strings
 95 |             for (int j = 8; j < 16; j += MSHABAL128_VECTOR_SIZE) {
 96 |                 size_t o = j - 8;
 97 |                 // t1
 98 |                 t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o);
 99 |                 t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o);
100 |                 t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o);
101 |                 t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o);
102 |                 t2.words[j + 0 + 32] = *(mshabal_u32 *)((char *)&nonce1 + o);
103 |                 t2.words[j + 1 + 32] = *(mshabal_u32 *)((char *)&nonce2 + o);
104 |                 t2.words[j + 2 + 32] = *(mshabal_u32 *)((char *)&nonce3 + o);
105 |                 t2.words[j + 3 + 32] = *(mshabal_u32 *)((char *)&nonce4 + o);
106 |             }
107 | 
108 |             // start shabal rounds
109 | 
110 |             // 3 cases: first 128 rounds uses case 1 or 2, after that case 3
111 |             // case 1: first 128 rounds, hashes are even: use termination string 1
112 |             // case 2: first 128 rounds, hashes are odd: use termination string 2
113 |             // case 3: round > 128: use termination string 3
114 |             // round 1
115 |             memcpy(&local_128_fast, &global_128_fast,
116 |                    sizeof(global_128_fast));  // fast initialize shabal
117 | 
118 |             mshabal_hash_fast_avx(
119 |                 &local_128_fast, NULL, &t1,
120 |                 &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6);
121 | 
122 |             // store first hash into smart termination string 2 (data is vectored and SIMD aligned)
123 |             memcpy(&t2, &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)],
124 |                    MSHABAL128_VECTOR_SIZE * (HASH_SIZE));
125 | 
126 |             // round 2 - 128
127 |             for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) {
128 |                 // check if msg can be divided into 512bit packages without a
129 |                 // remainder
130 |                 if (i % 64 == 0) {
131 |                     // last msg = seed + termination
132 |                     mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE],
133 |                                               &t1,
134 |                                               &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
135 |                                               (NONCE_SIZE + 16 - i) >> 6);
136 |                 } else {
137 |                     // last msg = 256 bit data + seed + termination
138 |                     mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE],
139 |                                               &t2,
140 |                                               &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
141 |                                               (NONCE_SIZE + 16 - i) >> 6);
142 |                 }
143 |             }
144 | 
145 |             // round 128-8192
146 |             for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) {
147 |                 mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], &t3,
148 |                                           &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
149 |                                           (HASH_CAP) >> 6);
150 |             }
151 | 
152 |             // generate final hash
153 |             mshabal_hash_fast_avx(&local_128_fast, &buffer[0], &t1, &final[0],
154 |                                       (NONCE_SIZE + 16) >> 6);
155 | 
156 |             // XOR using SIMD
157 |             // load final hash
158 |             __m128i F[8];
159 |             for (int j = 0; j < 8; j++) F[j] = _mm_loadu_si128((__m128i *)final + j);
160 |             // xor all hashes with final hash
161 |             for (int j = 0; j < 8 * 2 * HASH_CAP; j++)
162 |                 _mm_storeu_si128(
163 |                     (__m128i *)buffer + j,
164 |                     _mm_xor_si128(_mm_loadu_si128((__m128i *)buffer + j), F[j % 8]));
165 | 
166 |             // todo: fork SIMD aligned plot file here
167 |             // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache
168 | 
169 |             for (int i = 0; i < NUM_SCOOPS * 2; i++) {
170 |                 for (int j = 0; j < 32; j += 4) {
171 |                     for (int k = 0; k < MSHABAL128_VECTOR_SIZE; k += 1) {
172 |                     memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) *
173 |                                       SCOOP_SIZE * cache_size +
174 |                                   (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j],
175 |                            &buffer[(i * 32 + j) * MSHABAL128_VECTOR_SIZE + k * 4], 4);
176 |                     }
177 |                 }
178 |             }
179 | 
180 |             n += 4;
181 |         } else {
182 |             // if less than 8 nonces left, use 1d-shabal
183 |             int8_t *xv = (int8_t *)&numeric_id;
184 | 
185 |             for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i];
186 | 
187 |             nonce = local_startnonce + n;
188 |             xv = (int8_t *)&nonce;
189 | 
190 |             for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i];
191 | 
192 |             for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) {
193 |                 memcpy(&local_32, &global_32, sizeof(global_32));
194 |                 ;
195 |                 if (i < NONCE_SIZE + 16 - HASH_CAP)
196 |                     len = HASH_CAP;
197 |                 else
198 |                     len = NONCE_SIZE + 16 - i;
199 | 
200 |                 sph_shabal256(&local_32, &buffer[i], len);
201 |                 sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]);
202 |             }
203 | 
204 |             memcpy(&local_32, &global_32, sizeof(global_32));
205 |             sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE);
206 |             sph_shabal256_close(&local_32, final);
207 | 
208 |             // XOR with final
209 |             for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]);
210 | 
211 |             // Sort them PoC2:
212 |             for (size_t i = 0; i < HASH_CAP; i++){
213 |                 memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE);
214 |                 memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE);
215 |             }
216 |             n++;
217 |         }
218 |     }
219 |     free(buffer);
220 |     free(final);
221 | }
222 | 


--------------------------------------------------------------------------------
/src/c/noncegen_128_sse2.c:
--------------------------------------------------------------------------------
  1 | #include "noncegen_128_avx.h"
  2 | #include <immintrin.h>
  3 | #include <string.h>
  4 | #include "common.h"
  5 | #include "mshabal_128_sse2.h"
  6 | #include "sph_shabal.h"
  7 | 
   8 | sph_shabal_context global_32;  // scalar Shabal-256 context, initialised once and copied before each scalar hash
   9 | mshabal128_context global_128;  // 4-lane context set up by mshabal_init_sse2(&global_128, 256)
  10 | mshabal128_context_fast global_128_fast;  // "fast" mirror of global_128, memcpy'd into a local context per SIMD pass
 11 | 
 12 | void init_shabal_sse2() {
 13 |     sph_shabal256_init(&global_32);
 14 |     mshabal_init_sse2(&global_128, 256);
 15 |     global_128_fast.out_size = global_128.out_size;
 16 |     for (int i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i];
 17 |     global_128_fast.Whigh = global_128.Whigh;
 18 |     global_128_fast.Wlow = global_128.Wlow;
 19 | }
  20 | // Fills cache with local_nonces PoC2 nonces: 4 per pass via SSE2 mshabal, leftovers via scalar Shabal.
  21 | // cache:            destination; scoop-major layout, cache_size slots of SCOOP_SIZE bytes per scoop row
  22 | // cache_size:       nonce slots per scoop row; chunk_offset: slot index of this call's first nonce
  23 | // numeric_id:       numeric account id
  24 | // local_startnonce: nonce to start generation at
  25 | // local_nonces:     number of nonces to generate
  26 | void noncegen_sse2(char *cache, const size_t cache_size, const size_t chunk_offset,
  27 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
  28 |                    const uint64_t local_nonces) {
  29 |     sph_shabal_context local_32;
  30 |     uint64_t nonce;
  31 |     size_t len;
  32 | 
  33 |     mshabal128_context_fast local_128_fast;
  34 |     uint64_t nonce1, nonce2, nonce3, nonce4;
  35 | 
  36 |     char seed[32];  // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero
  37 |     char term[32];  // 1bit 1, 255bit of zeros
  38 |     char zero[32];  // 256bit of zeros
  39 | 
  40 |     write_seed(seed, numeric_id);
  41 |     write_term(term);
  42 |     memset(&zero[0], 0, 32);
  43 | 
  44 |     // work buffers shared by the SIMD and scalar paths; NOTE(review): malloc results are not checked
  45 |     uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * NONCE_SIZE);
  46 |     uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * HASH_SIZE);
  47 | 
  48 |     // prepare smart SIMD aligned termination strings
  49 |     // creation could further be optimized, but not much in it as it only runs once per work package
  50 |     // creation could also be moved to plotter start
  51 |     union {
  52 |         mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE];
  53 |         __m128i data[16];
  54 |     } t1, t2, t3;
  55 |     // every 32-bit source word is written to words j+0..j+3, i.e. once per SIMD lane
  56 |     for (int j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) {
  57 |         size_t o = j;
  58 |         // t1: seed in words 0..31, zeros in words 32..63
  59 |         t1.words[j + 0] = *(mshabal_u32 *)(seed + o);
  60 |         t1.words[j + 1] = *(mshabal_u32 *)(seed + o);
  61 |         t1.words[j + 2] = *(mshabal_u32 *)(seed + o);
  62 |         t1.words[j + 3] = *(mshabal_u32 *)(seed + o);
  63 |         t1.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o);
  64 |         t1.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o);
  65 |         t1.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o);
  66 |         t1.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o);
  67 |         // t2: seed in words 32..63
  68 |         // (first 256bit skipped, will later be filled with data)
  69 |         t2.words[j + 0 + 32] = *(mshabal_u32 *)(seed + o);
  70 |         t2.words[j + 1 + 32] = *(mshabal_u32 *)(seed + o);
  71 |         t2.words[j + 2 + 32] = *(mshabal_u32 *)(seed + o);
  72 |         t2.words[j + 3 + 32] = *(mshabal_u32 *)(seed + o);
  73 |         // t3: term in words 0..31, zeros in words 32..63
  74 |         t3.words[j + 0] = *(mshabal_u32 *)(term + o);
  75 |         t3.words[j + 1] = *(mshabal_u32 *)(term + o);
  76 |         t3.words[j + 2] = *(mshabal_u32 *)(term + o);
  77 |         t3.words[j + 3] = *(mshabal_u32 *)(term + o);
  78 |         t3.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o);
  79 |         t3.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o);
  80 |         t3.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o);
  81 |         t3.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o);
  82 |     }
  83 | 
  84 |        for (uint64_t n = 0; n < local_nonces;) {
  85 |         // iterate nonces (4 per cycle - sse)
  86 |         // min 4 nonces left for sse processing, otherwise SISD
  87 |         if (n + 4 <= local_nonces) {
  88 |             // generate nonce numbers & change endianness
  89 |             nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0));
  90 |             nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1));
  91 |             nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2));
  92 |             nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3));
  93 | 
  94 |             // store nonce numbers in relevant termination strings
  95 |             for (int j = 8; j < 16; j += MSHABAL128_VECTOR_SIZE) {
  96 |                 size_t o = j - 8;
  97 |                 // lane k receives nonce(k+1): t1 words 8..15 and t2 words 40..47
  98 |                 t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o);
  99 |                 t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o);
 100 |                 t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o);
 101 |                 t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o);
 102 |                 t2.words[j + 0 + 32] = *(mshabal_u32 *)((char *)&nonce1 + o);
 103 |                 t2.words[j + 1 + 32] = *(mshabal_u32 *)((char *)&nonce2 + o);
 104 |                 t2.words[j + 2 + 32] = *(mshabal_u32 *)((char *)&nonce3 + o);
 105 |                 t2.words[j + 3 + 32] = *(mshabal_u32 *)((char *)&nonce4 + o);
 106 |             }
 107 | 
 108 |             // start shabal rounds
 109 | 
 110 |             // 3 cases: first 128 rounds uses case 1 or 2, after that case 3
 111 |             // case 1: first 128 rounds, hashes are even: use termination string 1
 112 |             // case 2: first 128 rounds, hashes are odd: use termination string 2
 113 |             // case 3: round > 128: use termination string 3
 114 |             // round 1
 115 |             memcpy(&local_128_fast, &global_128_fast,
 116 |                    sizeof(global_128_fast));  // fast initialize shabal
 117 | 
 118 |             mshabal_hash_fast_sse2(
 119 |                 &local_128_fast, NULL, &t1,
 120 |                 &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6);
 121 | 
 122 |             // store first hash into smart termination string 2 (data is vectored and SIMD aligned)
 123 |             memcpy(&t2, &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)],
 124 |                    MSHABAL128_VECTOR_SIZE * (HASH_SIZE));
 125 | 
 126 |             // round 2 - 128
 127 |             for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) {
 128 |                 // check if msg can be divided into 512bit packages without a
 129 |                 // remainder
 130 |                 if (i % 64 == 0) {
 131 |                     // last msg = seed + termination
 132 |                     mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE],
 133 |                                               &t1,
 134 |                                               &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
 135 |                                               (NONCE_SIZE + 16 - i) >> 6);
 136 |                 } else {
 137 |                     // last msg = 256 bit data + seed + termination
 138 |                     mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE],
 139 |                                               &t2,
 140 |                                               &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
 141 |                                               (NONCE_SIZE + 16 - i) >> 6);
 142 |                 }
 143 |             }
 144 | 
 145 |             // round 128-8192
 146 |             for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) {
 147 |                 mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], &t3,
 148 |                                           &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE],
 149 |                                           (HASH_CAP) >> 6);
 150 |             }
 151 | 
 152 |             // generate final hash
 153 |             mshabal_hash_fast_sse2(&local_128_fast, &buffer[0], &t1, &final[0],
 154 |                                       (NONCE_SIZE + 16) >> 6);
 155 | 
 156 |             // XOR using SIMD
 157 |             // load final hash
 158 |             __m128i F[8];
 159 |             for (int j = 0; j < 8; j++) F[j] = _mm_loadu_si128((__m128i *)final + j);
 160 |             // xor all hashes with final hash
 161 |             for (int j = 0; j < 8 * 2 * HASH_CAP; j++)
 162 |                 _mm_storeu_si128(
 163 |                     (__m128i *)buffer + j,
 164 |                     _mm_xor_si128(_mm_loadu_si128((__m128i *)buffer + j), F[j % 8]));
 165 | 
 166 |             // todo: fork SIMD aligned plot file here
 167 |             // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache
 168 |             // even i -> first 32 bytes of scoop row i/2; odd i -> second 32 bytes of scoop row 4095 - i/2
 169 |             for (int i = 0; i < NUM_SCOOPS * 2; i++) {
 170 |                 for (int j = 0; j < 32; j += 4) {
 171 |                     for (int k = 0; k < MSHABAL128_VECTOR_SIZE; k += 1) {
 172 |                     memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) *
 173 |                                       SCOOP_SIZE * cache_size +
 174 |                                   (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j],
 175 |                            &buffer[(i * 32 + j) * MSHABAL128_VECTOR_SIZE + k * 4], 4);
 176 |                     }
 177 |                 }
 178 |             }
 179 | 
 180 |             n += 4;
 181 |         } else {
 182 |             // fewer than 4 nonces left: hash the remainder one nonce at a time with scalar (1d) shabal
 183 |             int8_t *xv = (int8_t *)&numeric_id;
 184 |             // seed = byte-reversed numeric_id followed by the byte-reversed nonce, placed after the nonce data
 185 |             for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i];
 186 | 
 187 |             nonce = local_startnonce + n;
 188 |             xv = (int8_t *)&nonce;
 189 | 
 190 |             for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i];
 191 |             // hash back to front: each digest covers the data after it, capped at HASH_CAP bytes
 192 |             for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) {
 193 |                 memcpy(&local_32, &global_32, sizeof(global_32));
 194 |                 ;
 195 |                 if (i < NONCE_SIZE + 16 - HASH_CAP)
 196 |                     len = HASH_CAP;
 197 |                 else
 198 |                     len = NONCE_SIZE + 16 - i;
 199 | 
 200 |                 sph_shabal256(&local_32, &buffer[i], len);
 201 |                 sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]);
 202 |             }
 203 |             // final hash over the whole nonce plus the 16-byte seed
 204 |             memcpy(&local_32, &global_32, sizeof(global_32));
 205 |             sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE);
 206 |             sph_shabal256_close(&local_32, final);
 207 | 
 208 |             // XOR with final
 209 |             for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]);
 210 | 
 211 |             // PoC2 layout: first half of scoop i goes to row i, second half to the mirrored row 4095 - i
 212 |             for (size_t i = 0; i < HASH_CAP; i++){
 213 |                 memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE);
 214 |                 memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE);
 215 |             }
 216 |             n++;
 217 |         }
 218 |     }
 219 |     free(buffer);
 220 |     free(final);
 221 | }
222 | 


--------------------------------------------------------------------------------
/src/scheduler.rs:
--------------------------------------------------------------------------------
  1 | use crate::cpu_hasher::{SimdExtension, hash_cpu, CpuTask, SafePointer};
  2 | use crate::buffer::PageAlignedByteBuffer;
  3 | #[cfg(feature = "opencl")]
  4 | use crate::gpu_hasher::{create_gpu_hasher_thread, GpuTask};
  5 | #[cfg(feature = "opencl")]
  6 | use crate::ocl::gpu_init;
  7 | use crate::plotter::{PlotterTask, NONCE_SIZE};
  8 | #[cfg(feature = "opencl")]
  9 | use crossbeam_channel::unbounded;
 10 | use crossbeam_channel::{Receiver, Sender};
 11 | use std::cmp::min;
 12 | use std::sync::mpsc::channel;
 13 | use std::sync::Arc;
 14 | #[cfg(feature = "opencl")]
 15 | use std::thread;
 16 | 
  17 | const CPU_TASK_SIZE: u64 = 64; // upper bound on the nonces handed to a single CPU hashing task
  18 | /// Returns the scheduler closure: per empty buffer it hands nonce ranges to CPU/GPU hashers, then queues the buffer for writing.
  19 | pub fn create_scheduler_thread(
  20 |     task: Arc<PlotterTask>,
  21 |     thread_pool: rayon::ThreadPool,
  22 |     mut nonces_hashed: u64,
  23 |     mut pb: Option<pbr::ProgressBar<pbr::Pipe>>,
  24 |     rx_empty_buffers: Receiver<PageAlignedByteBuffer>,
  25 |     tx_buffers_to_writer: Sender<PageAlignedByteBuffer>,
  26 |     simd_ext: SimdExtension,
  27 | ) -> impl FnOnce() {
  28 |     move || {
  29 |         // synchronisation channel for all hashing devices (CPU+GPU)
  30 |         // message protocol:    (hash_device_id: u8, message: u8, nonces processed: u64)
  31 |         // hash_device_id:      0=CPU, 1=GPU0, 2=GPU1...
  32 |         // message:             0 = data ready to write
  33 |         //                      1 = device ready to compute next hashing batch
  34 |         // nonces_processed:    nonces hashed / nonces written to host buffer
  35 |         let (tx, rx) = channel();
  36 | 
  37 |         // create gpu threads and channels
  38 |         #[cfg(feature = "opencl")]
  39 |         let gpu_contexts = match &task.gpus {
  40 |             Some(x) => Some(gpu_init(&x, task.zcb)),
  41 |             None => None,
  42 |         };
  43 | 
  44 |         #[cfg(feature = "opencl")]
  45 |         let gpus = match gpu_contexts {
  46 |             Some(x) => x,
  47 |             None => Vec::new(),
  48 |         };
  49 |         #[cfg(feature = "opencl")]
  50 |         let mut gpu_threads = Vec::new();
  51 |         #[cfg(feature = "opencl")]
  52 |         let mut gpu_channels = Vec::new();
  53 |         // one hasher thread plus one task channel per GPU; device ids start at 1 (0 is the CPU)
  54 |         #[cfg(feature = "opencl")]
  55 |         for (i, gpu) in gpus.iter().enumerate() {
  56 |             gpu_channels.push(unbounded());
  57 |             gpu_threads.push(thread::spawn({
  58 |                 create_gpu_hasher_thread(
  59 |                     (i + 1) as u8,
  60 |                     gpu.clone(),
  61 |                     tx.clone(),
  62 |                     gpu_channels.last().unwrap().1.clone(),
  63 |                 )
  64 |             }));
  65 |         }
  66 |         // one iteration per empty buffer received from the writer side
  67 |         for buffer in rx_empty_buffers {
  68 |             let mut_bs = &buffer.get_buffer();
  69 |             let mut bs = mut_bs.lock().unwrap();
  70 |             let buffer_size = (*bs).len() as u64;
  71 |             let nonces_to_hash = min(buffer_size / NONCE_SIZE, task.nonces - nonces_hashed);
  72 |             // requested = nonces handed out so far, processed = nonces reported done (both for this buffer)
  73 |             let mut requested = 0u64;
  74 |             let mut processed = 0u64;
  75 | 
  76 |             // kickoff first gpu and cpu runs
  77 |             #[cfg(feature = "opencl")]
  78 |             for (i, gpu) in gpus.iter().enumerate() {
  79 |                 // schedule next gpu task
  80 |                 let gpu = gpu.lock().unwrap();
  81 |                 let task_size = min(gpu.worksize as u64, nonces_to_hash - requested);
  82 |                 if task_size > 0 {
  83 |                     gpu_channels[i]
  84 |                         .0
  85 |                         .send(Some(GpuTask {
  86 |                             cache: SafePointer {
  87 |                                 ptr: bs.as_mut_ptr(),
  88 |                             },
  89 |                             cache_size: buffer_size / NONCE_SIZE,
  90 |                             chunk_offset: requested,
  91 |                             numeric_id: task.numeric_id,
  92 |                             local_startnonce: task.start_nonce + nonces_hashed + requested,
  93 |                             local_nonces: task_size,
  94 |                         }))
  95 |                         .unwrap();
  96 |                 }
  97 |                 requested += task_size;
  98 |                 //println!("Debug: Device: {} started. {} nonces assigned. Total requested: {}\n\n\n",i+1,task_size,requested);
  99 |             }
 100 |             // seed the pool with up to one CPU task per configured CPU thread
 101 |             for _ in 0..task.cpu_threads {
 102 |                 let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested);
 103 |                 if task_size > 0 {
 104 |                     let task = hash_cpu(
 105 |                         tx.clone(),
 106 |                         CpuTask {
 107 |                             cache: SafePointer {
 108 |                                 ptr: bs.as_mut_ptr(),
 109 |                             },
 110 |                             cache_size: (buffer_size / NONCE_SIZE) as usize,
 111 |                             chunk_offset: requested as usize,
 112 |                             numeric_id: task.numeric_id,
 113 |                             local_startnonce: task.start_nonce + nonces_hashed + requested,
 114 |                             local_nonces: task_size,
 115 |                         },
 116 |                         simd_ext.clone(),
 117 |                     );
 118 |                     thread_pool.spawn(task);
 119 |                 }
 120 |                 requested += task_size;
 121 |             }
 122 | 
 123 |             // control loop: hand out more work on request until every nonce of this buffer is reported done
 124 |             let rx = &rx;
 125 |             for msg in rx {
 126 |                 match msg.1 {
 127 |                     // process a request for work: provide a task or signal completion
 128 |                     1 => {
 129 |                         let task_size = match msg.0 {
 130 |                             0 => {
 131 |                                 // schedule next cpu task
 132 |                                 let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested);
 133 |                                 if task_size > 0 {
 134 |                                     let task = hash_cpu(
 135 |                                         tx.clone(),
 136 |                                         CpuTask {
 137 |                                             cache: SafePointer {
 138 |                                                 ptr: bs.as_mut_ptr(),
 139 |                                             },
 140 |                                             cache_size: (buffer_size / NONCE_SIZE) as usize,
 141 |                                             chunk_offset: requested as usize,
 142 |                                             numeric_id: task.numeric_id,
 143 |                                             local_startnonce: task.start_nonce
 144 |                                                 + nonces_hashed
 145 |                                                 + requested,
 146 |                                             local_nonces: task_size,
 147 |                                         },
 148 |                                         simd_ext.clone(),
 149 |                                     );
 150 |                                     thread_pool.spawn(task);
 151 |                                 }
 152 |                                 task_size
 153 |                             }
 154 |                             _ => {
 155 |                                 // schedule next gpu task
 156 |                                 #[cfg(feature = "opencl")]
 157 |                                 let gpu = gpus[(msg.0 - 1) as usize].lock().unwrap();
 158 |                                 #[cfg(feature = "opencl")]
 159 |                                 let task_size =
 160 |                                     min(gpu.worksize as u64, nonces_to_hash - requested);
 161 | 
 162 |                                 // optimisation: halve a final partial gpu batch so cpu threads get some of the remaining work
 163 |                                 #[cfg(feature = "opencl")]
 164 |                                 let task_size = if task_size < gpu.worksize as u64
 165 |                                     && task.cpu_threads > 0
 166 |                                     && task_size > CPU_TASK_SIZE
 167 |                                 {
 168 |                                     task_size / 2
 169 |                                 } else {
 170 |                                     task_size
 171 |                                 };
 172 |                                 // without opencl there are no gpu devices, so nothing is scheduled here
 173 |                                 #[cfg(not(feature = "opencl"))]
 174 |                                 let task_size = 0;
 175 |                                 // NOTE(review): unlike the kickoff above, the task is sent even when task_size is 0
 176 |                                 #[cfg(feature = "opencl")]
 177 |                                 gpu_channels[(msg.0 - 1) as usize]
 178 |                                     .0
 179 |                                     .send(Some(GpuTask {
 180 |                                         cache: SafePointer {
 181 |                                             ptr: bs.as_mut_ptr(),
 182 |                                         },
 183 |                                         cache_size: buffer_size / NONCE_SIZE,
 184 |                                         chunk_offset: requested,
 185 |                                         numeric_id: task.numeric_id,
 186 |                                         local_startnonce: task.start_nonce
 187 |                                             + nonces_hashed
 188 |                                             + requested,
 189 |                                         local_nonces: task_size,
 190 |                                     }))
 191 |                                     .unwrap();
 192 |                                 task_size
 193 |                             }
 194 |                         };
 195 | 
 196 |                         requested += task_size;
 197 |                         //println!("Debug: Device: {} asked for work. {} nonces assigned. Total requested: {}\n\n\n",msg.0,task_size,requested);
 198 |                     }
 199 |                     // process work completed message
 200 |                     0 => {
 201 |                         processed += msg.2;
 202 |                         match &mut pb {
 203 |                             Some(pb) => {
 204 |                                 pb.add(msg.2 * NONCE_SIZE);
 205 |                             }
 206 |                             None => (),
 207 |                         }
 208 |                     }
 209 |                     _ => {}
 210 |                 }
 211 |                 if processed == nonces_to_hash {
 212 |                     break;
 213 |                 }
 214 |             }
 215 | 
 216 |             nonces_hashed += nonces_to_hash;
 217 | 
 218 |             // queue buffer for writing
 219 |             tx_buffers_to_writer.send(buffer).unwrap();
 220 | 
 221 |             // all nonces hashed: finish the progress bar, tell the gpu threads to stop and leave the loop
 222 |             if task.nonces == nonces_hashed {
 223 |                 match &mut pb {
 224 |                     Some(pb) => {
 225 |                         pb.finish_print("Hasher done.");
 226 |                     }
 227 |                     None => (),
 228 |                 }
 229 |                 // shutdown gpu threads
 230 |                 #[cfg(feature = "opencl")]
 231 |                 for gpu in &gpu_channels {
 232 |                     gpu.0.send(None).unwrap();
 233 |                 }
 234 |                 break;
 235 |             };
 236 |         }
 237 |     }
 238 | }
239 | 


--------------------------------------------------------------------------------
/src/shabal256.rs:
--------------------------------------------------------------------------------
  1 | use std::slice::from_raw_parts;
   2 | // Initial value of the 12-word A state that shabal256_fast starts from.
   3 | const A_INIT: [u32; 12] = [
   4 |     0x52F84552, 0xE54B7999, 0x2D8EE3EC, 0xB9645191, 0xE0078B86, 0xBB7C44C9, 0xD2B5C1CA, 0xB0D2EB8C,
   5 |     0x14CE5A45, 0x22AF50DC, 0xEFFDBC6B, 0xEB21B74A,
   6 | ];
   7 | // Initial value of the 16-word B state (the digest is read from its upper half after the final rounds).
   8 | const B_INIT: [u32; 16] = [
   9 |     0xB555C6EE, 0x3E710596, 0xA72A652F, 0x9301515F, 0xDA28C1FA, 0x696FD868, 0x9CB6BF72, 0x0AFE4002,
  10 |     0xA6E03615, 0x5138C1D4, 0xBE216306, 0xB38B8890, 0x3EA8B96B, 0x3299ACE4, 0x30924DD4, 0x55CB34A5,
  11 | ];
  12 | // Initial value of the 16-word C state.
  13 | const C_INIT: [u32; 16] = [
  14 |     0xB405F031, 0xC4233EBA, 0xB3733979, 0xC0DD9D55, 0xC51C28AE, 0xA327B8E1, 0x56C56167, 0xED614433,
  15 |     0x88B59D60, 0x60E2CEBA, 0x758B4B8B, 0x83E82A7F, 0xBC968828, 0xE6E00BF7, 0xBA839E55, 0x9B491C60,
  16 | ];
 17 | 
 18 | pub fn shabal256_fast(data: &[u8], term: &[u32; 16]) -> [u8; 32] {
 19 |     let mut a = A_INIT;
 20 |     let mut b = B_INIT;
 21 |     let mut c = C_INIT;
 22 |     let mut w_high = 0u32;
 23 |     let mut w_low = 1u32;
 24 |     let mut num = data.len() >> 6;
 25 |     let mut ptr = 0;
 26 |     let data_ptr = data.as_ptr() as *const u32;
 27 |     let data = unsafe { from_raw_parts(data_ptr, data.len() / 4) };
 28 | 
 29 |     while num > 0 {
 30 |         input_block_add(&mut b, &data[ptr..]);
 31 |         xor_w(&mut a, w_low, w_high);
 32 |         apply_p(&mut a, &mut b, &c, &data[ptr..]);
 33 |         input_block_sub(&mut c, &data[ptr..]);
 34 |         swap_bc(&mut b, &mut c);
 35 |         incr_w(&mut w_low, &mut w_high);
 36 |         ptr = ptr.wrapping_add(16);
 37 |         num = num.wrapping_sub(1);
 38 |     }
 39 |     input_block_add(&mut b, term);
 40 |     xor_w(&mut a, w_low, w_high);
 41 |     apply_p(&mut a, &mut b, &c, term);
 42 |     for _ in 0..3 {
 43 |         swap_bc(&mut b, &mut c);
 44 |         xor_w(&mut a, w_low, w_high);
 45 |         apply_p(&mut a, &mut b, &c, term);
 46 |     }
 47 |     unsafe { *(b[8..16].as_ptr() as *const [u8; 32]) }
 48 | }
 49 | 
 50 | #[inline(always)]
 51 | fn input_block_add(b: &mut [u32; 16], data: &[u32]) {
 52 |     for (element, data) in b.iter_mut().zip(data.iter()) {
 53 |         *element = element.wrapping_add(*data);
 54 |     }
 55 | }
 56 | 
 57 | #[inline(always)]
 58 | fn input_block_sub(c: &mut [u32; 16], data: &[u32]) {
 59 |     for (element, data) in c.iter_mut().zip(data.iter()) {
 60 |         *element = element.wrapping_sub(*data);
 61 |     }
 62 | }
 63 | 
 64 | #[inline(always)]
 65 | fn xor_w(a: &mut [u32; 12], w_low: u32, w_high: u32) {
 66 |     a[0] ^= w_low;
 67 |     a[1] ^= w_high;
 68 | }
 69 | 
 70 | #[inline(always)]
 71 | fn apply_p(a: &mut [u32; 12], b: &mut [u32; 16], c: &[u32; 16], data: &[u32]) {
 72 |     for element in b.iter_mut() {
 73 |         *element = element.wrapping_shl(17) | element.wrapping_shr(15);
 74 |     }
 75 |     perm(a, b, c, data);
 76 |     a[0] = a[0]
 77 |         .wrapping_add(c[11])
 78 |         .wrapping_add(c[15])
 79 |         .wrapping_add(c[3]);
 80 |     a[1] = a[1]
 81 |         .wrapping_add(c[12])
 82 |         .wrapping_add(c[0])
 83 |         .wrapping_add(c[4]);
 84 |     a[2] = a[2]
 85 |         .wrapping_add(c[13])
 86 |         .wrapping_add(c[1])
 87 |         .wrapping_add(c[5]);
 88 |     a[3] = a[3]
 89 |         .wrapping_add(c[14])
 90 |         .wrapping_add(c[2])
 91 |         .wrapping_add(c[6]);
 92 |     a[4] = a[4]
 93 |         .wrapping_add(c[15])
 94 |         .wrapping_add(c[3])
 95 |         .wrapping_add(c[7]);
 96 |     a[5] = a[5]
 97 |         .wrapping_add(c[0])
 98 |         .wrapping_add(c[4])
 99 |         .wrapping_add(c[8]);
100 |     a[6] = a[6]
101 |         .wrapping_add(c[1])
102 |         .wrapping_add(c[5])
103 |         .wrapping_add(c[9]);
104 |     a[7] = a[7]
105 |         .wrapping_add(c[2])
106 |         .wrapping_add(c[6])
107 |         .wrapping_add(c[10]);
108 |     a[8] = a[8]
109 |         .wrapping_add(c[3])
110 |         .wrapping_add(c[7])
111 |         .wrapping_add(c[11]);
112 |     a[9] = a[9]
113 |         .wrapping_add(c[4])
114 |         .wrapping_add(c[8])
115 |         .wrapping_add(c[12]);
116 |     a[10] = a[10]
117 |         .wrapping_add(c[5])
118 |         .wrapping_add(c[9])
119 |         .wrapping_add(c[13]);
120 |     a[11] = a[11]
121 |         .wrapping_add(c[6])
122 |         .wrapping_add(c[10])
123 |         .wrapping_add(c[14]);
124 | }
125 | 
/// Performs a single permutation step on one word of `a` and one word of `b`.
///
/// The new `a[xa0]` is
/// `((a[xa0] ^ rotl(a[xa1], 15) * 5 ^ xc) * 3) ^ b[xb1] ^ (b[xb2] & !b[xb3]) ^ xm`
/// (all multiplications wrapping), and afterwards
/// `b[xb0] = !(rotl(b[xb0], 1) ^ a[xa0])` using the freshly written `a[xa0]`.
///
/// Indices are not bounds-checked: `xa0`/`xa1` must be below 12 and
/// `xb0`..`xb3` below 16.
#[inline(always)]
fn perm_elt(
    a: &mut [u32; 12],
    b: &mut [u32; 16],
    xa0: usize,
    xa1: usize,
    xb0: usize,
    xb1: usize,
    xb2: usize,
    xb3: usize,
    xc: u32,
    xm: u32,
) {
    unsafe {
        let prev = *a.get_unchecked(xa1);
        let prev_rot = prev.wrapping_shl(15u32) | prev.wrapping_shr(17u32);
        let mixed = (*a.get_unchecked(xa0) ^ prev_rot.wrapping_mul(5u32) ^ xc).wrapping_mul(3u32);

        // All reads of `b` happen before `b[xb0]` is overwritten below.
        let gate = *b.get_unchecked(xb2) & !*b.get_unchecked(xb3);
        let next_a = mixed ^ *b.get_unchecked(xb1) ^ gate ^ xm;
        *a.get_unchecked_mut(xa0) = next_a;

        let slot = b.get_unchecked_mut(xb0);
        let slot_rot = slot.wrapping_shl(1) | slot.wrapping_shr(31);
        *slot = !(slot_rot ^ next_a);
    }
}
154 | 
/// Runs the 48 `perm_elt` steps of the permutation over `a` and `b`.
///
/// Step `i` (0..48) updates `a[i % 12]` from `a[(i + 11) % 12]` and updates
/// `b[i % 16]`, reading `b[(i + 13) % 16]`, `b[(i + 9) % 16]` and
/// `b[(i + 6) % 16]`, the word `c[(8 - i) mod 16]` and the message word
/// `data[i % 16]`. The steps are written out by hand rather than looped.
///
/// `data` is read with `get_unchecked` at indices up to 15, so the caller must
/// pass at least 16 words; a shorter slice would be read out of bounds.
#[inline(always)]
fn perm(a: &mut [u32; 12], b: &mut [u32; 16], c: &[u32; 16], data: &[u32]) {
    unsafe {
        // steps 0..16: first pass over all 16 words of `b`
        perm_elt(a, b, 0, 11, 0, 13, 9, 6, c[8], *data.get_unchecked(0));
        perm_elt(a, b, 1, 0, 1, 14, 10, 7, c[7], *data.get_unchecked(1));
        perm_elt(a, b, 2, 1, 2, 15, 11, 8, c[6], *data.get_unchecked(2));
        perm_elt(a, b, 3, 2, 3, 0, 12, 9, c[5], *data.get_unchecked(3));
        perm_elt(a, b, 4, 3, 4, 1, 13, 10, c[4], *data.get_unchecked(4));
        perm_elt(a, b, 5, 4, 5, 2, 14, 11, c[3], *data.get_unchecked(5));
        perm_elt(a, b, 6, 5, 6, 3, 15, 12, c[2], *data.get_unchecked(6));
        perm_elt(a, b, 7, 6, 7, 4, 0, 13, c[1], *data.get_unchecked(7));
        perm_elt(a, b, 8, 7, 8, 5, 1, 14, c[0], *data.get_unchecked(8));
        perm_elt(a, b, 9, 8, 9, 6, 2, 15, c[15], *data.get_unchecked(9));
        perm_elt(a, b, 10, 9, 10, 7, 3, 0, c[14], *data.get_unchecked(10));
        perm_elt(a, b, 11, 10, 11, 8, 4, 1, c[13], *data.get_unchecked(11));
        perm_elt(a, b, 0, 11, 12, 9, 5, 2, c[12], *data.get_unchecked(12));
        perm_elt(a, b, 1, 0, 13, 10, 6, 3, c[11], *data.get_unchecked(13));
        perm_elt(a, b, 2, 1, 14, 11, 7, 4, c[10], *data.get_unchecked(14));
        perm_elt(a, b, 3, 2, 15, 12, 8, 5, c[9], *data.get_unchecked(15));
        // steps 16..32: second pass over `b`; the `a` index keeps cycling modulo 12
        perm_elt(a, b, 4, 3, 0, 13, 9, 6, c[8], *data.get_unchecked(0));
        perm_elt(a, b, 5, 4, 1, 14, 10, 7, c[7], *data.get_unchecked(1));
        perm_elt(a, b, 6, 5, 2, 15, 11, 8, c[6], *data.get_unchecked(2));
        perm_elt(a, b, 7, 6, 3, 0, 12, 9, c[5], *data.get_unchecked(3));
        perm_elt(a, b, 8, 7, 4, 1, 13, 10, c[4], *data.get_unchecked(4));
        perm_elt(a, b, 9, 8, 5, 2, 14, 11, c[3], *data.get_unchecked(5));
        perm_elt(a, b, 10, 9, 6, 3, 15, 12, c[2], *data.get_unchecked(6));
        perm_elt(a, b, 11, 10, 7, 4, 0, 13, c[1], *data.get_unchecked(7));
        perm_elt(a, b, 0, 11, 8, 5, 1, 14, c[0], *data.get_unchecked(8));
        perm_elt(a, b, 1, 0, 9, 6, 2, 15, c[15], *data.get_unchecked(9));
        perm_elt(a, b, 2, 1, 10, 7, 3, 0, c[14], *data.get_unchecked(10));
        perm_elt(a, b, 3, 2, 11, 8, 4, 1, c[13], *data.get_unchecked(11));
        perm_elt(a, b, 4, 3, 12, 9, 5, 2, c[12], *data.get_unchecked(12));
        perm_elt(a, b, 5, 4, 13, 10, 6, 3, c[11], *data.get_unchecked(13));
        perm_elt(a, b, 6, 5, 14, 11, 7, 4, c[10], *data.get_unchecked(14));
        perm_elt(a, b, 7, 6, 15, 12, 8, 5, c[9], *data.get_unchecked(15));
        // steps 32..48: third pass over `b`
        perm_elt(a, b, 8, 7, 0, 13, 9, 6, c[8], *data.get_unchecked(0));
        perm_elt(a, b, 9, 8, 1, 14, 10, 7, c[7], *data.get_unchecked(1));
        perm_elt(a, b, 10, 9, 2, 15, 11, 8, c[6], *data.get_unchecked(2));
        perm_elt(a, b, 11, 10, 3, 0, 12, 9, c[5], *data.get_unchecked(3));
        perm_elt(a, b, 0, 11, 4, 1, 13, 10, c[4], *data.get_unchecked(4));
        perm_elt(a, b, 1, 0, 5, 2, 14, 11, c[3], *data.get_unchecked(5));
        perm_elt(a, b, 2, 1, 6, 3, 15, 12, c[2], *data.get_unchecked(6));
        perm_elt(a, b, 3, 2, 7, 4, 0, 13, c[1], *data.get_unchecked(7));
        perm_elt(a, b, 4, 3, 8, 5, 1, 14, c[0], *data.get_unchecked(8));
        perm_elt(a, b, 5, 4, 9, 6, 2, 15, c[15], *data.get_unchecked(9));
        perm_elt(a, b, 6, 5, 10, 7, 3, 0, c[14], *data.get_unchecked(10));
        perm_elt(a, b, 7, 6, 11, 8, 4, 1, c[13], *data.get_unchecked(11));
        perm_elt(a, b, 8, 7, 12, 9, 5, 2, c[12], *data.get_unchecked(12));
        perm_elt(a, b, 9, 8, 13, 10, 6, 3, c[11], *data.get_unchecked(13));
        perm_elt(a, b, 10, 9, 14, 11, 7, 4, c[10], *data.get_unchecked(14));
        perm_elt(a, b, 11, 10, 15, 12, 8, 5, c[9], *data.get_unchecked(15));
    }
}
208 | 
/// Exchanges the complete contents of the `b` and `c` state arrays.
#[inline(always)]
fn swap_bc(b: &mut [u32; 16], c: &mut [u32; 16]) {
    // `[u32; 16]` is `Copy`, so a plain three-way move is enough.
    let old_b = *b;
    *b = *c;
    *c = old_b;
}
213 | 
/// Increments the 64-bit counter stored as two 32-bit halves.
///
/// `w_low` is incremented with wrap-around; when it wraps to zero the carry is
/// added (also wrapping) to `w_high`.
#[inline(always)]
fn incr_w(w_low: &mut u32, w_high: &mut u32) {
    *w_low = w_low.wrapping_add(1);
    // The low half is zero after the increment exactly when it overflowed.
    let carry = (*w_low == 0) as u32;
    *w_high = w_high.wrapping_add(carry);
}
221 | 
#[cfg(test)]
mod test {
    use super::*;

    // Expected digest for message A (64 zero bytes).
    const TEST_A_RESULT: [u8; 32] = [
        0xDA, 0x8F, 0x08, 0xC0, 0x2A, 0x67, 0xBA, 0x9A, 0x56, 0xBD, 0xD0, 0x79, 0x8E, 0x48, 0xAE,
        0x07, 0x14, 0x21, 0x5E, 0x09, 0x3B, 0x5B, 0x85, 0x06, 0x49, 0xA3, 0x77, 0x18, 0x99, 0x3F,
        0x54, 0xA2,
    ];
    // Expected digest for message B.
    const TEST_B_RESULT: [u8; 32] = [
        0xB4, 0x9F, 0x34, 0xBF, 0x51, 0x86, 0x4C, 0x30, 0x53, 0x3C, 0xC4, 0x6C, 0xC2, 0x54, 0x2B,
        0xDE, 0xC2, 0xF9, 0x6F, 0xD0, 0x6F, 0x5C, 0x53, 0x9A, 0xFF, 0x6E, 0xAD, 0x58, 0x83, 0xF7,
        0x32, 0x7A,
    ];
    // First 64 bytes of message B as words; read as little-endian bytes they
    // spell "abcdefghijklmnopqrstuvwxyz-0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ".
    const TEST_B_M1: [u32; 16] = [
        0x64636261, 0x68676665, 0x6C6B6A69, 0x706F6E6D, 0x74737271, 0x78777675, 0x302D7A79,
        0x34333231, 0x38373635, 0x42412D39, 0x46454443, 0x4A494847, 0x4E4D4C4B, 0x5251504F,
        0x56555453, 0x5A595857,
    ];
    // Remainder of message B ("-0123456789-abc...xyz") followed by the 0x80
    // terminator byte and zero fill.
    const TEST_B_M2: [u32; 16] = [
        0x3231302D, 0x36353433, 0x2D393837, 0x64636261, 0x68676665, 0x6C6B6A69, 0x706F6E6D,
        0x74737271, 0x78777675, 0x00807A79, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
        0x00000000, 0x00000000,
    ];

    /// Checks `shabal256_fast` against the two fixed digests above.
    #[test]
    fn shabal256() {
        // test message A: one all-zero data block plus a terminator-only block
        let zero_block = [0u8; 64];
        let mut terminator_block = [0u32; 16];
        terminator_block[0] = 0x80;
        assert_eq!(
            shabal256_fast(&zero_block, &terminator_block),
            TEST_A_RESULT
        );

        // test message B: the word table is reinterpreted as raw bytes in
        // native byte order
        let first_block = unsafe { std::mem::transmute::<[u32; 16], [u8; 64]>(TEST_B_M1) };
        assert_eq!(shabal256_fast(&first_block, &TEST_B_M2), TEST_B_RESULT);
    }
}
264 | 


--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
  1 | use std::fs::{File, OpenOptions};
  2 | use std::io;
  3 | use std::path::Path;
  4 | 
  5 | cfg_if! {
  6 |     if #[cfg(unix)] {
  7 |         #[cfg(linux)]
  8 |         extern crate thread_priority;
  9 |         use std::process::Command;
 10 |         use std::process;
 11 |         use std::os::unix::fs::OpenOptionsExt;
 12 |         use fs2::FileExt;
 13 |         #[cfg(linux)]
 14 |         use thread_priority::*;
 15 | 
        // Raw open flag handed to `custom_flags` by `open_using_direct_io`.
        // NOTE(review): 0o40000 is presumably the Linux x86/x86_64 value of
        // O_DIRECT; the flag value is platform-specific, so confirm it for every
        // unix target this branch is built for.
        const O_DIRECT: i32 = 0o0_040_000;
 17 | 
 18 |         pub fn set_low_prio() {
 19 |             // todo: low prio for macos
 20 |             #[cfg(linux)]
 21 |             let thread_id = thread_native_id();
 22 |             #[cfg(linux)]
 23 |             set_thread_priority(
 24 |                 thread_id,
 25 |                 ThreadPriority::Min,
 26 |                 ThreadSchedulePolicy::Normal(NormalThreadSchedulePolicy::Normal)
 27 |             ).unwrap();
 28 |         }
 29 | 
 30 |         pub fn open_using_direct_io<P: AsRef<Path>>(path: P) -> io::Result<File> {
 31 |             OpenOptions::new()
 32 |                 .write(true)
 33 |                 .create(true)
 34 |                 .custom_flags(O_DIRECT)
 35 |                 .open(path)
 36 |         }
 37 | 
 38 |         pub fn open<P: AsRef<Path>>(path: P) -> io::Result<File> {
 39 |             OpenOptions::new()
 40 |                 .write(true)
 41 |                 .create(true)
 42 |                 .open(path)
 43 |         }
 44 | 
 45 |         pub fn open_r<P: AsRef<Path>>(path: P) -> io::Result<File> {
 46 |             OpenOptions::new()
 47 |                 .read(true)
 48 |                 .open(path)
 49 |         }
 50 |         // On unix, get the device id from 'df' command
 51 |         fn get_device_id_unix(path: &str) -> String {
 52 |             let output = Command::new("df")
 53 |                  .arg(path)
 54 |                  .output()
 55 |                  .expect("failed to execute 'df --output=source'");
 56 |              let source = String::from_utf8(output.stdout).expect("not utf8");
 57 |              source.split('\n').collect::<Vec<&str>>()[1].split(' ').collect::<Vec<&str>>()[0].to_string()
 58 |          }
 59 | 
 60 |         // On macos, use df and 'diskutil info <device>' to get the Device Block Size line
 61 |         // and extract the size
 62 |         fn get_sector_size_macos(path: &str) -> u64 {
 63 |             let source = get_device_id_unix(path);
 64 |             let output = Command::new("diskutil")
 65 |                 .arg("info")
 66 |                 .arg(source)
 67 |                 .output()
 68 |                 .expect("failed to execute 'diskutil info'");
 69 |             let source = String::from_utf8(output.stdout).expect("not utf8");
 70 |             let mut sector_size: u64 = 0;
 71 |             for line in source.split('\n').collect::<Vec<&str>>() {
 72 |                 if line.trim().starts_with("Device Block Size") {
 73 |                     // e.g. in reverse: "Bytes 512 Size Block Device"
 74 |                     let source = line.rsplit(' ').collect::<Vec<&str>>()[1];
 75 | 
 76 |                     sector_size = source.parse::<u64>().unwrap();
 77 |                 }
 78 |             }
 79 |             if sector_size == 0 {
 80 |                 panic!("Abort: Unable to determine disk physical sector size from diskutil info")
 81 |             }
 82 |             sector_size
 83 |         }
 84 | 
 85 |         // On unix, use df and lsblk to extract the device sector size
 86 |         fn get_sector_size_unix(path: &str) -> u64 {
 87 |             let source = get_device_id_unix(path);
 88 |             let output = Command::new("lsblk")
 89 |                 .arg(source)
 90 |                 .arg("-o")
 91 |                 .arg("PHY-SeC") // I'm strict here, LOG-SeC would do
 92 |                 .output()
 93 |                 .expect("failed to execute 'lsblk -o PHY-SeC'");
 94 | 
 95 |             let sector_size = String::from_utf8(output.stdout).expect("not utf8");
 96 |             let sector_size = sector_size.split('\n').collect::<Vec<&str>>().get(1).unwrap_or_else(|| {
 97 |                 println!("failed to determine sector size, defaulting to 4096.");
 98 |                 &"4096"
 99 |             }).trim();
100 | 
101 |             sector_size.parse::<u64>().unwrap()
102 |         }
103 | 
104 |         pub fn get_sector_size(path: &str) -> u64 {
105 |             if cfg!(target_os = "macos") {
106 |                 get_sector_size_macos(path)
107 |             } else {
108 |                 get_sector_size_unix(path)
109 |             }
110 |         }
111 | 
112 |         pub fn preallocate(file: &Path, size_in_bytes: u64, use_direct_io: bool) {
113 |             let file = if use_direct_io {
114 |                 open_using_direct_io(&file)
115 |             } else {
116 |                 open(&file)
117 |             };
118 |             let file = file.unwrap();
119 |             match file.allocate(size_in_bytes) {
120 |                 Err(errno) => {
121 |                     // Exit if preallocate fails because write_resume_info() assumes
122 |                     // that the file isn't zero sized.
123 |                     println!("\n\nError: couldn't preallocate space for file. {}\n\
124 |                               Probable causes are:\n \
125 |                               * fallocate() is only supported on ext4 filesystems.\n \
126 |                               * Insufficient space.\n", errno);
127 |                     process::exit(1);
128 |                 }
129 |                 Ok(_) => (),
130 |             }
131 |         }
132 | 
133 |         pub fn free_disk_space(path: &str) -> u64 {
134 |             // I don't like the following code, but I had to. It's difficult to estimate the space available for a new file on ext4 due to overhead.
135 |             // Therefor I enforce a 2MB cushion assuming this is sufficient.
136 |             fs2::available_space(Path::new(&path)).unwrap().saturating_sub(2097152)
137 |         }
138 | 
139 |     } else {
140 |         use std::ffi::CString;
141 |         use std::ptr::null_mut;
142 |         use std::iter::once;
143 |         use std::ffi::OsStr;
144 |         use std::os::windows::io::AsRawHandle;
145 |         use std::os::windows::ffi::OsStrExt;
146 |         use std::os::windows::fs::OpenOptionsExt;
147 |         use core::mem::size_of_val;
148 |         use winapi::um::errhandlingapi::GetLastError;
149 |         use winapi::um::fileapi::{GetDiskFreeSpaceA,SetFileValidData};
150 |         use winapi::um::handleapi::CloseHandle;
151 |         use winapi::um::processthreadsapi::{SetThreadIdealProcessor,GetCurrentThread,OpenProcessToken,GetCurrentProcess,SetPriorityClass};
152 |         use winapi::um::securitybaseapi::AdjustTokenPrivileges;
153 |         use winapi::um::winbase::LookupPrivilegeValueW;
154 |         use winapi::um::winnt::{LUID,TOKEN_ADJUST_PRIVILEGES,TOKEN_PRIVILEGES,LUID_AND_ATTRIBUTES,SE_PRIVILEGE_ENABLED,SE_MANAGE_VOLUME_NAME};
155 | 
        // Raw flag values used by the functions below.
        // Passed to `custom_flags` by `open_using_direct_io`.
        const FILE_FLAG_NO_BUFFERING: u32 = 0x2000_0000;
        // Passed to `custom_flags` by `open`.
        const FILE_FLAG_WRITE_THROUGH: u32 = 0x8000_0000;
        // Passed to `SetPriorityClass` by `set_low_prio`.
        const BELOW_NORMAL_PRIORITY_CLASS: u32 = 0x0000_4000;
159 | 
160 |         pub fn open_using_direct_io<P: AsRef<Path>>(path: P) -> io::Result<File> {
161 |             OpenOptions::new()
162 |                 .write(true)
163 |                 .create(true)
164 |                 .custom_flags(FILE_FLAG_NO_BUFFERING)
165 |                 .open(path)
166 |         }
167 | 
168 |         pub fn open<P: AsRef<Path>>(path: P) -> io::Result<File> {
169 |             OpenOptions::new()
170 |                 .write(true)
171 |                 .create(true)
172 |                 .custom_flags(FILE_FLAG_WRITE_THROUGH)
173 |                 .open(path)
174 |         }
175 | 
176 |         pub fn open_r<P: AsRef<Path>>(path: P) -> io::Result<File> {
177 |             OpenOptions::new()
178 |                 .read(true)
179 |                 .open(path)
180 |         }
181 | 
182 |         pub fn preallocate(file: &Path, size_in_bytes: u64, use_direct_io: bool) {
183 |             let mut result = true;
184 |             result &= obtain_priviledge();
185 | 
186 |             let file = if use_direct_io {
187 |                 open_using_direct_io(&file)
188 |             } else {
189 |                 open(&file)
190 |             };
191 |             let file = file.unwrap();
192 | 
193 |             file.set_len(size_in_bytes).unwrap();
194 | 
195 |             if result {
196 |                 let handle = file.as_raw_handle();
197 |                 unsafe{
198 |                     let temp = SetFileValidData(handle, size_in_bytes as i64);
199 |                     result &= temp == 1;
200 |                 }
201 |             }
202 | 
203 |             if !result {
204 |                 println!("FAILED, administrative rights missing");
205 |                 print!("Slow file pre-allocation...");
206 |             }
207 |         }
208 | 
209 |         pub fn obtain_priviledge() -> bool {
210 |             let mut result = true;
211 | 
212 |             let privilege_encoded: Vec<u16> = OsStr::new(SE_MANAGE_VOLUME_NAME)
213 |                 .encode_wide()
214 |                 .chain(once(0))
215 |                 .collect();
216 | 
217 |             let luid = LUID{
218 |                 HighPart: 0i32,
219 |                 LowPart: 0u32
220 | 
221 |             };
222 | 
223 |             unsafe {
224 |                 let mut htoken = null_mut();
225 |                 let mut tp = TOKEN_PRIVILEGES{
226 |                     PrivilegeCount: 1,
227 |                     Privileges: [LUID_AND_ATTRIBUTES{
228 |                     Luid: luid,
229 |                     Attributes: SE_PRIVILEGE_ENABLED,
230 |                     }]
231 |                 };
232 | 
233 |                 let temp = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &mut htoken);
234 |                  result &= temp == 1;
235 | 
236 |                 let temp = LookupPrivilegeValueW(null_mut(), privilege_encoded.as_ptr(), &mut tp.Privileges[0].Luid);
237 |                 result &= temp == 1;
238 | 
239 |                 let temp = AdjustTokenPrivileges(htoken, 0, &mut tp, size_of_val(&tp) as u32, null_mut(), null_mut());
240 | 
241 |                 CloseHandle(htoken);
242 |                 result &= temp == 1;
243 |                 result &=
244 |                     GetLastError() == 0u32
245 |             }
246 |             result
247 |         }
248 | 
249 |         pub fn get_sector_size(path: &str) -> u64 {
250 |             let path_encoded = Path::new(path);
251 |             let parent_path_encoded = CString::new(path_encoded.to_str().unwrap()).unwrap();
252 |             let mut sectors_per_cluster  = 0u32;
253 |             let mut bytes_per_sector  = 0u32;
254 |             let mut number_of_free_cluster  = 0u32;
255 |             let mut total_number_of_cluster  = 0u32;
256 |             if unsafe {
257 |                 GetDiskFreeSpaceA(
258 |                     parent_path_encoded.as_ptr(),
259 |                     &mut sectors_per_cluster,
260 |                     &mut bytes_per_sector,
261 |                     &mut number_of_free_cluster,
262 |                     &mut total_number_of_cluster
263 |                 )
264 |             } == 0  {
265 |                 panic!("get sector size, filename={}",path);
266 |             };
267 |             u64::from(bytes_per_sector)
268 |         }
269 | 
270 |         pub fn set_thread_ideal_processor(id: usize){
271 |             // Set core affinity for current thread.
272 |         unsafe {
273 |             SetThreadIdealProcessor(
274 |                 GetCurrentThread(),
275 |                 id as u32
276 |             );
277 |             }
278 |         }
279 |         pub fn set_low_prio() {
280 |             unsafe{
281 |                 SetPriorityClass(GetCurrentProcess(),BELOW_NORMAL_PRIORITY_CLASS);
282 |             }
283 |         }
284 |         pub fn free_disk_space(path: &str) -> u64 {
285 |             fs2::available_space(Path::new(&path)).unwrap()
286 |         }
287 |     }
288 | }
289 | 
290 | 
291 | 


--------------------------------------------------------------------------------
/src/c/noncegen_256_avx2.c:
--------------------------------------------------------------------------------
  1 | #include "noncegen_256_avx2.h"
  2 | #include <immintrin.h>
  3 | #include <string.h>
  4 | #include "common.h"
  5 | #include "mshabal_256_avx2.h"
  6 | #include "sph_shabal.h"
  7 | 
// Scalar Shabal-256 context, initialised once in init_shabal_avx2() and copied
// into a local context before every single-nonce hash in noncegen_avx2().
sph_shabal_context global_32;
// Multi-lane context initialised by mshabal_init_avx2(); in this file it only
// serves as the source for global_256_fast.
mshabal256_context global_256;
// Copy of global_256's out_size, state and Whigh/Wlow, memcpy'd into a local
// context at the start of every 8-nonce batch in noncegen_avx2().
mshabal256_context_fast global_256_fast;
 11 | 
 12 | void init_shabal_avx2() {
 13 |     sph_shabal256_init(&global_32);
 14 |     mshabal_init_avx2(&global_256, 256);
 15 |     global_256_fast.out_size = global_256.out_size;
 16 |     for (int i = 0; i < 352; i++) global_256_fast.state[i] = global_256.state[i];
 17 |     global_256_fast.Whigh = global_256.Whigh;
 18 |     global_256_fast.Wlow = global_256.Wlow;
 19 | }
 20 | 
// Generates `local_nonces` nonces starting at `local_startnonce` for account
// `numeric_id` and scatters their data into `cache`.
//
// cache:            destination buffer, addressed as rows of `cache_size`
//                   slots of SCOOP_SIZE bytes each
// cache_size:       number of nonce slots per row of `cache` (row stride)
// chunk_offset:     slot offset added to the running nonce index n when
//                   choosing the slot inside each row
// numeric_id:       numeric account id
// local_startnonce: nonce to start generation at
// local_nonces:     number of nonces to generate
//
// Nonces are processed 8 at a time with the AVX2 multi-lane hasher; when fewer
// than 8 nonces remain they are hashed one by one with sph_shabal256.
// NOTE(review): the two malloc() results below are used without a NULL check.
void noncegen_avx2(char *cache, const size_t cache_size, const size_t chunk_offset,
                   const uint64_t numeric_id, const uint64_t local_startnonce,
                   const uint64_t local_nonces) {
    sph_shabal_context local_32;
    uint64_t nonce;
    size_t len;

    mshabal256_context_fast local_256_fast;
    uint64_t nonce1, nonce2, nonce3, nonce4, nonce5, nonce6, nonce7, nonce8;

    char seed[32];  // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero
    char term[32];  // 1bit 1, 255bit of zeros
    char zero[32];  // 256bit of zeros

    write_seed(seed, numeric_id);
    write_term(term);
    memset(&zero[0], 0, 32);

    // working buffers shared by the SIMD and the single-nonce path; both are
    // released at the end of the function
    uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL256_VECTOR_SIZE * NONCE_SIZE);
    uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL256_VECTOR_SIZE * HASH_SIZE);

    // prepare smart SIMD aligned termination strings
    // creation could further be optimized, but not much in it as it only runs once per work package
    // creation could also be moved to plotter start
    union {
        mshabal_u32 words[16 * MSHABAL256_VECTOR_SIZE];
        __m256i data[16];
    } t1, t2, t3;

    // each 32-bit word of seed/term/zero is replicated into 8 consecutive
    // words (one per lane); the "+ 64" entries fill the second half of a block
    for (int j = 0; j < 16 * MSHABAL256_VECTOR_SIZE / 2; j += MSHABAL256_VECTOR_SIZE) {
        size_t o = j / 2;
        // t1
        t1.words[j + 0] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 1] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 2] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 3] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 4] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 5] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 6] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 7] = *(mshabal_u32 *)(seed + o);
        t1.words[j + 0 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 1 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 2 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 3 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 4 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 5 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 6 + 64] = *(mshabal_u32 *)(zero + o);
        t1.words[j + 7 + 64] = *(mshabal_u32 *)(zero + o);
        // t2
        // (first 256bit skipped, will later be filled with data)
        t2.words[j + 0 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 1 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 2 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 3 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 4 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 5 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 6 + 64] = *(mshabal_u32 *)(seed + o);
        t2.words[j + 7 + 64] = *(mshabal_u32 *)(seed + o);
        // t3
        t3.words[j + 0] = *(mshabal_u32 *)(term + o);
        t3.words[j + 1] = *(mshabal_u32 *)(term + o);
        t3.words[j + 2] = *(mshabal_u32 *)(term + o);
        t3.words[j + 3] = *(mshabal_u32 *)(term + o);
        t3.words[j + 4] = *(mshabal_u32 *)(term + o);
        t3.words[j + 5] = *(mshabal_u32 *)(term + o);
        t3.words[j + 6] = *(mshabal_u32 *)(term + o);
        t3.words[j + 7] = *(mshabal_u32 *)(term + o);
        t3.words[j + 0 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 1 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 2 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 3 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 4 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 5 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 6 + 64] = *(mshabal_u32 *)(zero + o);
        t3.words[j + 7 + 64] = *(mshabal_u32 *)(zero + o);
    }

    // n is advanced inside the loop body: by 8 in the SIMD branch, by 1 otherwise
    for (uint64_t n = 0; n < local_nonces;) {
        // iterate nonces (8 per cycle - avx2)
        // min 8 nonces left for avx 2 processing, otherwise SISD
        if (n + 8 <= local_nonces) {
            // generate nonce numbers & change endianness
            nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0));
            nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1));
            nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2));
            nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3));
            nonce5 = bswap_64((uint64_t)(local_startnonce + n + 4));
            nonce6 = bswap_64((uint64_t)(local_startnonce + n + 5));
            nonce7 = bswap_64((uint64_t)(local_startnonce + n + 6));
            nonce8 = bswap_64((uint64_t)(local_startnonce + n + 7));

            // store nonce numbers in relevant termination strings
            // (j = 16 and j = 24, i.e. byte offsets 0 and 4 of each nonce; lane k
            // receives nonce k+1)
            for (int j = 16; j < 16 * MSHABAL256_VECTOR_SIZE / 4; j += MSHABAL256_VECTOR_SIZE) {
                size_t o = j / 2 - 8;
                // t1
                t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o);
                t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o);
                t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o);
                t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o);
                t1.words[j + 4] = *(mshabal_u32 *)((char *)&nonce5 + o);
                t1.words[j + 5] = *(mshabal_u32 *)((char *)&nonce6 + o);
                t1.words[j + 6] = *(mshabal_u32 *)((char *)&nonce7 + o);
                t1.words[j + 7] = *(mshabal_u32 *)((char *)&nonce8 + o);
                t2.words[j + 0 + 64] = *(mshabal_u32 *)((char *)&nonce1 + o);
                t2.words[j + 1 + 64] = *(mshabal_u32 *)((char *)&nonce2 + o);
                t2.words[j + 2 + 64] = *(mshabal_u32 *)((char *)&nonce3 + o);
                t2.words[j + 3 + 64] = *(mshabal_u32 *)((char *)&nonce4 + o);
                t2.words[j + 4 + 64] = *(mshabal_u32 *)((char *)&nonce5 + o);
                t2.words[j + 5 + 64] = *(mshabal_u32 *)((char *)&nonce6 + o);
                t2.words[j + 6 + 64] = *(mshabal_u32 *)((char *)&nonce7 + o);
                t2.words[j + 7 + 64] = *(mshabal_u32 *)((char *)&nonce8 + o);
            }

            // start shabal round

            // 3 cases: first 128 rounds uses case 1 or 2, after that case 3
            // case 1: first 128 rounds, hashes are even: use termination string 1
            // case 2: first 128 rounds, hashes are odd: use termination string 2
            // case 3: round > 128: use termination string 3

            // round 1
            memcpy(&local_256_fast, &global_256_fast,
                   sizeof(global_256_fast));  // fast initialize shabal

            // 16 >> 6 evaluates to 0; the last argument is presumably the number
            // of full 64-byte data blocks preceding the termination block —
            // confirm against mshabal_hash_fast_avx2
            mshabal_hash_fast_avx2(
                &local_256_fast, NULL, &t1,
                &buffer[MSHABAL256_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6);

            // store first hash into smart termination string 2 (data is vectored and SIMD aligned)
            memcpy(&t2, &buffer[MSHABAL256_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)],
                   MSHABAL256_VECTOR_SIZE * (HASH_SIZE));

            // round 2 - 128
            // the buffer is filled from its end towards its start, one hash per step
            for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) {
                // check if msg can be divided into 512bit packages without a
                // remainder
                if (i % 64 == 0) {
                    // last msg = seed + termination
                    mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE],
                                              &t1,
                                              &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE],
                                              (NONCE_SIZE + 16 - i) >> 6);
                } else {
                    // last msg = 256 bit data + seed + termination
                    mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE],
                                              &t2,
                                              &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE],
                                              (NONCE_SIZE + 16 - i) >> 6);
                }
            }

            // round 128-8192
            for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) {
                mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE], &t3,
                                          &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE],
                                          (HASH_CAP) >> 6);
            }

            // generate final hash
            mshabal_hash_fast_avx2(&local_256_fast, &buffer[0], &t1, &final[0],
                                      (NONCE_SIZE + 16) >> 6);

            // XOR using SIMD
            // load final hash
            __m256i F[8];
            for (int j = 0; j < 8; j++) F[j] = _mm256_loadu_si256((__m256i *)final + j);
            // xor all hashes with final hash
            // (every 8 consecutive 256-bit vectors of the buffer are XORed with F[0..7])
            for (int j = 0; j < 8 * 2 * HASH_CAP; j++)
                _mm256_storeu_si256(
                    (__m256i *)buffer + j,
                    _mm256_xor_si256(_mm256_loadu_si256((__m256i *)buffer + j), F[j % 8]));

            // todo: fork SIMD aligned plot file here

            // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache
            // i indexes 32-byte halves: an even i goes to row i/2, an odd i goes
            // to the mirrored row 4095 - i/2 at byte offset 32; k selects the
            // lane, i.e. the nonce n + k
            for (int i = 0; i < NUM_SCOOPS * 2; i++) {
                for (int j = 0; j < 32; j += 4) {
                    for (int k = 0; k < MSHABAL256_VECTOR_SIZE; k += 1) {
                    memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) *
                                      SCOOP_SIZE * cache_size +
                                  (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j],
                           &buffer[(i * 32 + j) * MSHABAL256_VECTOR_SIZE + k * 4], 4);
                    }
                }
            }

            n += 8;
        } else {
            // if less than 8 nonces left, use 1d-shabal
            int8_t *xv = (int8_t *)&numeric_id;

            // append the account id and the nonce, each byte-reversed, right
            // after the NONCE_SIZE bytes of hash data
            for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i];

            nonce = local_startnonce + n;
            xv = (int8_t *)&nonce;

            for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i];

            // fill the buffer back to front; each hash covers everything after
            // it, capped at HASH_CAP bytes
            for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) {
                memcpy(&local_32, &global_32, sizeof(global_32));
                ;
                if (i < NONCE_SIZE + 16 - HASH_CAP)
                    len = HASH_CAP;
                else
                    len = NONCE_SIZE + 16 - i;

                sph_shabal256(&local_32, &buffer[i], len);
                sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]);
            }

            // final hash over the whole buffer including the 16 seed bytes
            memcpy(&local_32, &global_32, sizeof(global_32));
            sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE);
            sph_shabal256_close(&local_32, final);

            // XOR with final
            for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]);

            // Sort them PoC2:
            // the first HASH_SIZE bytes of chunk i go to row i, the following
            // HASH_SIZE bytes go to the mirrored row 4095 - i at byte offset 32
            for (size_t i = 0; i < HASH_CAP; i++){
                memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE);
                memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE);
            }
            n++;
        }
    }
    free(buffer);
    free(final);
}
255 | 


--------------------------------------------------------------------------------
/src/plotter.rs:
--------------------------------------------------------------------------------
  1 | use humanize_rs::bytes::Bytes;
  2 | use pbr::{MultiBar, Units};
  3 | use raw_cpuid::CpuId;
  4 | 
  5 | use crate::cpu_hasher::{SimdExtension,init_simd};
  6 | use crate::buffer::PageAlignedByteBuffer;
  7 | #[cfg(feature = "opencl")]
  8 | use crate::ocl::gpu_get_info;
  9 | use crate::scheduler::create_scheduler_thread;
 10 | #[cfg(windows)]
 11 | use crate::utils::set_thread_ideal_processor;
 12 | use crate::utils::{free_disk_space, get_sector_size, preallocate};
 13 | use crate::writer::{create_writer_thread, read_resume_info, write_resume_info};
 14 | use core_affinity;
 15 | use crossbeam_channel::bounded;
 16 | use std::cmp::{max, min};
 17 | use std::path::Path;
 18 | use std::process;
 19 | use std::sync::Arc;
 20 | use std::thread;
 21 | use stopwatch::Stopwatch;
 22 | 
 23 | pub const SCOOP_SIZE: u64 = 64; // bytes per scoop
 24 | pub const NUM_SCOOPS: u64 = 4096; // scoops per nonce
 25 | pub const NONCE_SIZE: u64 = SCOOP_SIZE * NUM_SCOOPS; // bytes per nonce (64 * 4096 = 256 KiB)
 26 | 
 27 | pub struct Plotter {} // stateless handle; `run` executes one PlotterTask
 28 | 
 29 | pub struct PlotterTask { // parameters of one plotting job, consumed by Plotter::run
 30 |     pub numeric_id: u64, // account id; first component of the plot file name
 31 |     pub start_nonce: u64, // first nonce; second component of the plot file name
 32 |     pub nonces: u64, // number of nonces; 0 = derive it from the free disk space
 33 |     pub output_path: String, // directory in which the plot file is created
 34 |     pub mem: String, // memory limit, parsed as a size such as "10GiB"; 0 = no limit
 35 |     pub cpu_threads: u8, // number of threads in the rayon hashing pool
 36 |     pub gpus: Option<Vec<String>>, // GPU selection; Some(..) switches on GPU memory accounting
 37 |     pub direct_io: bool, // round the nonce count down to whole sectors for direct i/o
 38 |     pub async_io: bool, // use two buffers instead of one
 39 |     pub quiet: bool, // suppress informational output and progress bars
 40 |     pub benchmark: bool, // skip the disk-space check and the file pre-allocation
 41 |     pub zcb: bool, // opencl builds: when false, the GPU memory requirement is halved
 42 | }
 43 | 
 44 | impl Plotter {
 45 |     /// Creates a plotter; the type carries no state of its own.
 46 |     pub fn new() -> Plotter {
 47 |         Plotter {}
 48 |     }
 49 | 
 50 |     /// Runs one complete plotting job described by `task`.
 51 |     ///
 52 |     /// Prints system and job information (unless `task.quiet`), sizes the job to the
 53 |     /// target disk, pre-allocates or resumes the plot file and then runs a hasher
 54 |     /// thread and a writer thread that hand buffers to each other over two bounded
 55 |     /// channels. `task.nonces` may be adjusted: 0 means "fill the free disk space",
 56 |     /// and with direct i/o the count is rounded down to whole sectors. Setup errors
 57 |     /// are printed to stdout and make the function return early.
 58 |     pub fn run(self, mut task: PlotterTask) {
 59 |         let cpuid = CpuId::new();
 60 |         let cpu_name = cpuid.get_extended_function_info().unwrap();
 61 |         let cpu_name = cpu_name.processor_brand_string().unwrap().trim();
 62 |         let cores = sys_info::cpu_num().unwrap();
 63 |         let memory = sys_info::mem_info().unwrap();
 64 |         let simd_ext = init_simd();
 65 | 
 66 |         if !task.quiet {
 67 |             println!("Engraver {} - PoC2 Plotter\n", crate_version!());
 68 |             if task.benchmark {
 69 |                 println!("*BENCHMARK MODE*\n");
 70 |             }
 71 |             println!(
 72 |                 "CPU: {} [using {} of {} cores{}{:?}]",
 73 |                 cpu_name,
 74 |                 task.cpu_threads,
 75 |                 cores,
 76 |                 if let SimdExtension::None = &simd_ext { "" } else { " + " },
 77 |                 &simd_ext
 78 |             );
 79 |         }
 80 | 
 81 |         #[cfg(not(feature = "opencl"))]
 82 |         let gpu_mem_needed = 0u64;
 83 |         #[cfg(feature = "opencl")]
 84 |         let gpu_mem_needed = match &task.gpus {
 85 |             Some(x) => gpu_get_info(&x, task.quiet),
 86 |             None => 0,
 87 |         };
 88 |         // the requirement is halved unless `task.zcb` is set
 89 |         #[cfg(feature = "opencl")]
 90 |         let gpu_mem_needed = if task.zcb { gpu_mem_needed } else { gpu_mem_needed / 2 };
 91 | 
 92 |         // Validate the target directory before asking the OS about it: the free-space
 93 |         // and sector-size queries below are given this very path.
 94 |         if !Path::new(&task.output_path).exists() {
 95 |             println!(
 96 |                 "Error: specified target path does not exist, path={}",
 97 |                 &task.output_path
 98 |             );
 99 |             println!("Shutting down...");
100 |             return;
101 |         }
102 | 
103 |         // use all available disk space if nonce parameter has been omitted
104 |         let free_disk_space = free_disk_space(&task.output_path);
105 |         if task.nonces == 0 {
106 |             task.nonces = free_disk_space / NONCE_SIZE;
107 |         }
108 | 
109 |         let gpu = task.gpus.is_some();
110 | 
111 |         // align number of nonces with sector size if direct i/o
112 |         let mut rounded_nonces_to_sector_size = false;
113 |         let mut nonces_per_sector = 1;
114 |         if task.direct_io {
115 |             let sector_size = get_sector_size(&task.output_path);
116 |             nonces_per_sector = sector_size / SCOOP_SIZE;
117 |             if task.nonces % nonces_per_sector > 0 {
118 |                 rounded_nonces_to_sector_size = true;
119 |                 task.nonces -= task.nonces % nonces_per_sector;
120 |             }
121 |         }
122 | 
123 |         let plotsize = task.nonces * NONCE_SIZE;
124 |         let file = Path::new(&task.output_path).join(format!(
125 |             "{}_{}_{}",
126 |             task.numeric_id, task.start_nonce, task.nonces
127 |         ));
128 | 
129 |         // check available disk space
130 |         if free_disk_space < plotsize && !file.exists() && !task.benchmark {
131 |             println!(
132 |                 "Error: insufficient disk space, MiB_required={:.2}, MiB_available={:.2}",
133 |                 plotsize as f64 / 1024.0 / 1024.0,
134 |                 free_disk_space as f64 / 1024.0 / 1024.0
135 |             );
136 |             println!("Shutting down...");
137 |             return;
138 |         }
139 | 
140 |         // calculate memory usage
141 |         let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector, gpu, gpu_mem_needed)
142 |         {
143 |             Ok(x) => x,
144 |             Err(_) => return,
145 |         };
146 | 
147 |         if !task.quiet {
148 |             // the sys_info figures are treated as KiB here: one division fewer than the byte counts
149 |             println!(
150 |                 "RAM: Total={:.2} GiB, Free={:.2} GiB, Usage={:.2} GiB",
151 |                 memory.total as f64 / 1024.0 / 1024.0,
152 |                 get_avail_mem(&memory) as f64 / 1024.0 / 1024.0,
153 |                 (mem + gpu_mem_needed) as f64 / 1024.0 / 1024.0 / 1024.0
154 |             );
155 | 
156 |             #[cfg(feature = "opencl")]
157 |             println!(
158 |                 "     HDDcache={:.2} GiB, GPUcache={:.2} GiB,\n",
159 |                 mem as f64 / 1024.0 / 1024.0 / 1024.0,
160 |                 gpu_mem_needed as f64 / 1024.0 / 1024.0 / 1024.0
161 |             );
162 | 
163 |             println!("Numeric ID:  {}", task.numeric_id);
164 |             println!("Start Nonce: {}", task.start_nonce);
165 |             println!(
166 |                 "Nonces:      {}{}",
167 |                 task.nonces,
168 |                 if rounded_nonces_to_sector_size {
169 |                     " (rounded to sector size for fast direct i/o)"
170 |                 } else {
171 |                     ""
172 |                 }
173 |             );
174 |             println!("Output File: {}\n", file.display());
175 |         }
176 | 
177 |         // nonce offset to resume from; stays 0 for a fresh plot
178 |         let mut progress = 0;
179 |         if file.exists() {
180 |             if !task.quiet {
181 |                 println!("File already exists, reading resume info...");
182 |             }
183 |             progress = match read_resume_info(&file) {
184 |                 Ok(x) => x,
185 |                 Err(_) => {
186 |                     println!("Error: couldn't read resume info from file '{}'", file.display());
187 |                     println!("If you are sure that this file is incomplete \
188 |                               or corrupted, then delete it before continuing.");
189 |                     println!("Shutting Down...");
190 |                     return;
191 |                 }
192 |             };
193 |             // a resume offset beyond the plot would underflow the size arithmetic below
194 |             if progress > task.nonces {
195 |                 println!("Error: resume info exceeds plot size, progress={}, nonces={}", progress, task.nonces);
196 |                 println!("Shutting Down...");
197 |                 return;
198 |             }
199 |             if !task.quiet {
200 |                 println!("OK");
201 |             }
202 |         } else {
203 |             if !task.quiet {
204 |                 print!("Fast file pre-allocation...");
205 |                 // stdout is line-buffered; flush so the message shows up before the work starts
206 |                 let _ = std::io::Write::flush(&mut std::io::stdout());
207 |             }
208 |             if !task.benchmark {
209 |                 preallocate(&file, plotsize, task.direct_io);
210 |                 if write_resume_info(&file, 0u64).is_err() {
211 |                     println!("Error: couldn't write resume info");
212 |                 }
213 |             }
214 |             if !task.quiet {
215 |                 println!("OK");
216 |             }
217 |         }
218 | 
219 |         if !task.quiet {
220 |             if progress == 0 {
221 |                 println!("Starting plotting...\n");
222 |             } else {
223 |                 println!("Resuming plotting from nonce offset {}...\n", progress);
224 |             }
225 |         }
226 | 
227 |         // determine buffer size
228 |         let num_buffer = if task.async_io { 2 } else { 1 };
229 |         let buffer_size = mem / num_buffer;
230 |         let (tx_empty_buffers, rx_empty_buffers) = bounded(num_buffer as usize);
231 |         let (tx_full_buffers, rx_full_buffers) = bounded(num_buffer as usize);
232 | 
233 |         for _ in 0..num_buffer {
234 |             let buffer = PageAlignedByteBuffer::new(buffer_size as usize);
235 |             tx_empty_buffers.send(buffer).unwrap();
236 |         }
237 | 
238 |         let mb = MultiBar::new();
239 |         // both bars span only the part of the plot that still has to be produced
240 |         let remaining_bytes = plotsize - progress * NONCE_SIZE;
241 |         let new_bar = |label: &str| {
242 |             let mut bar = mb.create_bar(remaining_bytes);
243 |             bar.format("│██░│");
244 |             bar.set_units(Units::Bytes);
245 |             bar.message(label);
246 |             bar.show_counter = false;
247 |             bar.set(0);
248 |             bar
249 |         };
250 |         let (p1x, p2x) = if task.quiet {
251 |             (None, None)
252 |         } else {
253 |             (Some(new_bar("Hashing: ")), Some(new_bar("Writing: ")))
254 |         };
255 | 
256 |         let sw = Stopwatch::start_new();
257 |         let task = Arc::new(task);
258 | 
259 |         // hi bold! might make this optional in future releases.
260 |         let thread_pinning = true;
261 |         let core_ids = if thread_pinning {
262 |             core_affinity::get_core_ids().unwrap()
263 |         } else {
264 |             Vec::new()
265 |         };
266 | 
267 |         // the hasher takes empty buffers and emits full ones; the writer does the reverse
268 |         let hasher = thread::spawn({
269 |             create_scheduler_thread(
270 |                 task.clone(),
271 |                 rayon::ThreadPoolBuilder::new()
272 |                     .num_threads(task.cpu_threads as usize)
273 |                     .start_handler(move |id| {
274 |                         if thread_pinning {
275 |                             #[cfg(not(windows))]
276 |                             let core_id = core_ids[id % core_ids.len()];
277 |                             #[cfg(not(windows))]
278 |                             core_affinity::set_for_current(core_id);
279 |                             #[cfg(windows)]
280 |                             set_thread_ideal_processor(id % core_ids.len());
281 |                         }
282 |                     })
283 |                     .build()
284 |                     .unwrap(),
285 |                 progress,
286 |                 p1x,
287 |                 rx_empty_buffers.clone(),
288 |                 tx_full_buffers.clone(),
289 |                 simd_ext,
290 |             )
291 |         });
292 | 
293 |         let writer = thread::spawn({
294 |             create_writer_thread(
295 |                 task.clone(),
296 |                 progress,
297 |                 p2x,
298 |                 rx_full_buffers.clone(),
299 |                 tx_empty_buffers.clone(),
300 |             )
301 |         });
302 | 
303 |         if !task.quiet {
304 |             mb.listen();
305 |         }
306 |         writer.join().unwrap();
307 |         hasher.join().unwrap();
308 | 
309 |         let elapsed = sw.elapsed_ms() as u64;
310 |         let hours = elapsed / 1000 / 60 / 60;
311 |         let minutes = elapsed / 1000 / 60 - hours * 60;
312 |         let seconds = elapsed / 1000 - hours * 60 * 60 - minutes * 60;
313 | 
314 |         if !task.quiet {
315 |             println!(
316 |                 "\nGenerated {} nonces in {}h{:02}m{:02}s, {:.2} MiB/s, {:.0} nonces/m.",
317 |                 task.nonces - progress,
318 |                 hours,
319 |                 minutes,
320 |                 seconds,
321 |                 (task.nonces - progress) as f64 * 1000.0 / (elapsed as f64 + 1.0) / 4.0,
322 |                 (task.nonces - progress) as f64 * 1000.0 / (elapsed as f64 + 1.0) * 60.0
323 |             );
324 |         }
325 |     }
326 | }
327 | 
328 | /// Works out how many bytes of host memory the plot buffers may occupy.
329 | ///
330 | /// `task.mem` is parsed as a byte size (0 = no limit, i.e. the plot size), capped by plot
331 | /// size and available memory, rounded down to a multiple of `num_buffer * NONCE_SIZE *
332 | /// nonces_per_sector` and raised to at least one such unit. Returns `Err` after printing
333 | /// usage hints if `task.mem` does not parse; exits the process if a GPU job's explicit
334 | /// limit cannot hold the GPU share plus one sector's worth of nonces.
335 | fn calculate_mem_to_use(
336 |     task: &PlotterTask,
337 |     memory: &sys_info::MemInfo,
338 |     nonces_per_sector: u64,
339 |     gpu: bool,
340 |     gpu_mem_needed: u64,
341 | ) -> Result<u64, &'static str> {
342 |     let plotsize = task.nonces * NONCE_SIZE;
343 | 
344 |     let mut mem = match task.mem.parse::<Bytes>() {
345 |         Ok(x) => x.size() as u64,
346 |         Err(_) => {
347 |             println!("Error: Can't parse memory limit parameter, input={}", task.mem);
348 |             println!("\nPlease specify a number followed by a unit. If no unit is provided, bytes will be assumed.");
349 |             println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB");
350 |             println!("Example: --mem 10GiB\n");
351 |             println!("Shutting down...");
352 |             return Err("invalid unit");
353 |         }
354 |     };
355 | 
356 |     // with a GPU, the GPU share (gpu_mem_needed) is taken out of an explicit limit
357 |     if gpu && mem > 0 {
358 |         if mem < gpu_mem_needed + nonces_per_sector * NONCE_SIZE {
359 |             println!("Error: Insufficient host memory for GPU plotting!");
360 |             println!("Shutting down...");
361 |             process::exit(0);
362 |         }
363 |         mem -= gpu_mem_needed;
364 |     }
365 |     if mem == 0 {
366 |         mem = plotsize;
367 |     }
368 |     mem = min(mem, plotsize + gpu_mem_needed);
369 | 
370 |     // opencl requires buffer to be a multiple of 16 (data coalescence magic)
371 |     let nonces_per_sector = if gpu { max(16, nonces_per_sector) } else { nonces_per_sector };
372 | 
373 |     // don't exceed free memory and leave some elbow room 1-1000/1024; saturate so that a
374 |     // GPU share larger than the free memory cannot underflow the subtraction
375 |     mem = min(mem, (get_avail_mem(memory) * 1000).saturating_sub(gpu_mem_needed));
376 | 
377 |     // rounding single/double buffer
378 |     let num_buffer = if task.async_io { 2 } else { 1 };
379 |     let unit = num_buffer * NONCE_SIZE * nonces_per_sector;
380 |     mem -= mem % unit;
381 | 
382 |     // ensure a minimum buffer
383 |     Ok(max(mem, unit))
384 | }
385 | 
386 | // sys_info shows 0 for `avail` on Windows, so the Windows build reads `free` instead.
387 | #[cfg(not(windows))]
388 | fn get_avail_mem(memory: &sys_info::MemInfo) -> u64 {
389 |     memory.avail
390 | }
391 | 
392 | #[cfg(windows)]
393 | fn get_avail_mem(memory: &sys_info::MemInfo) -> u64 {
394 |     memory.free
395 | }
396 | 


--------------------------------------------------------------------------------
/src/c/noncegen_512_avx512f.c:
--------------------------------------------------------------------------------
  1 | #include "noncegen_512_avx512f.h"
  2 | #include <immintrin.h>
  3 | #include <string.h>
  4 | #include "common.h"
  5 | #include "mshabal_512_avx512f.h"
  6 | #include "sph_shabal.h"
  7 | 
  8 | sph_shabal_context global_32;             // scalar context, set up in init_shabal_avx512f()
  9 | mshabal512_context global_512;            // multi-lane context, set up in init_shabal_avx512f()
 10 | mshabal512_context_fast global_512_fast;  // copy of global_512; memcpy'd into the per-call context
 11 | 
 12 | void init_shabal_avx512f() {
 13 |     sph_shabal256_init(&global_32);          // scalar Shabal-256 context
 14 |     mshabal_init_avx512f(&global_512, 256);  // multi-lane (mshabal512) context
 15 |     for (int w = 0; w < 704; ++w) global_512_fast.state[w] = global_512.state[w];
 16 |     global_512_fast.Wlow = global_512.Wlow;
 17 |     global_512_fast.Whigh = global_512.Whigh;
 18 |     global_512_fast.out_size = global_512.out_size;  // state, W counter and out_size now copied
 19 | }
 20 | 
 21 | // cache:            cache to save to
 22 | // cache_size:       nonce slots per scoop row of cache; chunk_offset: first slot to fill
 23 | // numeric_id:       numeric account id
 24 | // local_startnonce: nonce to start generation at
 25 | // local_nonces:     number of nonces to generate
 26 | void noncegen_avx512(char *cache, const size_t cache_size, const size_t chunk_offset,
 27 |                    const uint64_t numeric_id, const uint64_t local_startnonce,
 28 |                    const uint64_t local_nonces) {
 29 |     sph_shabal_context local_32;
 30 |     uint64_t nonce;
 31 |     size_t len;
 32 | 
 33 |     mshabal512_context_fast local_512_fast;
 34 |     uint64_t nonce1, nonce2, nonce3, nonce4, nonce5, nonce6, nonce7, nonce8, nonce9, nonce10, nonce11, nonce12, nonce13, nonce14, nonce15, nonce16;
 35 | 
 36 |     char seed[32];  // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero
 37 |     char term[32];  // 1bit 1, 255bit of zeros
 38 |     char zero[32];  // 256bit of zeros
 39 | 
 40 |     //vars shared
 41 |     uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL512_VECTOR_SIZE * NONCE_SIZE);
 42 |     uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL512_VECTOR_SIZE * HASH_SIZE);
 43 |     
 44 |     write_seed(seed, numeric_id);
 45 |     write_term(term);
 46 |     memset(&zero[0], 0, 32);
 47 | 
 48 |     // prepare smart SIMD aligned termination strings
 49 |     // creation could further be optimized, but not much in it as it only runs once per work package
 50 |     // creation could also be moved to plotter start
 51 |     union {
 52 |         mshabal_u32 words[16 * MSHABAL512_VECTOR_SIZE];
 53 |         __m512i data[16];
 54 |     } t1, t2, t3;
 55 | 
 56 |     for (int j = 0; j < 16 * MSHABAL512_VECTOR_SIZE / 2; j += MSHABAL512_VECTOR_SIZE) {
 57 |         size_t o = j / 4;
 58 |         // t1
 59 |         t1.words[j + 0] = *(mshabal_u32 *)(seed + o);
 60 |         t1.words[j + 1] = *(mshabal_u32 *)(seed + o);
 61 |         t1.words[j + 2] = *(mshabal_u32 *)(seed + o);
 62 |         t1.words[j + 3] = *(mshabal_u32 *)(seed + o);
 63 |         t1.words[j + 4] = *(mshabal_u32 *)(seed + o);
 64 |         t1.words[j + 5] = *(mshabal_u32 *)(seed + o);
 65 |         t1.words[j + 6] = *(mshabal_u32 *)(seed + o);
 66 |         t1.words[j + 7] = *(mshabal_u32 *)(seed + o);
 67 |         t1.words[j + 8] = *(mshabal_u32 *)(seed + o);
 68 |         t1.words[j + 9] = *(mshabal_u32 *)(seed + o);
 69 |         t1.words[j + 10] = *(mshabal_u32 *)(seed + o);
 70 |         t1.words[j + 11] = *(mshabal_u32 *)(seed + o);
 71 |         t1.words[j + 12] = *(mshabal_u32 *)(seed + o);
 72 |         t1.words[j + 13] = *(mshabal_u32 *)(seed + o);
 73 |         t1.words[j + 14] = *(mshabal_u32 *)(seed + o);
 74 |         t1.words[j + 15] = *(mshabal_u32 *)(seed + o);        
 75 |         t1.words[j + 0 + 128] = *(mshabal_u32 *)(zero + o);
 76 |         t1.words[j + 1 + 128] = *(mshabal_u32 *)(zero + o);
 77 |         t1.words[j + 2 + 128] = *(mshabal_u32 *)(zero + o);
 78 |         t1.words[j + 3 + 128] = *(mshabal_u32 *)(zero + o);
 79 |         t1.words[j + 4 + 128] = *(mshabal_u32 *)(zero + o);
 80 |         t1.words[j + 5 + 128] = *(mshabal_u32 *)(zero + o);
 81 |         t1.words[j + 6 + 128] = *(mshabal_u32 *)(zero + o);
 82 |         t1.words[j + 7 + 128] = *(mshabal_u32 *)(zero + o);
 83 |         t1.words[j + 8 + 128] = *(mshabal_u32 *)(zero + o);
 84 |         t1.words[j + 9 + 128] = *(mshabal_u32 *)(zero + o);
 85 |         t1.words[j + 10 + 128] = *(mshabal_u32 *)(zero + o);
 86 |         t1.words[j + 11 + 128] = *(mshabal_u32 *)(zero + o);
 87 |         t1.words[j + 12 + 128] = *(mshabal_u32 *)(zero + o);
 88 |         t1.words[j + 13 + 128] = *(mshabal_u32 *)(zero + o);
 89 |         t1.words[j + 14 + 128] = *(mshabal_u32 *)(zero + o);
 90 |         t1.words[j + 15 + 128] = *(mshabal_u32 *)(zero + o);
 91 |         // t2
 92 |         // (first 256bit skipped, will later be filled with data)
 93 |         t2.words[j + 0 + 128] = *(mshabal_u32 *)(seed + o);
 94 |         t2.words[j + 1 + 128] = *(mshabal_u32 *)(seed + o);
 95 |         t2.words[j + 2 + 128] = *(mshabal_u32 *)(seed + o);
 96 |         t2.words[j + 3 + 128] = *(mshabal_u32 *)(seed + o);
 97 |         t2.words[j + 4 + 128] = *(mshabal_u32 *)(seed + o);
 98 |         t2.words[j + 5 + 128] = *(mshabal_u32 *)(seed + o);
 99 |         t2.words[j + 6 + 128] = *(mshabal_u32 *)(seed + o);
100 |         t2.words[j + 7 + 128] = *(mshabal_u32 *)(seed + o);
101 |         t2.words[j + 8 + 128] = *(mshabal_u32 *)(seed + o);
102 |         t2.words[j + 9 + 128] = *(mshabal_u32 *)(seed + o);
103 |         t2.words[j + 10 + 128] = *(mshabal_u32 *)(seed + o);
104 |         t2.words[j + 11 + 128] = *(mshabal_u32 *)(seed + o);
105 |         t2.words[j + 12 + 128] = *(mshabal_u32 *)(seed + o);
106 |         t2.words[j + 13 + 128] = *(mshabal_u32 *)(seed + o);
107 |         t2.words[j + 14 + 128] = *(mshabal_u32 *)(seed + o);
108 |         t2.words[j + 15 + 128] = *(mshabal_u32 *)(seed + o);
109 |         // t3
110 |         t3.words[j + 0] = *(mshabal_u32 *)(term + o);
111 |         t3.words[j + 1] = *(mshabal_u32 *)(term + o);
112 |         t3.words[j + 2] = *(mshabal_u32 *)(term + o);
113 |         t3.words[j + 3] = *(mshabal_u32 *)(term + o);
114 |         t3.words[j + 4] = *(mshabal_u32 *)(term + o);
115 |         t3.words[j + 5] = *(mshabal_u32 *)(term + o);
116 |         t3.words[j + 6] = *(mshabal_u32 *)(term + o);
117 |         t3.words[j + 7] = *(mshabal_u32 *)(term + o);        
118 |         t3.words[j + 8] = *(mshabal_u32 *)(term + o);
119 |         t3.words[j + 9] = *(mshabal_u32 *)(term + o);
120 |         t3.words[j + 10] = *(mshabal_u32 *)(term + o);
121 |         t3.words[j + 11] = *(mshabal_u32 *)(term + o);
122 |         t3.words[j + 12] = *(mshabal_u32 *)(term + o);
123 |         t3.words[j + 13] = *(mshabal_u32 *)(term + o);
124 |         t3.words[j + 14] = *(mshabal_u32 *)(term + o);
125 |         t3.words[j + 15] = *(mshabal_u32 *)(term + o);
126 |         
127 |         t3.words[j + 0 + 128] = *(mshabal_u32 *)(zero + o);
128 |         t3.words[j + 1 + 128] = *(mshabal_u32 *)(zero + o);
129 |         t3.words[j + 2 + 128] = *(mshabal_u32 *)(zero + o);
130 |         t3.words[j + 3 + 128] = *(mshabal_u32 *)(zero + o);
131 |         t3.words[j + 4 + 128] = *(mshabal_u32 *)(zero + o);
132 |         t3.words[j + 5 + 128] = *(mshabal_u32 *)(zero + o);
133 |         t3.words[j + 6 + 128] = *(mshabal_u32 *)(zero + o);
134 |         t3.words[j + 7 + 128] = *(mshabal_u32 *)(zero + o);
135 |         t3.words[j + 8 + 128] = *(mshabal_u32 *)(zero + o);
136 |         t3.words[j + 9 + 128] = *(mshabal_u32 *)(zero + o);
137 |         t3.words[j + 10 + 128] = *(mshabal_u32 *)(zero + o);
138 |         t3.words[j + 11 + 128] = *(mshabal_u32 *)(zero + o);
139 |         t3.words[j + 12 + 128] = *(mshabal_u32 *)(zero + o);
140 |         t3.words[j + 13 + 128] = *(mshabal_u32 *)(zero + o);
141 |         t3.words[j + 14 + 128] = *(mshabal_u32 *)(zero + o);
142 |         t3.words[j + 15 + 128] = *(mshabal_u32 *)(zero + o);
143 |     }
144 | 
145 |     for (uint64_t n = 0; n < local_nonces;) {
146 |         // iterate nonces (16 per cycle - avx512)
147 |         // min 16 nonces left for avx512 processing, otherwise SISD
148 |         if (n + 16 <= local_nonces) {
149 |             // generate nonce numbers & change endianness
150 |             nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0));
151 |             nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1));
152 |             nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2));
153 |             nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3));
154 |             nonce5 = bswap_64((uint64_t)(local_startnonce + n + 4));
155 |             nonce6 = bswap_64((uint64_t)(local_startnonce + n + 5));
156 |             nonce7 = bswap_64((uint64_t)(local_startnonce + n + 6));
157 |             nonce8 = bswap_64((uint64_t)(local_startnonce + n + 7));
158 |             nonce9 = bswap_64((uint64_t)(local_startnonce + n + 8));
159 |             nonce10 = bswap_64((uint64_t)(local_startnonce + n + 9));
160 |             nonce11 = bswap_64((uint64_t)(local_startnonce + n + 10));
161 |             nonce12 = bswap_64((uint64_t)(local_startnonce + n + 11));
162 |             nonce13 = bswap_64((uint64_t)(local_startnonce + n + 12));
163 |             nonce14 = bswap_64((uint64_t)(local_startnonce + n + 13));
164 |             nonce15 = bswap_64((uint64_t)(local_startnonce + n + 14));
165 |             nonce16 = bswap_64((uint64_t)(local_startnonce + n + 15));
166 | 
167 |             // store nonce numbers in relevant termination strings
168 |             for (int j = 32; j < 16 * MSHABAL512_VECTOR_SIZE / 4; j += MSHABAL512_VECTOR_SIZE) {
169 |                 size_t o = j / 4 - 8;
170 |                 // t1
171 |                 t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o);
172 |                 t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o);
173 |                 t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o);
174 |                 t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o);
175 |                 t1.words[j + 4] = *(mshabal_u32 *)((char *)&nonce5 + o);
176 |                 t1.words[j + 5] = *(mshabal_u32 *)((char *)&nonce6 + o);
177 |                 t1.words[j + 6] = *(mshabal_u32 *)((char *)&nonce7 + o);
178 |                 t1.words[j + 7] = *(mshabal_u32 *)((char *)&nonce8 + o);
179 |                 t1.words[j + 8] = *(mshabal_u32 *)((char *)&nonce9 + o);
180 |                 t1.words[j + 9] = *(mshabal_u32 *)((char *)&nonce10 + o);
181 |                 t1.words[j + 10] = *(mshabal_u32 *)((char *)&nonce11 + o);
182 |                 t1.words[j + 11] = *(mshabal_u32 *)((char *)&nonce12 + o);
183 |                 t1.words[j + 12] = *(mshabal_u32 *)((char *)&nonce13 + o);
184 |                 t1.words[j + 13] = *(mshabal_u32 *)((char *)&nonce14 + o);
185 |                 t1.words[j + 14] = *(mshabal_u32 *)((char *)&nonce15 + o);
186 |                 t1.words[j + 15] = *(mshabal_u32 *)((char *)&nonce16 + o);
187 | 
188 |                 t2.words[j + 0 + 128] = *(mshabal_u32 *)((char *)&nonce1 + o);
189 |                 t2.words[j + 1 + 128] = *(mshabal_u32 *)((char *)&nonce2 + o);
190 |                 t2.words[j + 2 + 128] = *(mshabal_u32 *)((char *)&nonce3 + o);
191 |                 t2.words[j + 3 + 128] = *(mshabal_u32 *)((char *)&nonce4 + o);
192 |                 t2.words[j + 4 + 128] = *(mshabal_u32 *)((char *)&nonce5 + o);
193 |                 t2.words[j + 5 + 128] = *(mshabal_u32 *)((char *)&nonce6 + o);
194 |                 t2.words[j + 6 + 128] = *(mshabal_u32 *)((char *)&nonce7 + o);
195 |                 t2.words[j + 7 + 128] = *(mshabal_u32 *)((char *)&nonce8 + o); 
196 |                 t2.words[j + 8 + 128] = *(mshabal_u32 *)((char *)&nonce9 + o);
197 |                 t2.words[j + 9 + 128] = *(mshabal_u32 *)((char *)&nonce10 + o);
198 |                 t2.words[j + 10 + 128] = *(mshabal_u32 *)((char *)&nonce11 + o);
199 |                 t2.words[j + 11 + 128] = *(mshabal_u32 *)((char *)&nonce12 + o);
200 |                 t2.words[j + 12 + 128] = *(mshabal_u32 *)((char *)&nonce13 + o);
201 |                 t2.words[j + 13 + 128] = *(mshabal_u32 *)((char *)&nonce14 + o);
202 |                 t2.words[j + 14 + 128] = *(mshabal_u32 *)((char *)&nonce15 + o);
203 |                 t2.words[j + 15 + 128] = *(mshabal_u32 *)((char *)&nonce16 + o);
204 |             }
205 |     
206 |             // start shabal round    
207 | 
208 |             // 3 cases: first 128 rounds uses case 1 or 2, after that case 3
209 |             // case 1: first 128 rounds, hashes are even: use termination string 1
210 |             // case 2: first 128 rounds, hashes are odd: use termination string 2
211 |             // case 3: round > 128: use termination string 3
212 |             
213 |             // round 1
214 |             memcpy(&local_512_fast, &global_512_fast,
215 |                    sizeof(global_512_fast));  // fast initialize shabal                 
216 |             
217 |              mshabal_hash_fast_avx512f(
218 |                 &local_512_fast, NULL, &t1,
219 |                 &buffer[MSHABAL512_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6);
220 | 
221 |             // store first hash into smart termination string 2 (data is vectored and SIMD aligned)
222 |             memcpy(&t2, &buffer[MSHABAL512_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)],
223 |                    MSHABAL512_VECTOR_SIZE * (HASH_SIZE));
224 | 
225 |             // round 2 - 128
226 |             for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) {
227 |                 // check if msg can be divided into 512bit packages without a
228 |                 // remainder
229 |                 if (i % 64 == 0) {
230 |                     // last msg = seed + termination
231 |                      mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE],
232 |                                               &t1,
233 |                                               &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE],
234 |                                               (NONCE_SIZE + 16 - i) >> 6);
235 |                 } else {
236 |                     // last msg = 256 bit data + seed + termination
237 |                      mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE],
238 |                                               &t2,
239 |                                               &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE],
240 |                                               (NONCE_SIZE + 16 - i) >> 6);
241 |                 }
242 |             }
243 | 
244 |             // round 128-8192
245 |             for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) {
246 |                  mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE], &t3,
247 |                                           &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE],
248 |                                           (HASH_CAP) >> 6);
249 |             }
250 |            
251 |             // generate final hash
252 |              mshabal_hash_fast_avx512f(&local_512_fast, &buffer[0], &t1, &final[0],
253 |                                       (NONCE_SIZE + 16) >> 6);
254 |             
255 |             // XOR using SIMD
256 |             // load final hash
257 |             __m512i F[8];
258 |             for (int j = 0; j < 8; j++) F[j] = _mm512_loadu_si512((__m512i *)final + j);
259 |             // xor all hashes with final hash
260 |             for (int j = 0; j < 8 * 2 * HASH_CAP; j++)
261 |                 _mm512_storeu_si512(
262 |                     (__m512i *)buffer + j,
263 |                     _mm512_xor_si512(_mm512_loadu_si512((__m512i *)buffer + j), F[j % 8]));
264 |              
265 |             // todo: fork SIMD aligned plot file here
266 |             // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache
267 |             for (int i = 0; i < NUM_SCOOPS * 2; i++) {
268 |                 for (int j = 0; j < 32; j += 4) {
269 |                     for (int k = 0; k < MSHABAL512_VECTOR_SIZE; k += 1) {
270 |                     memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) *
271 |                                       SCOOP_SIZE * cache_size +
272 |                                   (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j],
273 |                            &buffer[(i * 32 + j) * MSHABAL512_VECTOR_SIZE + k * 4], 4);
274 |                     }
275 |                 }
276 |             }
277 | 
278 |             n += 16;
279 |         } else {
280 |             // if less than 16 nonces left, use 1d-shabal
281 |             int8_t *xv = (int8_t *)&numeric_id;
282 |             
283 |             for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i];
284 | 
285 |             nonce = local_startnonce + n;
286 |             xv = (int8_t *)&nonce;
287 | 
288 |             for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i];
289 | 
290 |             for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) {
291 |                 memcpy(&local_32, &global_32, sizeof(global_32));
292 |                 ;
293 |                 if (i < NONCE_SIZE + 16 - HASH_CAP)
294 |                     len = HASH_CAP;
295 |                 else
296 |                     len = NONCE_SIZE + 16 - i;
297 | 
298 |                 sph_shabal256(&local_32, &buffer[i], len);
299 |                 sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]);
300 |             }
301 | 
302 |             memcpy(&local_32, &global_32, sizeof(global_32));
303 |             sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE);
304 |             sph_shabal256_close(&local_32, final);
305 | 
306 |             // XOR with final
307 |             for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]);
308 | 
309 |             // Sort them PoC2:
310 |             for (size_t i = 0; i < HASH_CAP; i++){
311 |                 memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE);
312 |                 memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE);
313 |             }
314 |             n++;
315 |         }
316 |     }
317 |     free(buffer);
318 |     free(final);
319 | }
320 | 


--------------------------------------------------------------------------------
/src/ocl/kernel.cl:
--------------------------------------------------------------------------------
  1 | #ifdef cl_clang_storage_class_specifiers
  2 | #pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
  3 | #endif
  4 | typedef unsigned int sph_u32; /* 32-bit word type used for every Shabal state and message word in this file */
  5 | 
  6 | #define SPH_C32(x)    ((sph_u32)(x ## U)) /* unsigned 32-bit constant: pastes a U suffix onto x and casts to sph_u32 */
  7 | #define SPH_T32(x) (as_uint(x)) /* reinterprets x as uint through as_uint(); no masking is applied */
  8 | #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) /* 32-bit left rotation of x by n bits (OpenCL rotate() shifts left) */
  9 | #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n))) /* right rotation expressed as a left rotation by 32 - n */
 10 | 
 11 | #define SPH_C64(x)    ((sph_u64)(x ## UL)) /* NOTE(review): sph_u64 is not typedef'd anywhere in this file; harmless only because none of the 64-bit macros is used here */
 12 | #define SPH_T64(x) (as_ulong(x)) /* 64-bit counterpart of SPH_T32 */
 13 | #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) /* 64-bit left rotation; the all-ones mask leaves n unchanged (presumably it only widens n to ulong) */
 14 | #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n))) /* right rotation expressed as a left rotation by 64 - n */
 15 | 
 16 | /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
 17 | /*
 18 |  * Shabal implementation.
 19 |  *
 20 |  * ==========================(LICENSE BEGIN)============================
 21 |  *
 22 |  * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 23 |  *
 24 |  * Permission is hereby granted, free of charge, to any person obtaining
 25 |  * a copy of this software and associated documentation files (the
 26 |  * "Software"), to deal in the Software without restriction, including
 27 |  * without limitation the rights to use, copy, modify, merge, publish,
 28 |  * distribute, sublicense, and/or sell copies of the Software, and to
 29 |  * permit persons to whom the Software is furnished to do so, subject to
 30 |  * the following conditions:
 31 |  *
 32 |  * The above copyright notice and this permission notice shall be
 33 |  * included in all copies or substantial portions of the Software.
 34 |  *
 35 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 36 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 37 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 38 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 39 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 40 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 41 |  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 42 |  *
 43 |  * ===========================(LICENSE END)=============================
 44 |  *
 45 |  * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 46 |  */
 47 | 
 48 | /*
 49 |  * Part of this code was automatically generated (the part between
 50 |  * the "BEGIN" and "END" markers).
 51 |  */
 52 | 
 53 | #define sM    16 /* not referenced elsewhere in this file; presumably the number of 32-bit words per message block (M0..MF) */
 54 | 
 55 | #define C32   SPH_C32 /* short alias used by the initial-state tables */
 56 | #define T32   SPH_T32 /* short alias used by the generated macros */
 57 | 
 58 | #define O1   13 /* O1..O3 are not referenced elsewhere in this file; the values match the B-word offsets */
 59 | #define O2    9 /* hard-coded in the PERM_STEP_* invocations (e.g. B0 is combined with BD, B9 and B6, */
 60 | #define O3    6 /* i.e. indices +13, +9 and +6) */
 61 | 
 62 | /*
 63 |  * We copy the state into local variables, so that the compiler knows
 64 |  * that it can optimize them at will.
 65 |  */
 66 | 
 67 | /* BEGIN -- automatically generated code. */
 68 | 
 69 | #define INPUT_BLOCK_ADD   do { /* adds message word Mi to state word Bi for all 16 words; expects B0..BF and M0..MF as locals in the caller */ \
 70 | 		B0 = T32(B0 + M0); \
 71 | 		B1 = T32(B1 + M1); \
 72 | 		B2 = T32(B2 + M2); \
 73 | 		B3 = T32(B3 + M3); \
 74 | 		B4 = T32(B4 + M4); \
 75 | 		B5 = T32(B5 + M5); \
 76 | 		B6 = T32(B6 + M6); \
 77 | 		B7 = T32(B7 + M7); \
 78 | 		B8 = T32(B8 + M8); \
 79 | 		B9 = T32(B9 + M9); \
 80 | 		BA = T32(BA + MA); \
 81 | 		BB = T32(BB + MB); \
 82 | 		BC = T32(BC + MC); \
 83 | 		BD = T32(BD + MD); \
 84 | 		BE = T32(BE + ME); \
 85 | 		BF = T32(BF + MF); \
 86 | 	} while (0)
 87 | 
 88 | #define INPUT_BLOCK_SUB   do { /* subtracts message word Mi from state word Ci for all 16 words; expects C0..CF and M0..MF as locals in the caller */ \
 89 | 		C0 = T32(C0 - M0); \
 90 | 		C1 = T32(C1 - M1); \
 91 | 		C2 = T32(C2 - M2); \
 92 | 		C3 = T32(C3 - M3); \
 93 | 		C4 = T32(C4 - M4); \
 94 | 		C5 = T32(C5 - M5); \
 95 | 		C6 = T32(C6 - M6); \
 96 | 		C7 = T32(C7 - M7); \
 97 | 		C8 = T32(C8 - M8); \
 98 | 		C9 = T32(C9 - M9); \
 99 | 		CA = T32(CA - MA); \
100 | 		CB = T32(CB - MB); \
101 | 		CC = T32(CC - MC); \
102 | 		CD = T32(CD - MD); \
103 | 		CE = T32(CE - ME); \
104 | 		CF = T32(CF - MF); \
105 | 	} while (0)
106 | 
107 | #define XOR_W   do { /* mixes the two-word block counter (Wlow, Whigh) into the first two A words */ \
108 | 		A00 ^= Wlow; \
109 | 		A01 ^= Whigh; \
110 | 	} while (0)
111 | 
112 | #define SWAP(v1, v2)   do { /* exchanges two sph_u32 lvalues through a block-scoped temporary named tmp */ \
113 | 		sph_u32 tmp = (v1); \
114 | 		(v1) = (v2); \
115 | 		(v2) = tmp; \
116 | 	} while (0)
117 | 
118 | #define SWAP_BC   do { /* exchanges the whole B state with the whole C state, word by word */ \
119 | 		SWAP(B0, C0); \
120 | 		SWAP(B1, C1); \
121 | 		SWAP(B2, C2); \
122 | 		SWAP(B3, C3); \
123 | 		SWAP(B4, C4); \
124 | 		SWAP(B5, C5); \
125 | 		SWAP(B6, C6); \
126 | 		SWAP(B7, C7); \
127 | 		SWAP(B8, C8); \
128 | 		SWAP(B9, C9); \
129 | 		SWAP(BA, CA); \
130 | 		SWAP(BB, CB); \
131 | 		SWAP(BC, CC); \
132 | 		SWAP(BD, CD); \
133 | 		SWAP(BE, CE); \
134 | 		SWAP(BF, CF); \
135 | 	} while (0)
136 | 
137 | #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm)   do { /* one permutation element: xa0 = ((xa0 ^ rotl(xa1,15)*5 ^ xc) * 3) ^ xb1 ^ (xb2 & ~xb3) ^ xm, then xb0 = ~(rotl(xb0,1) ^ xa0); arguments are evaluated several times, so pass plain variables */ \
138 | 		xa0 = T32((xa0 \
139 | 			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
140 | 			^ xc) * 3U) \
141 | 			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
142 | 		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
143 | 	} while (0)
144 | 
145 | #define PERM_STEP_0   do { /* first of three passes over B0..BF: 16 PERM_ELT updates whose A target runs A00..A0B and wraps to A00..A03; C words are consumed from C8 downwards (wrapping C0 -> CF) */ \
146 | 		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
147 | 		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
148 | 		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
149 | 		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
150 | 		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
151 | 		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
152 | 		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
153 | 		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
154 | 		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
155 | 		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
156 | 		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
157 | 		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
158 | 		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
159 | 		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
160 | 		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
161 | 		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
162 | 	} while (0)
163 | 
164 | #define PERM_STEP_1   do { /* second pass over B0..BF: same B/C/M pattern as PERM_STEP_0, but the A target continues at A04 and ends at A07 */ \
165 | 		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
166 | 		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
167 | 		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
168 | 		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
169 | 		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
170 | 		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
171 | 		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
172 | 		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
173 | 		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
174 | 		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
175 | 		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
176 | 		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
177 | 		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
178 | 		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
179 | 		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
180 | 		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
181 | 	} while (0)
182 | 
183 | #define PERM_STEP_2   do { /* third pass over B0..BF: same B/C/M pattern again, with the A target continuing at A08 and ending at A0B */ \
184 | 		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
185 | 		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
186 | 		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
187 | 		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
188 | 		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
189 | 		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
190 | 		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
191 | 		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
192 | 		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
193 | 		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
194 | 		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
195 | 		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
196 | 		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
197 | 		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
198 | 		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
199 | 		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
200 | 	} while (0)
201 | 
202 | #define APPLY_P   do { /* full permutation: rotate every B word left by 17, run PERM_STEP_0/1/2, then add C words into A (36 additions, three per A word) */ \
203 | 		B0 = T32(B0 << 17) | (B0 >> 15); \
204 | 		B1 = T32(B1 << 17) | (B1 >> 15); \
205 | 		B2 = T32(B2 << 17) | (B2 >> 15); \
206 | 		B3 = T32(B3 << 17) | (B3 >> 15); \
207 | 		B4 = T32(B4 << 17) | (B4 >> 15); \
208 | 		B5 = T32(B5 << 17) | (B5 >> 15); \
209 | 		B6 = T32(B6 << 17) | (B6 >> 15); \
210 | 		B7 = T32(B7 << 17) | (B7 >> 15); \
211 | 		B8 = T32(B8 << 17) | (B8 >> 15); \
212 | 		B9 = T32(B9 << 17) | (B9 >> 15); \
213 | 		BA = T32(BA << 17) | (BA >> 15); \
214 | 		BB = T32(BB << 17) | (BB >> 15); \
215 | 		BC = T32(BC << 17) | (BC >> 15); \
216 | 		BD = T32(BD << 17) | (BD >> 15); \
217 | 		BE = T32(BE << 17) | (BE >> 15); \
218 | 		BF = T32(BF << 17) | (BF >> 15); \
219 | 		PERM_STEP_0; \
220 | 		PERM_STEP_1; \
221 | 		PERM_STEP_2; \
222 | 		A0B = T32(A0B + C6); /* A targets descend A0B..A00 three times while the C index descends from C6, wrapping C0 -> CF */ \
223 | 		A0A = T32(A0A + C5); \
224 | 		A09 = T32(A09 + C4); \
225 | 		A08 = T32(A08 + C3); \
226 | 		A07 = T32(A07 + C2); \
227 | 		A06 = T32(A06 + C1); \
228 | 		A05 = T32(A05 + C0); \
229 | 		A04 = T32(A04 + CF); \
230 | 		A03 = T32(A03 + CE); \
231 | 		A02 = T32(A02 + CD); \
232 | 		A01 = T32(A01 + CC); \
233 | 		A00 = T32(A00 + CB); \
234 | 		A0B = T32(A0B + CA); \
235 | 		A0A = T32(A0A + C9); \
236 | 		A09 = T32(A09 + C8); \
237 | 		A08 = T32(A08 + C7); \
238 | 		A07 = T32(A07 + C6); \
239 | 		A06 = T32(A06 + C5); \
240 | 		A05 = T32(A05 + C4); \
241 | 		A04 = T32(A04 + C3); \
242 | 		A03 = T32(A03 + C2); \
243 | 		A02 = T32(A02 + C1); \
244 | 		A01 = T32(A01 + C0); \
245 | 		A00 = T32(A00 + CF); \
246 | 		A0B = T32(A0B + CE); \
247 | 		A0A = T32(A0A + CD); \
248 | 		A09 = T32(A09 + CC); \
249 | 		A08 = T32(A08 + CB); \
250 | 		A07 = T32(A07 + CA); \
251 | 		A06 = T32(A06 + C9); \
252 | 		A05 = T32(A05 + C8); \
253 | 		A04 = T32(A04 + C7); \
254 | 		A03 = T32(A03 + C6); \
255 | 		A02 = T32(A02 + C5); \
256 | 		A01 = T32(A01 + C4); \
257 | 		A00 = T32(A00 + C3); \
258 | 	} while (0)
259 | 
260 | #define INCR_W   do { /* increments the block counter: Wlow + 1, carrying into Whigh when Wlow wraps to 0 */ \
261 | 		if ((Wlow = T32(Wlow + 1)) == 0) \
262 | 			Whigh = T32(Whigh + 1); \
263 | 	} while (0)
264 | 
265 | __constant static const sph_u32 A_init_192[] = { /* 12 initial A words for the 192-bit variant (per the name); not referenced elsewhere in this file */
266 |     C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E),
267 |     C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465),
268 |     C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9)
269 | };
270 | 
271 | __constant static const sph_u32 B_init_192[] = { /* 16 initial B words for the 192-bit variant; not referenced elsewhere in this file */
272 |     C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824),
273 |     C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7),
274 |     C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319),
275 |     C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C)
276 | };
277 | 
278 | __constant static const sph_u32 C_init_192[] = { /* 16 initial C words for the 192-bit variant; not referenced elsewhere in this file */
279 |     C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B),
280 |     C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640),
281 |     C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3),
282 |     C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669)
283 | };
284 | 
285 | __constant static const sph_u32 A_init_224[] = { /* 12 initial A words for the 224-bit variant (per the name); not referenced elsewhere in this file */
286 |     C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B),
287 |     C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F),
288 |     C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061)
289 | };
290 | 
291 | __constant static const sph_u32 B_init_224[] = { /* 16 initial B words for the 224-bit variant; not referenced elsewhere in this file */
292 |     C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498),
293 |     C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5),
294 |     C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0),
295 |     C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C)
296 | };
297 | 
298 | __constant static const sph_u32 C_init_224[] = { /* 16 initial C words for the 224-bit variant; not referenced elsewhere in this file */
299 |     C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD),
300 |     C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18),
301 |     C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2),
302 |     C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83)
303 | };
304 | 
305 | __constant static const sph_u32 A_init_256[] = { /* 12 initial A words; calculate_nonces copies them into A00..A0B before every hash */
306 |     C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
307 |     C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
308 |     C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
309 | };
310 | 
311 | __constant static const sph_u32 B_init_256[] = { /* 16 initial B words; calculate_nonces copies them into B0..BF before every hash */
312 |     C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
313 |     C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
314 |     C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
315 |     C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
316 | };
317 | 
318 | __constant static const sph_u32 C_init_256[] = { /* 16 initial C words; calculate_nonces copies them into C0..CF before every hash */
319 |     C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
320 |     C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
321 |     C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
322 |     C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
323 | };
324 | 
325 | __constant static const sph_u32 A_init_384[] = { /* 12 initial A words for the 384-bit variant (per the name); not referenced elsewhere in this file */
326 |     C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83),
327 |     C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF),
328 |     C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D)
329 | };
330 | 
331 | __constant static const sph_u32 B_init_384[] = { /* 16 initial B words for the 384-bit variant; not referenced elsewhere in this file */
332 |     C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F),
333 |     C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641),
334 |     C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8),
335 |     C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36)
336 | };
337 | 
338 | __constant static const sph_u32 C_init_384[] = { /* 16 initial C words for the 384-bit variant; not referenced elsewhere in this file */
339 |     C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399),
340 |     C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261),
341 |     C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C),
342 |     C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70)
343 | };
344 | 
345 | __constant static const sph_u32 A_init_512[] = { /* 12 initial A words for the 512-bit variant (per the name); not referenced elsewhere in this file */
346 |     C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
347 |     C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
348 |     C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
349 | };
350 | 
351 | __constant static const sph_u32 B_init_512[] = { /* 16 initial B words for the 512-bit variant; not referenced elsewhere in this file */
352 |     C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
353 |     C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
354 |     C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
355 |     C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
356 | };
357 | 
358 | __constant static const sph_u32 C_init_512[] = { /* 16 initial C words for the 512-bit variant; not referenced elsewhere in this file */
359 |     C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
360 |     C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
361 |     C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
362 |     C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
363 | };
364 | 
365 | /* END -- automatically generated code. */
366 | 
367 | #define NONCES_VECTOR           16 /* number of nonces interleaved per group in the buffer layout (see Address) */
368 | #define NONCES_VECTOR_LOG2      4 /* log2(NONCES_VECTOR); used to derive the group index from a nonce index */
369 | #define MESSAGE_CAP             64 /* maximum number of 64-byte message blocks hashed per round, except for the final (hash == 0) round */
370 | #define NUM_HASHES   			8192 /* hash slots per nonce */
371 | #define HASH_SIZE_WORDS         8 /* 32-bit words per hash slot */
372 | #define NONCE_SIZE_WORDS        HASH_SIZE_WORDS * NUM_HASHES /* words per nonce; the expansion is unparenthesised, so use it only as a multiplication factor (as Address does) */
373 | 
374 | #define EndianSwap(n) (rotate(n & 0x00FF00FF, 24UL)|(rotate(n, 8UL) & 0x00FF00FF)) /* 32-bit byte-order reversal; not referenced elsewhere in this file; n is not parenthesised */
375 | 
376 | #define EndianSwap64(n)	bitselect( /* 64-bit byte-order reversal: four rotations of n merged byte-wise by the bitselect masks */ \
377 | 		bitselect(rotate(n, 24UL), \
378 | 		          rotate(n, 8UL), 0x000000FF000000FFUL), \
379 | 		bitselect(rotate(n, 56UL), \
380 | 		          rotate(n, 40UL), 0x00FF000000FF0000UL), \
381 | 		0xFFFF0000FFFF0000UL)
382 | 
383 | #define Address(nonce,hash,word) ((nonce >> NONCES_VECTOR_LOG2) * NONCES_VECTOR * NONCE_SIZE_WORDS + (hash) * NONCES_VECTOR * HASH_SIZE_WORDS + word * NONCES_VECTOR + (nonce & (NONCES_VECTOR-1))) /* 32-bit word index: groups of 16 nonces, then hash slot, then word, then lane within the group. NOTE(review): with int arguments (as passed by calculate_nonces) the first product is computed in int and would overflow for nonce >= 32768 - confirm callers stay below that */
384 | //#define Address(nonce,hash,word) (nonce * NONCE_SIZE_WORDS + (hash) * HASH_SIZE_WORDS + word)   (disabled alternative: plain per-nonce layout without interleaving)
385 | 
386 | /* Johnny's optimised nonce calculation kernel, based on the implementation found in BRS. Work-item gid handles nonce startnonce + gid.
387 |  * buffer: hash storage indexed in 32-bit words via Address(); numeric_id_be: copied verbatim into the seed words (the _be suffix suggests the caller already byte-swapped it - confirm); start/end: hash runs from NUM_HASHES - start down to NUM_HASHES - end; nonces: work-items with gid >= nonces return at once.
388 |  * Each pass hashes the slots behind `hash` plus seed/padding and stores the 8-word result in slot hash - 1; when end == 8192 all slots are finally XORed with the last result. */
389 | __kernel void calculate_nonces(__global unsigned char* buffer, unsigned long startnonce, unsigned long numeric_id_be, int start, int end, unsigned long nonces) {
390 | 	//if (gid==0) {printf("\n\nOCL 2 %lu\n\n",startnonce);} DEBUG
391 | 	int gid = get_global_id(0); // index of this work-item; also the nonce offset from startnonce
392 | 
393 | 	if (gid >= nonces) // work-items beyond the requested nonce count do nothing
394 | 		return;
395 | 	// number of full 64-byte message blocks (two stored hashes each) fed before the final padded block
396 | 	int num; 
397 | 	// upper half of the B state (the 8-word result); declared outside the loop because the final XOR below reads it
398 | 	sph_u32 B8,B9,BA,BB,BC,BD,BE,BF;
399 | 	// byte-swapped nonce of this work-item; it becomes part of the seed words in the final message block
400 | 	unsigned long nonce_be = EndianSwap64(startnonce + gid);
401 | 	// walk `hash` downwards from NUM_HASHES - start to NUM_HASHES - end (inclusive); each pass with hash > 0 fills slot hash - 1
402 | 	for (int hash = NUM_HASHES - start; hash > -1 + NUM_HASHES - end; hash -= 1) {
403 | 		// full blocks available behind `hash`: (NUM_HASHES - hash) stored hashes, two per block, rounded down
404 | 		num = (NUM_HASHES - hash) >> 1; 
405 | 		if (hash != 0) { // the pass with hash == 0 is never capped: it digests every stored hash
406 | 			num = (num > MESSAGE_CAP) ? MESSAGE_CAP : num;
407 | 		} 
408 | 
409 | 		// reset the A/B/C state from the *_init_256 tables for every hash
410 |         sph_u32
411 |             A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3],
412 |             A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7],
413 |             A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11];
414 |         sph_u32
415 |             B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3],
416 |             B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7];
417 |             B8 = B_init_256[8]; B9 = B_init_256[9]; BA = B_init_256[10]; BB = B_init_256[11]; // plain assignments: B8..BF are the outer variables, not new declarations
418 |             BC = B_init_256[12]; BD = B_init_256[13]; BE = B_init_256[14]; BF = B_init_256[15];
419 |         sph_u32
420 |             C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3],
421 |             C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7],
422 |             C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11],
423 |             CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15];
424 |         sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; // current 16-word message block
425 |         sph_u32 Wlow = 1, Whigh = 0; // two-word block counter, starting at 1
426 | 	
427 | 		for (int i = 0; i < 2 * num; i+=2){ // each block consumes stored hashes hash + i (M0..M7) and hash + i + 1 (M8..MF)
428 | 			M0 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 0)];
429 | 			M1 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 1)];
430 | 			M2 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 2)];
431 | 			M3 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 3)];
432 | 			M4 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 4)];
433 | 			M5 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 5)];
434 | 			M6 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 6)];
435 | 			M7 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 7)];
436 | 			M8 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 0)];
437 | 			M9 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 1)];
438 | 			MA = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 2)];
439 | 			MB = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 3)];
440 | 			MC = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 4)];
441 | 			MD = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 5)];
442 | 			ME = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 6)];
443 | 			MF = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 7)];
444 | 
445 |     		INPUT_BLOCK_ADD; // absorb one full block: B += M, mix counter, permute, C -= M, swap B/C, bump counter
446 |     		XOR_W;
447 |     		APPLY_P;
448 |     		INPUT_BLOCK_SUB;
449 |     		SWAP_BC;
450 |     		INCR_W;
451 |     	}
452 | 
453 | 		// build the final message block; the word 0x80 follows the last data word and the remaining words are zero
454 | 		if (num == MESSAGE_CAP) { // exactly MESSAGE_CAP full blocks were absorbed: the final block carries no data and no seed
455 |             M0 = 0x80;
456 |             M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
457 |         }
458 |         else if((hash & 1) == 0) { // even hash: all stored hashes were consumed in pairs, so only the seed (numeric id + nonce) remains
459 |             M0 = ((unsigned int*)&numeric_id_be)[0];
460 |             M1 = ((unsigned int*)&numeric_id_be)[1];
461 |             M2 = ((unsigned int*)&nonce_be)[0];
462 |             M3 = ((unsigned int*)&nonce_be)[1];
463 |             M4 = 0x80;
464 |             M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
465 |         }
466 |         else if((hash & 1) == 1) { // odd hash: one stored hash (slot NUM_HASHES - 1) is left over and precedes the seed
467 |             M0 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 0)];
468 |             M1 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 1)];
469 |             M2 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 2)];
470 |             M3 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 3)];
471 |             M4 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 4)];
472 |             M5 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 5)];
473 |             M6 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 6)];
474 |             M7 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 7)];
475 |             M8 = ((unsigned int*)&numeric_id_be)[0];
476 |             M9 = ((unsigned int*)&numeric_id_be)[1];
477 |             MA = ((unsigned int*)&nonce_be)[0];
478 |             MB = ((unsigned int*)&nonce_be)[1];
479 |             MC = 0x80;
480 |             MD = ME = MF = 0;
481 | 		}
482 | 
483 |     	INPUT_BLOCK_ADD; // absorb the final block, then run three more permutations with B/C swapped; the counter is not incremented here
484 |     	XOR_W;
485 |     	APPLY_P;
486 |     	for (int i = 0; i < 3; i ++) {
487 | 	        SWAP_BC;
488 |         	XOR_W;
489 |         	APPLY_P;
490 |     	}
491 | 
492 | 		if (hash > 0){ // store the result (B8..BF) one slot below; the hash == 0 pass only leaves it in B8..BF for the XOR below
493 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 0)] = B8;		
494 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 1)] = B9;
495 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 2)] = BA;
496 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 3)] = BB;
497 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 4)] = BC;
498 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 5)] = BD;
499 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 6)] = BE;
500 | 			((__global unsigned int*)buffer)[Address(gid, hash-1, 7)] = BF;	
501 | 		}
502 | 	}
503 | 
504 | 	// final xor: every stored hash of this nonce is XORed word-wise with the last result left in B8..BF
505 | 	if(end==8192){ // NOTE(review): literal 8192 rather than NUM_HASHES; B8..BF are only assigned inside the loop above, so this relies on at least one pass having run - confirm callers
506 | 		for (size_t i = 0; i < NUM_HASHES; i++){ 
507 | 			((__global unsigned int*)buffer)[Address(gid, i, 0)] ^= B8;
508 | 			((__global unsigned int*)buffer)[Address(gid, i, 1)] ^= B9;
509 | 			((__global unsigned int*)buffer)[Address(gid, i, 2)] ^= BA;
510 | 			((__global unsigned int*)buffer)[Address(gid, i, 3)] ^= BB;
511 | 			((__global unsigned int*)buffer)[Address(gid, i, 4)] ^= BC;
512 | 			((__global unsigned int*)buffer)[Address(gid, i, 5)] ^= BD;
513 | 			((__global unsigned int*)buffer)[Address(gid, i, 6)] ^= BE;
514 | 			((__global unsigned int*)buffer)[Address(gid, i, 7)] ^= BF;
515 | 		}
516 | 	}
517 | }


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | [[package]]
  4 | name = "addr2line"
  5 | version = "0.15.2"
  6 | source = "registry+https://github.com/rust-lang/crates.io-index"
  7 | checksum = "e7a2e47a1fbe209ee101dd6d61285226744c6c8d3c21c8dc878ba6cb9f467f3a"
  8 | dependencies = [
  9 |  "gimli",
 10 | ]
 11 | 
 12 | [[package]]
 13 | name = "adler"
 14 | version = "1.0.2"
 15 | source = "registry+https://github.com/rust-lang/crates.io-index"
 16 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 17 | 
 18 | [[package]]
 19 | name = "aligned_alloc"
 20 | version = "0.1.3"
 21 | source = "registry+https://github.com/rust-lang/crates.io-index"
 22 | checksum = "9dcebfb002ccde769c15bc841d0d5548a90e80fcd2ffed5131339e8074746f0a"
 23 | dependencies = [
 24 |  "kernel32-sys",
 25 |  "libc",
 26 |  "winapi 0.2.8",
 27 | ]
 28 | 
 29 | [[package]]
 30 | name = "ansi_term"
 31 | version = "0.11.0"
 32 | source = "registry+https://github.com/rust-lang/crates.io-index"
 33 | checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
 34 | dependencies = [
 35 |  "winapi 0.3.9",
 36 | ]
 37 | 
 38 | [[package]]
 39 | name = "atty"
 40 | version = "0.2.14"
 41 | source = "registry+https://github.com/rust-lang/crates.io-index"
 42 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
 43 | dependencies = [
 44 |  "hermit-abi",
 45 |  "libc",
 46 |  "winapi 0.3.9",
 47 | ]
 48 | 
 49 | [[package]]
 50 | name = "autocfg"
 51 | version = "1.0.1"
 52 | source = "registry+https://github.com/rust-lang/crates.io-index"
 53 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
 54 | 
 55 | [[package]]
 56 | name = "backtrace"
 57 | version = "0.3.60"
 58 | source = "registry+https://github.com/rust-lang/crates.io-index"
 59 | checksum = "b7815ea54e4d821e791162e078acbebfd6d8c8939cd559c9335dceb1c8ca7282"
 60 | dependencies = [
 61 |  "addr2line",
 62 |  "cc",
 63 |  "cfg-if 1.0.0",
 64 |  "libc",
 65 |  "miniz_oxide",
 66 |  "object",
 67 |  "rustc-demangle",
 68 | ]
 69 | 
 70 | [[package]]
 71 | name = "bitflags"
 72 | version = "1.2.1"
 73 | source = "registry+https://github.com/rust-lang/crates.io-index"
 74 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
 75 | 
 76 | [[package]]
 77 | name = "cc"
 78 | version = "1.0.68"
 79 | source = "registry+https://github.com/rust-lang/crates.io-index"
 80 | checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787"
 81 | 
 82 | [[package]]
 83 | name = "cfg-if"
 84 | version = "0.1.10"
 85 | source = "registry+https://github.com/rust-lang/crates.io-index"
 86 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 87 | 
 88 | [[package]]
 89 | name = "cfg-if"
 90 | version = "1.0.0"
 91 | source = "registry+https://github.com/rust-lang/crates.io-index"
 92 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 93 | 
 94 | [[package]]
 95 | name = "cl-sys"
 96 | version = "0.4.2"
 97 | source = "registry+https://github.com/rust-lang/crates.io-index"
 98 | checksum = "e8573fa3ff8acd6c49e8e113296c54277e82376b96c6ca6307848632cce38e44"
 99 | dependencies = [
100 |  "libc",
101 | ]
102 | 
103 | [[package]]
104 | name = "clap"
105 | version = "2.33.3"
106 | source = "registry+https://github.com/rust-lang/crates.io-index"
107 | checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
108 | dependencies = [
109 |  "ansi_term",
110 |  "atty",
111 |  "bitflags",
112 |  "strsim",
113 |  "textwrap",
114 |  "unicode-width",
115 |  "vec_map",
116 | ]
117 | 
118 | [[package]]
119 | name = "core_affinity"
120 | version = "0.5.10"
121 | source = "registry+https://github.com/rust-lang/crates.io-index"
122 | checksum = "7f8a03115cc34fb0d7c321dd154a3914b3ca082ccc5c11d91bf7117dbbe7171f"
123 | dependencies = [
124 |  "kernel32-sys",
125 |  "libc",
126 |  "num_cpus",
127 |  "winapi 0.2.8",
128 | ]
129 | 
130 | [[package]]
131 | name = "crossbeam-channel"
132 | version = "0.3.9"
133 | source = "registry+https://github.com/rust-lang/crates.io-index"
134 | checksum = "c8ec7fcd21571dc78f96cc96243cab8d8f035247c3efd16c687be154c3fa9efa"
135 | dependencies = [
136 |  "crossbeam-utils 0.6.6",
137 | ]
138 | 
139 | [[package]]
140 | name = "crossbeam-channel"
141 | version = "0.5.1"
142 | source = "registry+https://github.com/rust-lang/crates.io-index"
143 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
144 | dependencies = [
145 |  "cfg-if 1.0.0",
146 |  "crossbeam-utils 0.8.5",
147 | ]
148 | 
149 | [[package]]
150 | name = "crossbeam-deque"
151 | version = "0.8.0"
152 | source = "registry+https://github.com/rust-lang/crates.io-index"
153 | checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9"
154 | dependencies = [
155 |  "cfg-if 1.0.0",
156 |  "crossbeam-epoch",
157 |  "crossbeam-utils 0.8.5",
158 | ]
159 | 
160 | [[package]]
161 | name = "crossbeam-epoch"
162 | version = "0.9.5"
163 | source = "registry+https://github.com/rust-lang/crates.io-index"
164 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd"
165 | dependencies = [
166 |  "cfg-if 1.0.0",
167 |  "crossbeam-utils 0.8.5",
168 |  "lazy_static",
169 |  "memoffset",
170 |  "scopeguard",
171 | ]
172 | 
173 | [[package]]
174 | name = "crossbeam-utils"
175 | version = "0.6.6"
176 | source = "registry+https://github.com/rust-lang/crates.io-index"
177 | checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6"
178 | dependencies = [
179 |  "cfg-if 0.1.10",
180 |  "lazy_static",
181 | ]
182 | 
183 | [[package]]
184 | name = "crossbeam-utils"
185 | version = "0.8.5"
186 | source = "registry+https://github.com/rust-lang/crates.io-index"
187 | checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
188 | dependencies = [
189 |  "cfg-if 1.0.0",
190 |  "lazy_static",
191 | ]
192 | 
193 | [[package]]
194 | name = "either"
195 | version = "1.6.1"
196 | source = "registry+https://github.com/rust-lang/crates.io-index"
197 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
198 | 
199 | [[package]]
200 | name = "engraver"
201 | version = "2.5.0"
202 | dependencies = [
203 |  "aligned_alloc",
204 |  "cc",
205 |  "cfg-if 0.1.10",
206 |  "clap",
207 |  "core_affinity",
208 |  "crossbeam-channel 0.3.9",
209 |  "fs2",
210 |  "humanize-rs",
211 |  "libc",
212 |  "ocl-core",
213 |  "page_size",
214 |  "pbr",
215 |  "raw-cpuid",
216 |  "rayon",
217 |  "rust-crypto",
218 |  "stopwatch",
219 |  "sys-info",
220 |  "thread-priority",
221 |  "winapi 0.3.9",
222 | ]
223 | 
224 | [[package]]
225 | name = "enum_primitive"
226 | version = "0.1.1"
227 | source = "registry+https://github.com/rust-lang/crates.io-index"
228 | checksum = "be4551092f4d519593039259a9ed8daedf0da12e5109c5280338073eaeb81180"
229 | dependencies = [
230 |  "num-traits 0.1.43",
231 | ]
232 | 
233 | [[package]]
234 | name = "failure"
235 | version = "0.1.8"
236 | source = "registry+https://github.com/rust-lang/crates.io-index"
237 | checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86"
238 | dependencies = [
239 |  "backtrace",
240 |  "failure_derive",
241 | ]
242 | 
243 | [[package]]
244 | name = "failure_derive"
245 | version = "0.1.8"
246 | source = "registry+https://github.com/rust-lang/crates.io-index"
247 | checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4"
248 | dependencies = [
249 |  "proc-macro2",
250 |  "quote",
251 |  "syn",
252 |  "synstructure",
253 | ]
254 | 
255 | [[package]]
256 | name = "fs2"
257 | version = "0.4.3"
258 | source = "registry+https://github.com/rust-lang/crates.io-index"
259 | checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
260 | dependencies = [
261 |  "libc",
262 |  "winapi 0.3.9",
263 | ]
264 | 
265 | [[package]]
266 | name = "fuchsia-cprng"
267 | version = "0.1.1"
268 | source = "registry+https://github.com/rust-lang/crates.io-index"
269 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
270 | 
271 | [[package]]
272 | name = "gcc"
273 | version = "0.3.55"
274 | source = "registry+https://github.com/rust-lang/crates.io-index"
275 | checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2"
276 | 
277 | [[package]]
278 | name = "gimli"
279 | version = "0.24.0"
280 | source = "registry+https://github.com/rust-lang/crates.io-index"
281 | checksum = "0e4075386626662786ddb0ec9081e7c7eeb1ba31951f447ca780ef9f5d568189"
282 | 
283 | [[package]]
284 | name = "hermit-abi"
285 | version = "0.1.18"
286 | source = "registry+https://github.com/rust-lang/crates.io-index"
287 | checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
288 | dependencies = [
289 |  "libc",
290 | ]
291 | 
292 | [[package]]
293 | name = "humanize-rs"
294 | version = "0.1.5"
295 | source = "registry+https://github.com/rust-lang/crates.io-index"
296 | checksum = "016b02deb8b0c415d8d56a6f0ab265e50c22df61194e37f9be75ed3a722de8a6"
297 | 
298 | [[package]]
299 | name = "kernel32-sys"
300 | version = "0.2.2"
301 | source = "registry+https://github.com/rust-lang/crates.io-index"
302 | checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
303 | dependencies = [
304 |  "winapi 0.2.8",
305 |  "winapi-build",
306 | ]
307 | 
308 | [[package]]
309 | name = "lazy_static"
310 | version = "1.4.0"
311 | source = "registry+https://github.com/rust-lang/crates.io-index"
312 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
313 | 
314 | [[package]]
315 | name = "libc"
316 | version = "0.2.96"
317 | source = "registry+https://github.com/rust-lang/crates.io-index"
318 | checksum = "5600b4e6efc5421841a2138a6b082e07fe12f9aaa12783d50e5d13325b26b4fc"
319 | 
320 | [[package]]
321 | name = "memchr"
322 | version = "2.4.0"
323 | source = "registry+https://github.com/rust-lang/crates.io-index"
324 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
325 | 
326 | [[package]]
327 | name = "memoffset"
328 | version = "0.6.4"
329 | source = "registry+https://github.com/rust-lang/crates.io-index"
330 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9"
331 | dependencies = [
332 |  "autocfg",
333 | ]
334 | 
335 | [[package]]
336 | name = "miniz_oxide"
337 | version = "0.4.4"
338 | source = "registry+https://github.com/rust-lang/crates.io-index"
339 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b"
340 | dependencies = [
341 |  "adler",
342 |  "autocfg",
343 | ]
344 | 
345 | [[package]]
346 | name = "num"
347 | version = "0.1.42"
348 | source = "registry+https://github.com/rust-lang/crates.io-index"
349 | checksum = "4703ad64153382334aa8db57c637364c322d3372e097840c72000dabdcf6156e"
350 | dependencies = [
351 |  "num-bigint",
352 |  "num-complex",
353 |  "num-integer",
354 |  "num-iter",
355 |  "num-rational",
356 |  "num-traits 0.2.14",
357 | ]
358 | 
359 | [[package]]
360 | name = "num-bigint"
361 | version = "0.1.44"
362 | source = "registry+https://github.com/rust-lang/crates.io-index"
363 | checksum = "e63899ad0da84ce718c14936262a41cee2c79c981fc0a0e7c7beb47d5a07e8c1"
364 | dependencies = [
365 |  "num-integer",
366 |  "num-traits 0.2.14",
367 |  "rand 0.4.6",
368 |  "rustc-serialize",
369 | ]
370 | 
371 | [[package]]
372 | name = "num-complex"
373 | version = "0.1.43"
374 | source = "registry+https://github.com/rust-lang/crates.io-index"
375 | checksum = "b288631d7878aaf59442cffd36910ea604ecd7745c36054328595114001c9656"
376 | dependencies = [
377 |  "num-traits 0.2.14",
378 |  "rustc-serialize",
379 | ]
380 | 
381 | [[package]]
382 | name = "num-integer"
383 | version = "0.1.44"
384 | source = "registry+https://github.com/rust-lang/crates.io-index"
385 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
386 | dependencies = [
387 |  "autocfg",
388 |  "num-traits 0.2.14",
389 | ]
390 | 
391 | [[package]]
392 | name = "num-iter"
393 | version = "0.1.42"
394 | source = "registry+https://github.com/rust-lang/crates.io-index"
395 | checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59"
396 | dependencies = [
397 |  "autocfg",
398 |  "num-integer",
399 |  "num-traits 0.2.14",
400 | ]
401 | 
402 | [[package]]
403 | name = "num-rational"
404 | version = "0.1.42"
405 | source = "registry+https://github.com/rust-lang/crates.io-index"
406 | checksum = "ee314c74bd753fc86b4780aa9475da469155f3848473a261d2d18e35245a784e"
407 | dependencies = [
408 |  "num-bigint",
409 |  "num-integer",
410 |  "num-traits 0.2.14",
411 |  "rustc-serialize",
412 | ]
413 | 
414 | [[package]]
415 | name = "num-traits"
416 | version = "0.1.43"
417 | source = "registry+https://github.com/rust-lang/crates.io-index"
418 | checksum = "92e5113e9fd4cc14ded8e499429f396a20f98c772a47cc8622a736e1ec843c31"
419 | dependencies = [
420 |  "num-traits 0.2.14",
421 | ]
422 | 
423 | [[package]]
424 | name = "num-traits"
425 | version = "0.2.14"
426 | source = "registry+https://github.com/rust-lang/crates.io-index"
427 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
428 | dependencies = [
429 |  "autocfg",
430 | ]
431 | 
432 | [[package]]
433 | name = "num_cpus"
434 | version = "1.13.0"
435 | source = "registry+https://github.com/rust-lang/crates.io-index"
436 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
437 | dependencies = [
438 |  "hermit-abi",
439 |  "libc",
440 | ]
441 | 
442 | [[package]]
443 | name = "object"
444 | version = "0.25.2"
445 | source = "registry+https://github.com/rust-lang/crates.io-index"
446 | checksum = "f8bc1d42047cf336f0f939c99e97183cf31551bf0f2865a2ec9c8d91fd4ffb5e"
447 | dependencies = [
448 |  "memchr",
449 | ]
450 | 
451 | [[package]]
452 | name = "ocl-core"
453 | version = "0.11.2"
454 | source = "registry+https://github.com/rust-lang/crates.io-index"
455 | checksum = "81bc628faf959b5e07b1251252926dfe0dd1b3f2709cef8998c97936ddbdaa74"
456 | dependencies = [
457 |  "bitflags",
458 |  "cl-sys",
459 |  "enum_primitive",
460 |  "failure",
461 |  "num-complex",
462 |  "num-traits 0.2.14",
463 |  "ocl-core-vector",
464 |  "rustc_version 0.1.7",
465 | ]
466 | 
467 | [[package]]
468 | name = "ocl-core-vector"
469 | version = "0.1.0"
470 | source = "registry+https://github.com/rust-lang/crates.io-index"
471 | checksum = "b4072920739958adeec5abedec51af70febc58f7fff0601aaa0827c1f3c8fefd"
472 | dependencies = [
473 |  "num",
474 | ]
475 | 
476 | [[package]]
477 | name = "page_size"
478 | version = "0.4.2"
479 | source = "registry+https://github.com/rust-lang/crates.io-index"
480 | checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd"
481 | dependencies = [
482 |  "libc",
483 |  "winapi 0.3.9",
484 | ]
485 | 
486 | [[package]]
487 | name = "pbr"
488 | version = "1.0.4"
489 | source = "registry+https://github.com/rust-lang/crates.io-index"
490 | checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2"
491 | dependencies = [
492 |  "crossbeam-channel 0.5.1",
493 |  "libc",
494 |  "time",
495 |  "winapi 0.3.9",
496 | ]
497 | 
498 | [[package]]
499 | name = "proc-macro2"
500 | version = "1.0.27"
501 | source = "registry+https://github.com/rust-lang/crates.io-index"
502 | checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038"
503 | dependencies = [
504 |  "unicode-xid",
505 | ]
506 | 
507 | [[package]]
508 | name = "quote"
509 | version = "1.0.9"
510 | source = "registry+https://github.com/rust-lang/crates.io-index"
511 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
512 | dependencies = [
513 |  "proc-macro2",
514 | ]
515 | 
516 | [[package]]
517 | name = "rand"
518 | version = "0.3.23"
519 | source = "registry+https://github.com/rust-lang/crates.io-index"
520 | checksum = "64ac302d8f83c0c1974bf758f6b041c6c8ada916fbb44a609158ca8b064cc76c"
521 | dependencies = [
522 |  "libc",
523 |  "rand 0.4.6",
524 | ]
525 | 
526 | [[package]]
527 | name = "rand"
528 | version = "0.4.6"
529 | source = "registry+https://github.com/rust-lang/crates.io-index"
530 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293"
531 | dependencies = [
532 |  "fuchsia-cprng",
533 |  "libc",
534 |  "rand_core 0.3.1",
535 |  "rdrand",
536 |  "winapi 0.3.9",
537 | ]
538 | 
539 | [[package]]
540 | name = "rand_core"
541 | version = "0.3.1"
542 | source = "registry+https://github.com/rust-lang/crates.io-index"
543 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
544 | dependencies = [
545 |  "rand_core 0.4.2",
546 | ]
547 | 
548 | [[package]]
549 | name = "rand_core"
550 | version = "0.4.2"
551 | source = "registry+https://github.com/rust-lang/crates.io-index"
552 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc"
553 | 
554 | [[package]]
555 | name = "raw-cpuid"
556 | version = "6.1.0"
557 | source = "registry+https://github.com/rust-lang/crates.io-index"
558 | checksum = "30a9d219c32c9132f7be513c18be77c9881c7107d2ab5569d205a6a0f0e6dc7d"
559 | dependencies = [
560 |  "bitflags",
561 |  "cc",
562 |  "rustc_version 0.2.3",
563 | ]
564 | 
565 | [[package]]
566 | name = "rayon"
567 | version = "1.5.1"
568 | source = "registry+https://github.com/rust-lang/crates.io-index"
569 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
570 | dependencies = [
571 |  "autocfg",
572 |  "crossbeam-deque",
573 |  "either",
574 |  "rayon-core",
575 | ]
576 | 
577 | [[package]]
578 | name = "rayon-core"
579 | version = "1.9.1"
580 | source = "registry+https://github.com/rust-lang/crates.io-index"
581 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
582 | dependencies = [
583 |  "crossbeam-channel 0.5.1",
584 |  "crossbeam-deque",
585 |  "crossbeam-utils 0.8.5",
586 |  "lazy_static",
587 |  "num_cpus",
588 | ]
589 | 
590 | [[package]]
591 | name = "rdrand"
592 | version = "0.4.0"
593 | source = "registry+https://github.com/rust-lang/crates.io-index"
594 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
595 | dependencies = [
596 |  "rand_core 0.3.1",
597 | ]
598 | 
599 | [[package]]
600 | name = "rust-crypto"
601 | version = "0.2.36"
602 | source = "registry+https://github.com/rust-lang/crates.io-index"
603 | checksum = "f76d05d3993fd5f4af9434e8e436db163a12a9d40e1a58a726f27a01dfd12a2a"
604 | dependencies = [
605 |  "gcc",
606 |  "libc",
607 |  "rand 0.3.23",
608 |  "rustc-serialize",
609 |  "time",
610 | ]
611 | 
612 | [[package]]
613 | name = "rustc-demangle"
614 | version = "0.1.19"
615 | source = "registry+https://github.com/rust-lang/crates.io-index"
616 | checksum = "410f7acf3cb3a44527c5d9546bad4bf4e6c460915d5f9f2fc524498bfe8f70ce"
617 | 
618 | [[package]]
619 | name = "rustc-serialize"
620 | version = "0.3.24"
621 | source = "registry+https://github.com/rust-lang/crates.io-index"
622 | checksum = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda"
623 | 
624 | [[package]]
625 | name = "rustc_version"
626 | version = "0.1.7"
627 | source = "registry+https://github.com/rust-lang/crates.io-index"
628 | checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084"
629 | dependencies = [
630 |  "semver 0.1.20",
631 | ]
632 | 
633 | [[package]]
634 | name = "rustc_version"
635 | version = "0.2.3"
636 | source = "registry+https://github.com/rust-lang/crates.io-index"
637 | checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
638 | dependencies = [
639 |  "semver 0.9.0",
640 | ]
641 | 
642 | [[package]]
643 | name = "scopeguard"
644 | version = "1.1.0"
645 | source = "registry+https://github.com/rust-lang/crates.io-index"
646 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
647 | 
648 | [[package]]
649 | name = "semver"
650 | version = "0.1.20"
651 | source = "registry+https://github.com/rust-lang/crates.io-index"
652 | checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
653 | 
654 | [[package]]
655 | name = "semver"
656 | version = "0.9.0"
657 | source = "registry+https://github.com/rust-lang/crates.io-index"
658 | checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
659 | dependencies = [
660 |  "semver-parser",
661 | ]
662 | 
663 | [[package]]
664 | name = "semver-parser"
665 | version = "0.7.0"
666 | source = "registry+https://github.com/rust-lang/crates.io-index"
667 | checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
668 | 
669 | [[package]]
670 | name = "stopwatch"
671 | version = "0.0.7"
672 | source = "registry+https://github.com/rust-lang/crates.io-index"
673 | checksum = "3d04b5ebc78da44d3a456319d8bc2783e7d8cc7ccbb5cb4dc3f54afbd93bf728"
674 | dependencies = [
675 |  "num",
676 | ]
677 | 
678 | [[package]]
679 | name = "strsim"
680 | version = "0.8.0"
681 | source = "registry+https://github.com/rust-lang/crates.io-index"
682 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
683 | 
684 | [[package]]
685 | name = "syn"
686 | version = "1.0.73"
687 | source = "registry+https://github.com/rust-lang/crates.io-index"
688 | checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7"
689 | dependencies = [
690 |  "proc-macro2",
691 |  "quote",
692 |  "unicode-xid",
693 | ]
694 | 
695 | [[package]]
696 | name = "synstructure"
697 | version = "0.12.4"
698 | source = "registry+https://github.com/rust-lang/crates.io-index"
699 | checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701"
700 | dependencies = [
701 |  "proc-macro2",
702 |  "quote",
703 |  "syn",
704 |  "unicode-xid",
705 | ]
706 | 
707 | [[package]]
708 | name = "sys-info"
709 | version = "0.5.6"
710 | source = "registry+https://github.com/rust-lang/crates.io-index"
711 | checksum = "617f594d3869801871433390254b4a79f2a18176d7f4ad5784fa990bc8c12986"
712 | dependencies = [
713 |  "cc",
714 |  "libc",
715 | ]
716 | 
717 | [[package]]
718 | name = "textwrap"
719 | version = "0.11.0"
720 | source = "registry+https://github.com/rust-lang/crates.io-index"
721 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
722 | dependencies = [
723 |  "unicode-width",
724 | ]
725 | 
726 | [[package]]
727 | name = "thread-priority"
728 | version = "0.1.1"
729 | source = "registry+https://github.com/rust-lang/crates.io-index"
730 | checksum = "52c084e908948709a7f7f6d44b5368e0134aa322e0e569431a92c989bf855188"
731 | dependencies = [
732 |  "libc",
733 | ]
734 | 
735 | [[package]]
736 | name = "time"
737 | version = "0.1.44"
738 | source = "registry+https://github.com/rust-lang/crates.io-index"
739 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
740 | dependencies = [
741 |  "libc",
742 |  "wasi",
743 |  "winapi 0.3.9",
744 | ]
745 | 
746 | [[package]]
747 | name = "unicode-width"
748 | version = "0.1.8"
749 | source = "registry+https://github.com/rust-lang/crates.io-index"
750 | checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
751 | 
752 | [[package]]
753 | name = "unicode-xid"
754 | version = "0.2.2"
755 | source = "registry+https://github.com/rust-lang/crates.io-index"
756 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
757 | 
758 | [[package]]
759 | name = "vec_map"
760 | version = "0.8.2"
761 | source = "registry+https://github.com/rust-lang/crates.io-index"
762 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
763 | 
764 | [[package]]
765 | name = "wasi"
766 | version = "0.10.0+wasi-snapshot-preview1"
767 | source = "registry+https://github.com/rust-lang/crates.io-index"
768 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
769 | 
770 | [[package]]
771 | name = "winapi"
772 | version = "0.2.8"
773 | source = "registry+https://github.com/rust-lang/crates.io-index"
774 | checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
775 | 
776 | [[package]]
777 | name = "winapi"
778 | version = "0.3.9"
779 | source = "registry+https://github.com/rust-lang/crates.io-index"
780 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
781 | dependencies = [
782 |  "winapi-i686-pc-windows-gnu",
783 |  "winapi-x86_64-pc-windows-gnu",
784 | ]
785 | 
786 | [[package]]
787 | name = "winapi-build"
788 | version = "0.1.1"
789 | source = "registry+https://github.com/rust-lang/crates.io-index"
790 | checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
791 | 
792 | [[package]]
793 | name = "winapi-i686-pc-windows-gnu"
794 | version = "0.4.0"
795 | source = "registry+https://github.com/rust-lang/crates.io-index"
796 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
797 | 
798 | [[package]]
799 | name = "winapi-x86_64-pc-windows-gnu"
800 | version = "0.4.0"
801 | source = "registry+https://github.com/rust-lang/crates.io-index"
802 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
803 | 


--------------------------------------------------------------------------------