├── .gitattributes ├── Makefile ├── .gitignore ├── src ├── c │ ├── noncegen_128_avx.h │ ├── noncegen_128_sse2.h │ ├── noncegen_256_avx2.h │ ├── noncegen_512_avx512f.h │ ├── common.c │ ├── common.h │ ├── .clang-format │ ├── sph_shabal.h │ ├── mshabal_128_avx.h │ ├── mshabal_128_sse2.h │ ├── mshabal_256_avx2.h │ ├── mshabal_512_avx512f.h │ ├── noncegen_128_avx.c │ ├── noncegen_128_sse2.c │ ├── noncegen_256_avx2.c │ └── noncegen_512_avx512f.c ├── buffer.rs ├── gpu_hasher.rs ├── poc_hashing.rs ├── writer.rs ├── cpu_hasher.rs ├── main.rs ├── scheduler.rs ├── shabal256.rs ├── utils.rs ├── plotter.rs └── ocl │ └── kernel.cl ├── .travis.yml ├── LICENSE ├── Cargo.toml ├── README.md └── Cargo.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | src/c/* linguist-vendored 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | format-c: 2 | cd src/c && clang-format -i * 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | /.vs/ 5 | /bin/ 6 | /obj/ 7 | /packages/ 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | .cquery_cached_index 13 | *.bat 14 | *.exe -------------------------------------------------------------------------------- /src/c/noncegen_128_avx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | void init_shabal_avx(); 7 | void noncegen_avx(char *cache, const size_t cache_size, const size_t chunk_offset, 8 | const uint64_t numeric_id, const uint64_t local_startnonce, 9 | const uint64_t local_nonces); 10 | -------------------------------------------------------------------------------- /src/c/noncegen_128_sse2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | void init_shabal_sse2(); 7 | void noncegen_sse2(char *cache, const size_t cache_size, const size_t chunk_offset, 8 | const uint64_t numeric_id, const uint64_t local_startnonce, 9 | const uint64_t local_nonces); 10 | -------------------------------------------------------------------------------- /src/c/noncegen_256_avx2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | void init_shabal_avx2(); 7 | void noncegen_avx2(char *cache, const size_t cache_size, const size_t chunk_offset, 8 | const uint64_t numeric_id, const uint64_t local_startnonce, 9 | const uint64_t local_nonces); 10 | -------------------------------------------------------------------------------- /src/c/noncegen_512_avx512f.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | void init_shabal_avx512f(); 7 | void noncegen_avx512(char *cache, const size_t cache_size, const size_t chunk_offset, 8 | const uint64_t numeric_id, const uint64_t local_startnonce, 9 | const uint64_t local_nonces); 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | 5 | matrix: 6 | include: 7 | - os: linux 8 | dist: trusty 9 | sudo: required 10 | addons: 11 | apt: 12 | sources: 13 | - ubuntu-toolchain-r-test 14 | packages: 15 | - g++-4.9 16 | env: 17 | - CC=gcc-4.9 18 | - os: osx 19 | 20 | fast_finish: true 21 | 22 | -------------------------------------------------------------------------------- /src/c/common.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | 4 | void write_seed(char seed[32], uint64_t numeric_id) { 5 | numeric_id = bswap_64(numeric_id); 6 | memmove(&seed[0], &numeric_id, 8); 7 | memset(&seed[8], 0, 8); 8 | seed[16] = -128; // shabal message termination bit 9 | memset(&seed[17], 0, 15); 10 | } 11 | 12 | void write_term(char term[32]) { 13 | term[0] = -128; // shabal message termination bit 14 | memset(&term[1], 0, 31); 15 | } 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 PoC Consortium 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "engraver" 3 | version = "2.5.0" 4 | license = "GPL-3.0" 5 | authors = ["PoC Consortium "] 6 | description = """ 7 | Engraver - a PoC2 plotter written in Rust 8 | """ 9 | repository = "https://github.com/PoC-Consortium/engraver" 10 | documentation = "https://github.com/PoC-Consortium/engraver" 11 | keywords = ["poc2", "plotter", "rust","cryptocurrency"] 12 | readme = "README.md" 13 | edition = "2018" 14 | 15 | [features] 16 | opencl = ["ocl-core"] 17 | simd=[] 18 | 19 | [dependencies] 20 | crossbeam-channel = "0.3.6" 21 | ocl-core = { version = "0.11.1", optional = true } 22 | clap = "2.32.0" 23 | raw-cpuid = "6.1.0" 24 | sys-info = "0.5.6" 25 | cfg-if = "0.1.6" 26 | pbr = "1.0.1" 27 | humanize-rs = "0.1.5" 28 | libc = "0.2.46" 29 | rayon = "1.0.3" 30 | core_affinity = "0.5.9" 31 | stopwatch = "0.0.7" 32 | fs2 = "0.4.3" 33 | page_size = "0.4.1" 34 | aligned_alloc = "0.1.3" 35 | 36 | [target.'cfg(linux)'.dependencies] 37 | thread-priority = "0.1.0" 38 | 39 | [target.'cfg(windows)'.dependencies] 40 | winapi = { version = "0.3", features = ["std","fileapi","securitybaseapi"] } 41 | 42 | [build-dependencies] 43 | cc = "1.0" 44 | 45 | [dev-dependencies] 46 | rust-crypto = "0.2.36" 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![Build Status](https://travis-ci.org/PoC-Consortium/engraver.svg?branch=master)](https://travis-ci.org/PoC-Consortium/engraver) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 4 | 5 | # Engraver - PoC2 plotter in Rust 6 | 7 | ### Features 8 | - windows, linux, unix & macOS 9 | - x86 32&64bit 10 | - direct and async i/o 11 | - SIMD support: sse2, avx, avx2, avx512f 12 | - gpu support 13 | - fastest plotter there is 14 | 15 | ### Requirements 16 | - new version of rust [stable toolchain] 17 | 18 | ### Compile, test, ... 19 | 20 | Binaries are in **target/debug** or **target/release** depending on optimization. 21 | 22 | ``` shell 23 | # build debug und run directly 24 | cargo run [--features=opencl] 25 | 26 | # build debug (unoptimized) 27 | cargo build [--features=opencl] 28 | 29 | # build release (optimized) 30 | cargo build --release [--features=opencl] 31 | ``` 32 | 33 | ### Run 34 | 35 | ```shell 36 | engraver --help 37 | ``` 38 | 39 | ### Donate 40 | * JohnnyDeluxe: BURST-S338-R6VC-LTFA-2GC6G 41 | - shabal optimizations 42 | - windows support 43 | * bold: BURST-8V9Y-58B4-RVWP-8HQAV 44 | - architecture 45 | - linux support 46 | 47 | -------------------------------------------------------------------------------- /src/buffer.rs: -------------------------------------------------------------------------------- 1 | use aligned_alloc::{aligned_alloc, aligned_free}; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | pub struct PageAlignedByteBuffer { 5 | data: Option>>>, 6 | pointer: *mut (), 7 | } 8 | 9 | impl PageAlignedByteBuffer { 10 | pub fn new(buffer_size: usize) -> Self { 11 | let pointer = aligned_alloc(buffer_size, page_size::get()); 12 | let data: Vec; 13 | unsafe { 14 | data = Vec::from_raw_parts(pointer as *mut u8, buffer_size, buffer_size); 15 | } 16 | PageAlignedByteBuffer { 17 | data: Some(Arc::new(Mutex::new(data))), 18 | pointer, 19 | } 20 | } 21 | 22 | pub fn get_buffer(&self) -> Arc>> { 23 | self.data.as_ref().unwrap().clone() 24 | } 25 | } 26 | 27 | impl Drop for PageAlignedByteBuffer { 28 | fn drop(&mut self) { 29 | std::mem::forget(self.data.take().unwrap()); 30 | unsafe { 31 | aligned_free(self.pointer); 32 | } 33 | } 34 | } 35 | 36 | unsafe impl Send for PageAlignedByteBuffer {} 37 | 38 | #[cfg(test)] 39 | mod buffer_tests { 40 | use super::PageAlignedByteBuffer; 41 | 42 | #[test] 43 | fn buffer_creation_destruction_test() { 44 | { 45 | let _test = PageAlignedByteBuffer::new(1024 * 1024); 46 | } 47 | assert!(true); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/c/common.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #pragma once 4 | 5 | #ifdef _MSC_VER 6 | 7 | #include 8 | #define bswap_32(x) _byteswap_ulong(x) 9 | #define bswap_64(x) _byteswap_uint64(x) 10 | 11 | #elif defined(__APPLE__) 12 | 13 | // Mac OS X / Darwin features 14 | #include 15 | #define bswap_32(x) OSSwapInt32(x) 16 | #define bswap_64(x) OSSwapInt64(x) 17 | 18 | #elif defined(__sun) || defined(sun) 19 | 20 | #include 21 | #define bswap_32(x) BSWAP_32(x) 22 | #define bswap_64(x) BSWAP_64(x) 23 | 24 | #elif defined(__FreeBSD__) 25 | 26 | #include 27 | #define bswap_32(x) bswap32(x) 28 | #define bswap_64(x) bswap64(x) 29 | 30 | #elif defined(__OpenBSD__) 31 | 32 | #include 33 | #define bswap_32(x) swap32(x) 34 | #define bswap_64(x) swap64(x) 35 | 36 | #elif defined(__NetBSD__) 37 | 38 | #include 39 | #include 40 | #if defined(__BSWAP_RENAME) && !defined(__bswap_32) 41 | #define bswap_32(x) bswap32(x) 42 | #define bswap_64(x) bswap64(x) 43 | #endif 44 | 45 | #else 46 | 47 | #include 48 | 49 | #endif 50 | 51 | #define HASH_SIZE 32 52 | #define HASH_CAP 4096 53 | #define NUM_SCOOPS 4096 54 | #define SCOOP_SIZE 64 55 | #define NONCE_SIZE (HASH_CAP * SCOOP_SIZE) // 4096*64 56 | 57 | void write_seed(char seed[32], uint64_t numeric_id); 58 | 59 | void write_term(char term[32]); 60 | 61 | #define SET_BEST_DEADLINE(d, o) \ 62 | if ((d) < *best_deadline) { \ 63 | *best_deadline = (d); \ 64 | *best_offset = (o); \ 65 | } 66 | -------------------------------------------------------------------------------- /src/gpu_hasher.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_hasher::SafePointer; 2 | use crate::ocl::{gpu_hash, gpu_hash_and_transfer_to_host, gpu_transfer_to_host, GpuContext}; 3 | use crossbeam_channel::Receiver; 4 | use std::sync::mpsc::Sender; 5 | use std::sync::{Arc, Mutex}; 6 | 7 | pub struct GpuTask { 8 | pub cache: SafePointer, 9 | pub cache_size: u64, 10 | pub chunk_offset: u64, 11 | pub numeric_id: u64, 12 | pub local_startnonce: u64, 13 | pub local_nonces: u64, 14 | } 15 | 16 | pub fn create_gpu_hasher_thread( 17 | gpu_id: u8, 18 | gpu_context: Arc>, 19 | tx: Sender<(u8, u8, u64)>, 20 | rx_hasher_task: Receiver>, 21 | ) -> impl FnOnce() { 22 | move || { 23 | let mut first_run = true; 24 | let mut buffer_id = 0u8; 25 | let mut last_task = GpuTask { 26 | cache: SafePointer { ptr: &mut 0u8 }, 27 | cache_size: 0, 28 | chunk_offset: 0, 29 | numeric_id: 0, 30 | local_startnonce: 0, 31 | local_nonces: 0, 32 | }; 33 | for task in rx_hasher_task { 34 | // check if new task or termination 35 | match task { 36 | // new task 37 | Some(task) => { 38 | // first run - just hash 39 | if first_run { 40 | if task.local_nonces != 0 { 41 | first_run = false; 42 | gpu_hash(&gpu_context, &task); 43 | buffer_id = 1 - buffer_id; 44 | last_task = task; 45 | tx.send((gpu_id, 1u8, 0)) 46 | .expect("GPU task can't communicate with scheduler thread."); 47 | } 48 | // last run - just transfer 49 | } else if task.local_nonces == 0 { 50 | gpu_transfer_to_host(&gpu_context, buffer_id, &last_task); 51 | first_run = true; 52 | buffer_id = 0; 53 | tx.send((gpu_id, 0u8, last_task.local_nonces)) 54 | .expect("GPU task can't communicate with scheduler thread."); 55 | // normal run - hash and transfer async 56 | } else { 57 | gpu_hash_and_transfer_to_host(&gpu_context, buffer_id, &task, &last_task); 58 | buffer_id = 1 - buffer_id; 59 | tx.send((gpu_id, 0u8, last_task.local_nonces)) 60 | .expect("GPU task can't communicate with scheduler thread."); 61 | last_task = task; 62 | tx.send((gpu_id, 1u8, 0)) 63 | .expect("GPU task can't communicate with scheduler thread."); 64 | } 65 | } 66 | // termination 67 | None => { 68 | break; 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/c/.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: true 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 100 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: true 55 | DisableFormat: false 56 | ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^' 65 | Priority: 2 66 | - Regex: '^<.*\.h>' 67 | Priority: 1 68 | - Regex: '^<.*' 69 | Priority: 2 70 | - Regex: '.*' 71 | Priority: 3 72 | IncludeIsMainRegex: '([-_](test|unittest))?$' 73 | IndentCaseLabels: true 74 | IndentPPDirectives: None 75 | IndentWidth: 4 76 | IndentWrappedFunctionNames: false 77 | JavaScriptQuotes: Leave 78 | JavaScriptWrapImports: true 79 | KeepEmptyLinesAtTheStartOfBlocks: false 80 | MacroBlockBegin: '' 81 | MacroBlockEnd: '' 82 | MaxEmptyLinesToKeep: 1 83 | NamespaceIndentation: None 84 | ObjCBlockIndentWidth: 2 85 | ObjCSpaceAfterProperty: false 86 | ObjCSpaceBeforeProtocolList: false 87 | PenaltyBreakAssignment: 2 88 | PenaltyBreakBeforeFirstCallParameter: 1 89 | PenaltyBreakComment: 300 90 | PenaltyBreakFirstLessLess: 120 91 | PenaltyBreakString: 1000 92 | PenaltyExcessCharacter: 1000000 93 | PenaltyReturnTypeOnItsOwnLine: 200 94 | PointerAlignment: Left 95 | RawStringFormats: 96 | - Delimiter: pb 97 | Language: TextProto 98 | BasedOnStyle: google 99 | ReflowComments: true 100 | SortIncludes: true 101 | SortUsingDeclarations: true 102 | SpaceAfterCStyleCast: false 103 | SpaceAfterTemplateKeyword: true 104 | SpaceBeforeAssignmentOperators: true 105 | SpaceBeforeParens: ControlStatements 106 | SpaceInEmptyParentheses: false 107 | SpacesBeforeTrailingComments: 2 108 | SpacesInAngles: false 109 | SpacesInContainerLiterals: true 110 | SpacesInCStyleCastParentheses: false 111 | SpacesInParentheses: false 112 | SpacesInSquareBrackets: false 113 | Standard: Auto 114 | TabWidth: 8 115 | UseTab: Never 116 | ... 117 | -------------------------------------------------------------------------------- /src/poc_hashing.rs: -------------------------------------------------------------------------------- 1 | use crate::shabal256::shabal256_fast; 2 | 3 | const HASH_SIZE: usize = 32; 4 | const HASH_CAP: usize = 4096; 5 | const NUM_SCOOPS: usize = 4096; 6 | const SCOOP_SIZE: usize = 64; 7 | const NONCE_SIZE: usize = NUM_SCOOPS * SCOOP_SIZE; 8 | const MESSAGE_SIZE: usize = 16; 9 | 10 | // cache: cache to save to 11 | // local_num: thread number 12 | // numeric_id: numeric account id 13 | // loc_startnonce nonce to start generation at 14 | // local_nonces: number of nonces to generate 15 | pub fn noncegen_rust( 16 | cache: &mut [u8], 17 | cache_offset: usize, 18 | numeric_id: u64, 19 | local_startnonce: u64, 20 | local_nonces: u64, 21 | ) { 22 | let numeric_id: [u32; 2] = unsafe { std::mem::transmute(numeric_id.to_be()) }; 23 | 24 | let mut buffer = [0u8; NONCE_SIZE]; 25 | let mut final_buffer = [0u8; HASH_SIZE]; 26 | 27 | // prepare termination strings 28 | let mut t1 = [0u32; MESSAGE_SIZE]; 29 | t1[0..2].clone_from_slice(&numeric_id); 30 | t1[4] = 0x80; 31 | 32 | let mut t2 = [0u32; MESSAGE_SIZE]; 33 | t2[8..10].clone_from_slice(&numeric_id); 34 | t2[12] = 0x80; 35 | 36 | let mut t3 = [0u32; MESSAGE_SIZE]; 37 | t3[0] = 0x80; 38 | 39 | for n in 0..local_nonces { 40 | // generate nonce numbers & change endianness 41 | let nonce: [u32; 2] = unsafe { std::mem::transmute((local_startnonce + n).to_be()) }; 42 | 43 | // store nonce numbers in relevant termination strings 44 | t1[2..4].clone_from_slice(&nonce); 45 | t2[10..12].clone_from_slice(&nonce); 46 | 47 | // start shabal rounds 48 | 49 | // 3 cases: first 128 rounds uses case 1 or 2, after that case 3 50 | // case 1: first 128 rounds, hashes are even: use termination string 1 51 | // case 2: first 128 rounds, hashes are odd: use termination string 2 52 | // case 3: round > 128: use termination string 3 53 | // round 1 54 | let hash = shabal256_fast(&[], &t1); 55 | 56 | buffer[NONCE_SIZE - HASH_SIZE..NONCE_SIZE].clone_from_slice(&hash); 57 | let hash = unsafe { std::mem::transmute::<[u8; 32], [u32; 8]>(hash) }; 58 | 59 | // store first hash into smart termination string 2 60 | t2[0..8].clone_from_slice(&hash); 61 | // round 2 - 128 62 | for i in (NONCE_SIZE - HASH_CAP + HASH_SIZE..=NONCE_SIZE - HASH_SIZE) 63 | .rev() 64 | .step_by(HASH_SIZE) 65 | { 66 | // check if msg can be divided into 512bit packages without a 67 | // remainder 68 | if i % 64 == 0 { 69 | // last msg = seed + termination 70 | let hash = &shabal256_fast(&buffer[i..NONCE_SIZE], &t1); 71 | buffer[i - HASH_SIZE..i].clone_from_slice(hash); 72 | } else { 73 | // last msg = 256 bit data + seed + termination 74 | let hash = &shabal256_fast(&buffer[i..NONCE_SIZE], &t2); 75 | buffer[i - HASH_SIZE..i].clone_from_slice(hash); 76 | } 77 | } 78 | 79 | // round 128-8192 80 | for i in (HASH_SIZE..=NONCE_SIZE - HASH_CAP).rev().step_by(HASH_SIZE) { 81 | let hash = &shabal256_fast(&buffer[i..i + HASH_CAP], &t3); 82 | buffer[i - HASH_SIZE..i].clone_from_slice(hash); 83 | } 84 | 85 | // generate final hash 86 | final_buffer.clone_from_slice(&shabal256_fast(&buffer[0..NONCE_SIZE], &t1)); 87 | 88 | // XOR with final 89 | for i in 0..NONCE_SIZE { 90 | buffer[i] ^= final_buffer[i % HASH_SIZE]; 91 | } 92 | 93 | // PoC2 shuffle 94 | let cache_size = cache.len() / NONCE_SIZE; 95 | for i in 0..NUM_SCOOPS { 96 | let offset = i * cache_size * SCOOP_SIZE + (n as usize + cache_offset) * SCOOP_SIZE; 97 | cache[offset..offset + HASH_SIZE] 98 | .clone_from_slice(&buffer[i * SCOOP_SIZE..i * SCOOP_SIZE + HASH_SIZE]); 99 | let mirror_offset = (4095 - i) * cache_size * SCOOP_SIZE 100 | + (n as usize + cache_offset) * SCOOP_SIZE 101 | + HASH_SIZE; 102 | cache[mirror_offset..mirror_offset + HASH_SIZE].clone_from_slice( 103 | &buffer[i * SCOOP_SIZE + HASH_SIZE..i * SCOOP_SIZE + 2 * HASH_SIZE], 104 | ); 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/c/sph_shabal.h: -------------------------------------------------------------------------------- 1 | /* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ 2 | /** 3 | * Shabal interface. Shabal is a family of functions which differ by 4 | * their output size; this implementation defines Shabal for output 5 | * sizes 192, 224, 256, 384 and 512 bits. 6 | * 7 | * ==========================(LICENSE BEGIN)============================ 8 | * 9 | * Copyright (c) 2007-2010 Projet RNRT SAPHIR 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining 12 | * a copy of this software and associated documentation files (the 13 | * "Software"), to deal in the Software without restriction, including 14 | * without limitation the rights to use, copy, modify, merge, publish, 15 | * distribute, sublicense, and/or sell copies of the Software, and to 16 | * permit persons to whom the Software is furnished to do so, subject to 17 | * the following conditions: 18 | * 19 | * The above copyright notice and this permission notice shall be 20 | * included in all copies or substantial portions of the Software. 21 | * 22 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 25 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 27 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 28 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | * 30 | * ===========================(LICENSE END)============================= 31 | * 32 | * @file sph_shabal.h 33 | * @author Thomas Pornin 34 | */ 35 | 36 | #ifndef SPH_SHABAL_H__ 37 | #define SPH_SHABAL_H__ 38 | 39 | #include 40 | #include "sph_types.h" 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | /** 47 | * Output size (in bits) for Shabal-256. 48 | */ 49 | #define SPH_SIZE_shabal256 256 50 | 51 | /** 52 | * This structure is a context for Shabal computations: it contains the 53 | * intermediate values and some data from the last entered block. Once 54 | * a Shabal computation has been performed, the context can be reused for 55 | * another computation. 56 | * 57 | * The contents of this structure are private. A running Shabal computation 58 | * can be cloned by copying the context (e.g. with a simple 59 | * memcpy()). 60 | */ 61 | typedef struct { 62 | #ifndef DOXYGEN_IGNORE 63 | unsigned char buf[64]; /* first field, for alignment */ 64 | size_t ptr; 65 | sph_u32 A[12], B[16], C[16]; 66 | sph_u32 Whigh, Wlow; 67 | #endif 68 | } sph_shabal_context; 69 | 70 | /** 71 | * Type for a Shabal-256 context (identical to the common context). 72 | */ 73 | typedef sph_shabal_context sph_shabal256_context; 74 | 75 | /** 76 | * Initialize a Shabal-256 context. This process performs no memory 77 | * allocation. 78 | * 79 | * @param cc the Shabal-256 context (pointer to a 80 | * sph_shabal256_context) 81 | */ 82 | void sph_shabal256_init(sph_shabal_context* cc); 83 | 84 | /** 85 | * Process some data bytes. It is acceptable that len is zero 86 | * (in which case this function does nothing). 87 | * 88 | * @param cc the Shabal-256 context 89 | * @param data the input data 90 | * @param len the input data length (in bytes) 91 | */ 92 | void sph_shabal256(void* cc, const unsigned char* data, size_t len); 93 | 94 | /** 95 | * Terminate the current Shabal-256 computation and output the result into 96 | * the provided buffer. The destination buffer must be wide enough to 97 | * accomodate the result (32 bytes). The context is automatically 98 | * reinitialized. 99 | * 100 | * @param cc the Shabal-256 context 101 | * @param dst the destination buffer 102 | */ 103 | void sph_shabal256_close(void* cc, void* dst); 104 | 105 | /** 106 | * Add a few additional bits (0 to 7) to the current computation, then 107 | * terminate it and output the result in the provided buffer, which must 108 | * be wide enough to accomodate the result (32 bytes). If bit number i 109 | * in ub has value 2^i, then the extra bits are those 110 | * numbered 7 downto 8-n (this is the big-endian convention at the byte 111 | * level). The context is automatically reinitialized. 112 | * 113 | * @param cc the Shabal-256 context 114 | * @param ub the extra bits 115 | * @param n the number of extra bits (0 to 7) 116 | * @param dst the destination buffer 117 | */ 118 | void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst); 119 | 120 | /* 121 | * optimised Shabal routine for PoC plotting and hashing 122 | */ 123 | void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num); 124 | 125 | /* 126 | * optimised Shabal routine for PoC mining 127 | */ 128 | void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst); 129 | 130 | #ifdef __cplusplus 131 | } 132 | #endif 133 | #endif 134 | -------------------------------------------------------------------------------- /src/writer.rs: -------------------------------------------------------------------------------- 1 | use crate::plotter::{PlotterTask, NONCE_SIZE, SCOOP_SIZE}; 2 | use crate::buffer::PageAlignedByteBuffer; 3 | use crate::utils::{open, open_r, open_using_direct_io}; 4 | use crossbeam_channel::{Receiver, Sender}; 5 | use std::cmp::min; 6 | use std::io::{Read, Seek, SeekFrom, Write, Error, ErrorKind}; 7 | use std::path::Path; 8 | use std::sync::Arc; 9 | 10 | const TASK_SIZE: u64 = 16384; 11 | 12 | pub fn create_writer_thread( 13 | task: Arc, 14 | mut nonces_written: u64, 15 | mut pb: Option>, 16 | rx_buffers_to_writer: Receiver, 17 | tx_empty_buffers: Sender, 18 | ) -> impl FnOnce() { 19 | move || { 20 | for buffer in rx_buffers_to_writer { 21 | let mut_bs = &buffer.get_buffer(); 22 | let bs = mut_bs.lock().unwrap(); 23 | let buffer_size = (*bs).len() as u64; 24 | let nonces_to_write = min(buffer_size / NONCE_SIZE, task.nonces - nonces_written); 25 | 26 | let filename = Path::new(&task.output_path).join(format!( 27 | "{}_{}_{}", 28 | task.numeric_id, task.start_nonce, task.nonces 29 | )); 30 | if !task.benchmark { 31 | let file = if task.direct_io { 32 | open_using_direct_io(&filename) 33 | } else { 34 | open(&filename) 35 | }; 36 | 37 | let mut file = file.unwrap(); 38 | 39 | for scoop in 0..4096 { 40 | let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE; 41 | seek_addr += nonces_written as u64 * SCOOP_SIZE; 42 | 43 | file.seek(SeekFrom::Start(seek_addr)).unwrap(); 44 | 45 | let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE; 46 | for _ in 0..nonces_to_write / TASK_SIZE { 47 | file.write_all( 48 | &bs[local_addr as usize 49 | ..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize], 50 | ) 51 | .unwrap(); 52 | 53 | local_addr += TASK_SIZE * SCOOP_SIZE; 54 | } 55 | 56 | // write remainder 57 | if nonces_to_write % TASK_SIZE > 0 { 58 | file.write_all( 59 | &bs[local_addr as usize 60 | ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE) 61 | as usize], 62 | ) 63 | .unwrap(); 64 | } 65 | 66 | if (scoop + 1) % 128 == 0 { 67 | match &mut pb { 68 | Some(pb) => { 69 | pb.add(nonces_to_write * SCOOP_SIZE * 128); 70 | } 71 | None => (), 72 | } 73 | } 74 | } 75 | } 76 | nonces_written += nonces_to_write; 77 | 78 | // thread end 79 | if task.nonces == nonces_written { 80 | match &mut pb { 81 | Some(pb) => { 82 | pb.finish_print("Writer done."); 83 | } 84 | None => (), 85 | } 86 | tx_empty_buffers.send(buffer).unwrap(); 87 | break; 88 | } 89 | 90 | if !task.benchmark { 91 | if write_resume_info(&filename, nonces_written).is_err() { 92 | println!("Error: couldn't write resume info"); 93 | } 94 | } 95 | tx_empty_buffers.send(buffer).unwrap(); 96 | } 97 | } 98 | } 99 | 100 | pub fn read_resume_info(file: &Path) -> Result { 101 | let mut file = open_r(&file)?; 102 | file.seek(SeekFrom::End(-8))?; 103 | 104 | 105 | let mut progress = [0u8; 4]; 106 | let mut double_monkey = [0u8; 4]; 107 | 108 | file.read_exact(&mut progress[0..4])?; 109 | file.read_exact(&mut double_monkey[0..4])?; 110 | 111 | if double_monkey == [0xAF, 0xFE, 0xAF, 0xFE] { 112 | Ok(u64::from(as_u32_le(progress))) 113 | } else { 114 | Err(Error::new(ErrorKind::Other, "End marker not found")) 115 | } 116 | } 117 | 118 | pub fn write_resume_info(file: &Path, nonces_written: u64) -> Result<(), Error> { 119 | let mut file = open(&file)?; 120 | file.seek(SeekFrom::End(-8))?; 121 | 122 | let progress = as_u8_le(nonces_written as u32); 123 | let double_monkey = [0xAF, 0xFE, 0xAF, 0xFE]; 124 | 125 | file.write_all(&progress[0..4])?; 126 | file.write_all(&double_monkey[0..4])?; 127 | Ok(()) 128 | } 129 | 130 | fn as_u32_le(array: [u8; 4]) -> u32 { 131 | u32::from(array[0]) 132 | + (u32::from(array[1]) << 8) 133 | + (u32::from(array[2]) << 16) 134 | + (u32::from(array[3]) << 24) 135 | } 136 | 137 | fn as_u8_le(x: u32) -> [u8; 4] { 138 | let b1: u8 = (x & 0xff) as u8; 139 | let b2: u8 = ((x >> 8) & 0xff) as u8; 140 | let b3: u8 = ((x >> 16) & 0xff) as u8; 141 | let b4: u8 = ((x >> 24) & 0xff) as u8; 142 | [b1, b2, b3, b4] 143 | } 144 | -------------------------------------------------------------------------------- /src/c/mshabal_128_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A parallel implementation of Shabal, for platforms with AVX. 3 | * 4 | * This is the header file for an implementation of the Shabal family 5 | * of hash functions, designed for maximum parallel speed. It processes 6 | * up to four instances of Shabal in parallel, using the AVX unit. 7 | * Total bandwidth appear to be up to twice that of a plain 32-bit 8 | * Shabal implementation. 9 | * 10 | * A computation uses a mshabal_context structure. That structure is 11 | * supposed to be allocated and released by the caller, e.g. as a 12 | * local or global variable, or on the heap. The structure contents 13 | * are initialized with mshabal_init(). Once the structure has been 14 | * initialized, data is input as chunks, with the mshabal() functions. 15 | * Chunks for the four parallel instances are provided simultaneously 16 | * and must have the same length. It is allowed not to use some of the 17 | * instances; the corresponding parameters in mshabal() are then NULL. 18 | * However, using NULL as a chunk for one of the instances effectively 19 | * deactivates that instance; this cannot be used to "skip" a chunk 20 | * for one instance. 21 | * 22 | * The computation is finalized with mshabal_close(). Some extra message 23 | * bits (0 to 7) can be input. The outputs of the four parallel instances 24 | * are written in the provided buffers. There again, NULL can be 25 | * provided as parameter is the output of one of the instances is not 26 | * needed. 27 | * 28 | * A mshabal_context instance is self-contained and holds no pointer. 29 | * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as 30 | * proper alignment is maintained). This implementation uses no state 31 | * variable beyond the context instance; this, it is thread-safe and 32 | * reentrant. 33 | * 34 | * The Shabal specification defines Shabal with output sizes of 192, 35 | * 224, 256, 384 and 512 bits. This code accepts all those sizes, as 36 | * well as any output size which is multiple of 32, between 32 and 37 | * 512 (inclusive). 38 | * 39 | * Parameters are not validated. Thus, undefined behaviour occurs if 40 | * any of the "shall" or "must" clauses in this documentation is 41 | * violated. 42 | * 43 | * 44 | * (c) 2010 SAPHIR project. This software is provided 'as-is', without 45 | * any epxress or implied warranty. In no event will the authors be held 46 | * liable for any damages arising from the use of this software. 47 | * 48 | * Permission is granted to anyone to use this software for any purpose, 49 | * including commercial applications, and to alter it and redistribute it 50 | * freely, subject to no restriction. 51 | * 52 | * Technical remarks and questions can be addressed to: 53 | * 54 | */ 55 | 56 | #ifndef MSHABAL_H__ 57 | #define MSHABAL_H__ 58 | 59 | #include 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * We need an integer type with width 32-bit or more (preferably, with 67 | * a width of exactly 32 bits). 68 | */ 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L 70 | #include 71 | #ifdef UINT32_MAX 72 | typedef uint32_t mshabal_u32; 73 | #else 74 | typedef uint_fast32_t mshabal_u32; 75 | #endif 76 | #else 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF 78 | typedef unsigned int mshabal_u32; 79 | #else 80 | typedef unsigned long mshabal_u32; 81 | #endif 82 | #endif 83 | 84 | #define MSHABAL128_VECTOR_SIZE 4 85 | 86 | /* 87 | * The context structure for a Shabal computation. Contents are 88 | * private. Such a structure should be allocated and released by 89 | * the caller, in any memory area. 90 | */ 91 | typedef struct { 92 | unsigned char buf0[64]; 93 | unsigned char buf1[64]; 94 | unsigned char buf2[64]; 95 | unsigned char buf3[64]; 96 | size_t ptr; 97 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; 98 | mshabal_u32 Whigh, Wlow; 99 | unsigned out_size; 100 | } mshabal128_context; 101 | 102 | #pragma pack(1) 103 | typedef struct { 104 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; 105 | mshabal_u32 Whigh, Wlow; 106 | unsigned out_size; 107 | } mshabal128_context_fast; 108 | #pragma pack() 109 | 110 | /* 111 | * Initialize a context structure. The output size must be a multiple 112 | * of 32, between 32 and 512 (inclusive). The output size is expressed 113 | * in bits. 114 | */ 115 | void mshabal_init_avx(mshabal128_context *sc, unsigned out_size); 116 | 117 | /* 118 | * Process some more data bytes; four chunks of data, pointed to by 119 | * data0, data1, data2 and data3, are processed. The four chunks have 120 | * the same length of "len" bytes. For efficiency, it is best if data is 121 | * processed by medium-sized chunks, e.g. a few kilobytes at a time. 122 | * 123 | * The "len" data bytes shall all be accessible. If "len" is zero, this 124 | * this function does nothing and ignores the data* arguments. 125 | * Otherwise, if one of the data* argument is NULL, then the 126 | * corresponding instance is deactivated (the final value obtained from 127 | * that instance is undefined). 128 | */ 129 | void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, 130 | const void *data3, size_t len); 131 | 132 | /* 133 | * Terminate the Shabal computation incarnated by the provided context 134 | * structure. "n" shall be a value between 0 and 7 (inclusive): this is 135 | * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and 136 | * append at the end of the input message for each of the four parallel 137 | * instances. Bits in "ub*" are taken in big-endian format: first bit is 138 | * the one of numerical value 128, second bit has numerical value 64, 139 | * and so on. Other bits in "ub*" are ignored. For most applications, 140 | * input messages will consist in sequence of bytes, and the "ub*" and 141 | * "n" parameters will be zero. 142 | * 143 | * The Shabal output for each of the parallel instances is written out 144 | * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. 145 | * These areas shall be wide enough to accomodate the result (result 146 | * size was specified as parameter to mshabal_init()). It is acceptable 147 | * to use NULL for any of those pointers, if the result from the 148 | * corresponding instance is not needed. 149 | * 150 | * After this call, the context structure is invalid. The caller shall 151 | * release it, or reinitialize it with mshabal_init(). The mshabal_close() 152 | * function does NOT imply a hidden call to mshabal_init(). 153 | */ 154 | void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, 155 | unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, 156 | void *dst3); 157 | 158 | /* 159 | * optimised Shabal routine for PoC mining 160 | */ 161 | void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, 162 | void *dst1, void *dst2, void *dst3); 163 | 164 | /* 165 | * optimised Shabal routine for PoC plotting and hashing 166 | */ 167 | void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, 168 | void *dst, unsigned num); 169 | 170 | #ifdef __cplusplus 171 | } 172 | #endif 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /src/c/mshabal_128_sse2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A parallel implementation of Shabal, for platforms with SSE2. 3 | * 4 | * This is the header file for an implementation of the Shabal family 5 | * of hash functions, designed for maximum parallel speed. It processes 6 | * up to four instances of Shabal in parallel, using the SSE2 unit. 7 | * Total bandwidth appear to be up to twice that of a plain 32-bit 8 | * Shabal implementation. 9 | * 10 | * A computation uses a mshabal_context structure. That structure is 11 | * supposed to be allocated and released by the caller, e.g. as a 12 | * local or global variable, or on the heap. The structure contents 13 | * are initialized with mshabal_init(). Once the structure has been 14 | * initialized, data is input as chunks, with the mshabal() functions. 15 | * Chunks for the four parallel instances are provided simultaneously 16 | * and must have the same length. It is allowed not to use some of the 17 | * instances; the corresponding parameters in mshabal() are then NULL. 18 | * However, using NULL as a chunk for one of the instances effectively 19 | * deactivates that instance; this cannot be used to "skip" a chunk 20 | * for one instance. 21 | * 22 | * The computation is finalized with mshabal_close(). Some extra message 23 | * bits (0 to 7) can be input. The outputs of the four parallel instances 24 | * are written in the provided buffers. There again, NULL can be 25 | * provided as parameter is the output of one of the instances is not 26 | * needed. 27 | * 28 | * A mshabal_context instance is self-contained and holds no pointer. 29 | * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as 30 | * proper alignment is maintained). This implementation uses no state 31 | * variable beyond the context instance; this, it is thread-safe and 32 | * reentrant. 33 | * 34 | * The Shabal specification defines Shabal with output sizes of 192, 35 | * 224, 256, 384 and 512 bits. This code accepts all those sizes, as 36 | * well as any output size which is multiple of 32, between 32 and 37 | * 512 (inclusive). 38 | * 39 | * Parameters are not validated. Thus, undefined behaviour occurs if 40 | * any of the "shall" or "must" clauses in this documentation is 41 | * violated. 42 | * 43 | * 44 | * (c) 2010 SAPHIR project. This software is provided 'as-is', without 45 | * any epxress or implied warranty. In no event will the authors be held 46 | * liable for any damages arising from the use of this software. 47 | * 48 | * Permission is granted to anyone to use this software for any purpose, 49 | * including commercial applications, and to alter it and redistribute it 50 | * freely, subject to no restriction. 51 | * 52 | * Technical remarks and questions can be addressed to: 53 | * 54 | */ 55 | 56 | #ifndef MSHABAL_H__ 57 | #define MSHABAL_H__ 58 | 59 | #include 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * We need an integer type with width 32-bit or more (preferably, with 67 | * a width of exactly 32 bits). 68 | */ 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L 70 | #include 71 | #ifdef UINT32_MAX 72 | typedef uint32_t mshabal_u32; 73 | #else 74 | typedef uint_fast32_t mshabal_u32; 75 | #endif 76 | #else 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF 78 | typedef unsigned int mshabal_u32; 79 | #else 80 | typedef unsigned long mshabal_u32; 81 | #endif 82 | #endif 83 | 84 | #define MSHABAL128_VECTOR_SIZE 4 85 | 86 | /* 87 | * The context structure for a Shabal computation. Contents are 88 | * private. Such a structure should be allocated and released by 89 | * the caller, in any memory area. 90 | */ 91 | typedef struct { 92 | unsigned char buf0[64]; 93 | unsigned char buf1[64]; 94 | unsigned char buf2[64]; 95 | unsigned char buf3[64]; 96 | size_t ptr; 97 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; 98 | mshabal_u32 Whigh, Wlow; 99 | unsigned out_size; 100 | } mshabal128_context; 101 | 102 | #pragma pack(1) 103 | typedef struct { 104 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; 105 | mshabal_u32 Whigh, Wlow; 106 | unsigned out_size; 107 | } mshabal128_context_fast; 108 | #pragma pack() 109 | 110 | /* 111 | * Initialize a context structure. The output size must be a multiple 112 | * of 32, between 32 and 512 (inclusive). The output size is expressed 113 | * in bits. 114 | */ 115 | void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size); 116 | 117 | /* 118 | * Process some more data bytes; four chunks of data, pointed to by 119 | * data0, data1, data2 and data3, are processed. The four chunks have 120 | * the same length of "len" bytes. For efficiency, it is best if data is 121 | * processed by medium-sized chunks, e.g. a few kilobytes at a time. 122 | * 123 | * The "len" data bytes shall all be accessible. If "len" is zero, this 124 | * this function does nothing and ignores the data* arguments. 125 | * Otherwise, if one of the data* argument is NULL, then the 126 | * corresponding instance is deactivated (the final value obtained from 127 | * that instance is undefined). 128 | */ 129 | void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, 130 | const void *data3, size_t len); 131 | 132 | /* 133 | * Terminate the Shabal computation incarnated by the provided context 134 | * structure. "n" shall be a value between 0 and 7 (inclusive): this is 135 | * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and 136 | * append at the end of the input message for each of the four parallel 137 | * instances. Bits in "ub*" are taken in big-endian format: first bit is 138 | * the one of numerical value 128, second bit has numerical value 64, 139 | * and so on. Other bits in "ub*" are ignored. For most applications, 140 | * input messages will consist in sequence of bytes, and the "ub*" and 141 | * "n" parameters will be zero. 142 | * 143 | * The Shabal output for each of the parallel instances is written out 144 | * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. 145 | * These areas shall be wide enough to accomodate the result (result 146 | * size was specified as parameter to mshabal_init()). It is acceptable 147 | * to use NULL for any of those pointers, if the result from the 148 | * corresponding instance is not needed. 149 | * 150 | * After this call, the context structure is invalid. The caller shall 151 | * release it, or reinitialize it with mshabal_init(). The mshabal_close() 152 | * function does NOT imply a hidden call to mshabal_init(). 153 | */ 154 | void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, 155 | unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, 156 | void *dst3); 157 | 158 | /* 159 | * optimised Shabal routine for PoC plotting and hashing 160 | */ 161 | void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, 162 | void *dst, unsigned num); 163 | 164 | /* 165 | * optimised Shabal routine for PoC mining 166 | */ 167 | void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, 168 | void *dst1, void *dst2, void *dst3); 169 | 170 | #ifdef __cplusplus 171 | } 172 | #endif 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /src/c/mshabal_256_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A parallel implementation of Shabal, for platforms with AVX2. 3 | * 4 | * This is the header file for an implementation of the Shabal family 5 | * of hash functions, designed for maximum parallel speed. It processes 6 | * up to four instances of Shabal in parallel, using the AVX2 unit. 7 | * Total bandwidth appear to be up to twice that of a plain 32-bit 8 | * Shabal implementation. 9 | * 10 | * A computation uses a mshabal_context structure. That structure is 11 | * supposed to be allocated and released by the caller, e.g. as a 12 | * local or global variable, or on the heap. The structure contents 13 | * are initialized with mshabal_init(). Once the structure has been 14 | * initialized, data is input as chunks, with the mshabal() functions. 15 | * Chunks for the four parallel instances are provided simultaneously 16 | * and must have the same length. It is allowed not to use some of the 17 | * instances; the corresponding parameters in mshabal() are then NULL. 18 | * However, using NULL as a chunk for one of the instances effectively 19 | * deactivates that instance; this cannot be used to "skip" a chunk 20 | * for one instance. 21 | * 22 | * The computation is finalized with mshabal_close(). Some extra message 23 | * bits (0 to 7) can be input. The outputs of the four parallel instances 24 | * are written in the provided buffers. There again, NULL can be 25 | * provided as parameter is the output of one of the instances is not 26 | * needed. 27 | * 28 | * A mshabal_context instance is self-contained and holds no pointer. 29 | * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as 30 | * proper alignment is maintained). This implementation uses no state 31 | * variable beyond the context instance; this, it is thread-safe and 32 | * reentrant. 33 | * 34 | * The Shabal specification defines Shabal with output sizes of 192, 35 | * 224, 256, 384 and 512 bits. This code accepts all those sizes, as 36 | * well as any output size which is multiple of 32, between 32 and 37 | * 512 (inclusive). 38 | * 39 | * Parameters are not validated. Thus, undefined behaviour occurs if 40 | * any of the "shall" or "must" clauses in this documentation is 41 | * violated. 42 | * 43 | * 44 | * (c) 2010 SAPHIR project. This software is provided 'as-is', without 45 | * any epxress or implied warranty. In no event will the authors be held 46 | * liable for any damages arising from the use of this software. 47 | * 48 | * Permission is granted to anyone to use this software for any purpose, 49 | * including commercial applications, and to alter it and redistribute it 50 | * freely, subject to no restriction. 51 | * 52 | * Technical remarks and questions can be addressed to: 53 | * 54 | */ 55 | 56 | #ifndef MSHABAL_H__ 57 | #define MSHABAL_H__ 58 | 59 | #include 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * We need an integer type with width 32-bit or more (preferably, with 67 | * a width of exactly 32 bits). 68 | */ 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L 70 | #include 71 | #ifdef UINT32_MAX 72 | typedef uint32_t mshabal_u32; 73 | #else 74 | typedef uint_fast32_t mshabal_u32; 75 | #endif 76 | #else 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF 78 | typedef unsigned int mshabal_u32; 79 | #else 80 | typedef unsigned long mshabal_u32; 81 | #endif 82 | #endif 83 | 84 | #define MSHABAL256_VECTOR_SIZE 8 85 | 86 | /* 87 | * The context structure for a Shabal computation. Contents are 88 | * private. Such a structure should be allocated and released by 89 | * the caller, in any memory area. 90 | */ 91 | typedef struct { 92 | unsigned char buf0[64]; 93 | unsigned char buf1[64]; 94 | unsigned char buf2[64]; 95 | unsigned char buf3[64]; 96 | unsigned char buf4[64]; 97 | unsigned char buf5[64]; 98 | unsigned char buf6[64]; 99 | unsigned char buf7[64]; 100 | size_t ptr; 101 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; 102 | mshabal_u32 Whigh, Wlow; 103 | unsigned out_size; 104 | } mshabal256_context; 105 | 106 | #pragma pack(1) 107 | typedef struct { 108 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; 109 | mshabal_u32 Whigh, Wlow; 110 | unsigned out_size; 111 | } mshabal256_context_fast; 112 | #pragma pack() 113 | 114 | /* 115 | * Initialize a context structure. The output size must be a multiple 116 | * of 32, between 32 and 512 (inclusive). The output size is expressed 117 | * in bits. 118 | */ 119 | void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size); 120 | 121 | /* 122 | * Process some more data bytes; four chunks of data, pointed to by 123 | * data0, data1, data2 and data3, are processed. The four chunks have 124 | * the same length of "len" bytes. For efficiency, it is best if data is 125 | * processed by medium-sized chunks, e.g. a few kilobytes at a time. 126 | * 127 | * The "len" data bytes shall all be accessible. If "len" is zero, this 128 | * this function does nothing and ignores the data* arguments. 129 | * Otherwise, if one of the data* argument is NULL, then the 130 | * corresponding instance is deactivated (the final value obtained from 131 | * that instance is undefined). 132 | */ 133 | void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, 134 | const void *data4, const void *data5, const void *data6, const void *data7, size_t len); 135 | 136 | /* 137 | * Terminate the Shabal computation incarnated by the provided context 138 | * structure. "n" shall be a value between 0 and 7 (inclusive): this is 139 | * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and 140 | * append at the end of the input message for each of the four parallel 141 | * instances. Bits in "ub*" are taken in big-endian format: first bit is 142 | * the one of numerical value 128, second bit has numerical value 64, 143 | * and so on. Other bits in "ub*" are ignored. For most applications, 144 | * input messages will consist in sequence of bytes, and the "ub*" and 145 | * "n" parameters will be zero. 146 | * 147 | * The Shabal output for each of the parallel instances is written out 148 | * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. 149 | * These areas shall be wide enough to accomodate the result (result 150 | * size was specified as parameter to mshabal_init()). It is acceptable 151 | * to use NULL for any of those pointers, if the result from the 152 | * corresponding instance is not needed. 153 | * 154 | * After this call, the context structure is invalid. The caller shall 155 | * release it, or reinitialize it with mshabal_init(). The mshabal_close() 156 | * function does NOT imply a hidden call to mshabal_init(). 157 | */ 158 | void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, 159 | unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, 160 | unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, 161 | void *dst5, void *dst6, void *dst7); 162 | 163 | /* 164 | * optimised Shabal routine for PoC plotting and hashing 165 | */ 166 | void mshabal_hash_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, 167 | void *dst, unsigned len); 168 | 169 | /* 170 | * optimised Shabal routine for PoC mining 171 | */ 172 | void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0, 173 | void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, 174 | void *dst6, void *dst7); 175 | #ifdef __cplusplus 176 | } 177 | #endif 178 | 179 | #endif 180 | -------------------------------------------------------------------------------- /src/cpu_hasher.rs: -------------------------------------------------------------------------------- 1 | use crate::poc_hashing::noncegen_rust; 2 | use libc::{c_void, size_t}; 3 | use std::slice::from_raw_parts_mut; 4 | use std::sync::mpsc::Sender; 5 | 6 | const NUM_SCOOPS: usize = 4096; 7 | const SCOOP_SIZE: usize = 64; 8 | const NONCE_SIZE: usize = NUM_SCOOPS * SCOOP_SIZE; 9 | 10 | extern "C" { 11 | pub fn init_shabal_sse2() -> (); 12 | pub fn init_shabal_avx() -> (); 13 | pub fn init_shabal_avx2() -> (); 14 | pub fn init_shabal_avx512f() -> (); 15 | pub fn noncegen_sse2( 16 | cache: *mut c_void, 17 | cache_size: size_t, 18 | chunk_offset: size_t, 19 | numeric_ID: u64, 20 | local_startnonce: u64, 21 | local_nonces: u64, 22 | ); 23 | pub fn noncegen_avx( 24 | cache: *mut c_void, 25 | cache_size: size_t, 26 | chunk_offset: size_t, 27 | numeric_ID: u64, 28 | local_startnonce: u64, 29 | local_nonces: u64, 30 | ); 31 | pub fn noncegen_avx2( 32 | cache: *mut c_void, 33 | cache_size: size_t, 34 | chunk_offset: size_t, 35 | numeric_ID: u64, 36 | local_startnonce: u64, 37 | local_nonces: u64, 38 | ); 39 | pub fn noncegen_avx512( 40 | cache: *mut c_void, 41 | cache_size: size_t, 42 | chunk_offset: size_t, 43 | numeric_ID: u64, 44 | local_startnonce: u64, 45 | local_nonces: u64, 46 | ); 47 | } 48 | pub struct SafePointer { 49 | pub ptr: *mut u8, 50 | } 51 | unsafe impl Send for SafePointer {} 52 | unsafe impl Sync for SafePointer {} 53 | 54 | pub struct CpuTask { 55 | pub cache: SafePointer, 56 | pub cache_size: usize, 57 | pub chunk_offset: usize, 58 | pub numeric_id: u64, 59 | pub local_startnonce: u64, 60 | pub local_nonces: u64, 61 | } 62 | 63 | #[derive(Debug, Clone)] 64 | pub enum SimdExtension { 65 | AVX512f, 66 | AVX2, 67 | AVX, 68 | SSE2, 69 | None, 70 | } 71 | 72 | pub fn init_simd() -> SimdExtension { 73 | if is_x86_feature_detected!("avx512f") { 74 | unsafe { 75 | init_shabal_avx512f(); 76 | } 77 | SimdExtension::AVX512f 78 | } else if is_x86_feature_detected!("avx2") { 79 | unsafe { 80 | init_shabal_avx2(); 81 | } 82 | SimdExtension::AVX2 83 | } else if is_x86_feature_detected!("avx") { 84 | unsafe { 85 | init_shabal_avx(); 86 | } 87 | SimdExtension::AVX 88 | } else if is_x86_feature_detected!("sse2") { 89 | unsafe { 90 | init_shabal_sse2(); 91 | } 92 | SimdExtension::SSE2 93 | } else { 94 | SimdExtension::None 95 | } 96 | } 97 | 98 | pub fn hash_cpu( 99 | tx: Sender<(u8, u8, u64)>, 100 | hasher_task: CpuTask, 101 | simd_ext: SimdExtension, 102 | ) -> impl FnOnce() { 103 | move || { 104 | unsafe { 105 | match simd_ext { 106 | SimdExtension::AVX512f => noncegen_avx512( 107 | hasher_task.cache.ptr as *mut c_void, 108 | hasher_task.cache_size, 109 | hasher_task.chunk_offset, 110 | hasher_task.numeric_id, 111 | hasher_task.local_startnonce, 112 | hasher_task.local_nonces, 113 | ), 114 | SimdExtension::AVX2 => noncegen_avx2( 115 | hasher_task.cache.ptr as *mut c_void, 116 | hasher_task.cache_size, 117 | hasher_task.chunk_offset, 118 | hasher_task.numeric_id, 119 | hasher_task.local_startnonce, 120 | hasher_task.local_nonces, 121 | ), 122 | SimdExtension::AVX => noncegen_avx( 123 | hasher_task.cache.ptr as *mut c_void, 124 | hasher_task.cache_size, 125 | hasher_task.chunk_offset, 126 | hasher_task.numeric_id, 127 | hasher_task.local_startnonce, 128 | hasher_task.local_nonces, 129 | ), 130 | SimdExtension::SSE2 => noncegen_sse2( 131 | hasher_task.cache.ptr as *mut c_void, 132 | hasher_task.cache_size, 133 | hasher_task.chunk_offset, 134 | hasher_task.numeric_id, 135 | hasher_task.local_startnonce, 136 | hasher_task.local_nonces, 137 | ), 138 | _ => { 139 | let data = from_raw_parts_mut( 140 | hasher_task.cache.ptr, 141 | hasher_task.cache_size * NONCE_SIZE, 142 | ); 143 | noncegen_rust( 144 | data, 145 | hasher_task.chunk_offset, 146 | hasher_task.numeric_id, 147 | hasher_task.local_startnonce, 148 | hasher_task.local_nonces, 149 | ) 150 | } 151 | } 152 | } 153 | // report hashing done 154 | tx.send((0u8, 1u8, 0)) 155 | .expect("CPU task can't communicate with scheduler thread."); 156 | // report data in hostmem 157 | tx.send((0u8, 0u8, hasher_task.local_nonces)) 158 | .expect("CPU task can't communicate with scheduler thread."); 159 | } 160 | } 161 | 162 | #[cfg(test)] 163 | mod test { 164 | extern crate crypto; 165 | use self::crypto::digest::Digest; 166 | use self::crypto::sha2::Sha256; 167 | use super::*; 168 | use crate::plotter; 169 | 170 | #[test] 171 | fn test_noncegen() { 172 | let numeric_id = 7900104405094198526; 173 | let start_nonce = 1337; 174 | let exp_result_hash = "eebdf7dce694cbea9539f71efc362d4b72f8792def335d7157dadb09bb6d9e5f"; 175 | 176 | let check_result = |buf: &Vec| { 177 | let mut hasher = Sha256::new(); 178 | hasher.input(buf); 179 | assert_eq!(hasher.result_str(), exp_result_hash); 180 | }; 181 | 182 | if is_x86_feature_detected!("avx512f") { 183 | let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize]; 184 | unsafe { 185 | init_shabal_avx512f(); 186 | noncegen_avx512( 187 | buf.as_mut_ptr() as *mut c_void, 188 | 32, 189 | 0, 190 | numeric_id, 191 | start_nonce, 192 | 32, 193 | ); 194 | } 195 | check_result(&buf); 196 | } 197 | 198 | if is_x86_feature_detected!("avx2") { 199 | let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize]; 200 | unsafe { 201 | init_shabal_avx2(); 202 | noncegen_avx2( 203 | buf.as_mut_ptr() as *mut c_void, 204 | 32, 205 | 0, 206 | numeric_id, 207 | start_nonce, 208 | 32, 209 | ); 210 | } 211 | check_result(&buf); 212 | } 213 | 214 | if is_x86_feature_detected!("avx") { 215 | let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize]; 216 | unsafe { 217 | init_shabal_avx(); 218 | noncegen_avx( 219 | buf.as_mut_ptr() as *mut c_void, 220 | 32, 221 | 0, 222 | numeric_id, 223 | start_nonce, 224 | 32, 225 | ); 226 | } 227 | check_result(&buf); 228 | } 229 | 230 | if is_x86_feature_detected!("sse2") { 231 | let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize]; 232 | unsafe { 233 | init_shabal_sse2(); 234 | noncegen_sse2( 235 | buf.as_mut_ptr() as *mut c_void, 236 | 32, 237 | 0, 238 | numeric_id, 239 | start_nonce, 240 | 32, 241 | ); 242 | } 243 | check_result(&buf); 244 | } 245 | 246 | let mut buf = vec![0; 32 * plotter::NONCE_SIZE as usize]; 247 | noncegen_rust(&mut buf, 0, numeric_id, start_nonce, 32); 248 | check_result(&buf); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /src/c/mshabal_512_avx512f.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A parallel implementation of Shabal, for platforms with AVX512F. 3 | * 4 | * This is the header file for an implementation of the Shabal family 5 | * of hash functions, designed for maximum parallel speed. It processes 6 | * up to four instances of Shabal in parallel, using the AVX512F unit. 7 | * Total bandwidth appear to be up to twice that of a plain 32-bit 8 | * Shabal implementation. 9 | * 10 | * A computation uses a mshabal_context structure. That structure is 11 | * supposed to be allocated and released by the caller, e.g. as a 12 | * local or global variable, or on the heap. The structure contents 13 | * are initialized with mshabal_init(). Once the structure has been 14 | * initialized, data is input as chunks, with the mshabal() functions. 15 | * Chunks for the four parallel instances are provided simultaneously 16 | * and must have the same length. It is allowed not to use some of the 17 | * instances; the corresponding parameters in mshabal() are then NULL. 18 | * However, using NULL as a chunk for one of the instances effectively 19 | * deactivates that instance; this cannot be used to "skip" a chunk 20 | * for one instance. 21 | * 22 | * The computation is finalized with mshabal_close(). Some extra message 23 | * bits (0 to 7) can be input. The outputs of the four parallel instances 24 | * are written in the provided buffers. There again, NULL can be 25 | * provided as parameter is the output of one of the instances is not 26 | * needed. 27 | * 28 | * A mshabal_context instance is self-contained and holds no pointer. 29 | * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as 30 | * proper alignment is maintained). This implementation uses no state 31 | * variable beyond the context instance; this, it is thread-safe and 32 | * reentrant. 33 | * 34 | * The Shabal specification defines Shabal with output sizes of 192, 35 | * 224, 256, 384 and 512 bits. This code accepts all those sizes, as 36 | * well as any output size which is multiple of 32, between 32 and 37 | * 512 (inclusive). 38 | * 39 | * Parameters are not validated. Thus, undefined behaviour occurs if 40 | * any of the "shall" or "must" clauses in this documentation is 41 | * violated. 42 | * 43 | * 44 | * (c) 2010 SAPHIR project. This software is provided 'as-is', without 45 | * any epxress or implied warranty. In no event will the authors be held 46 | * liable for any damages arising from the use of this software. 47 | * 48 | * Permission is granted to anyone to use this software for any purpose, 49 | * including commercial applications, and to alter it and redistribute it 50 | * freely, subject to no restriction. 51 | * 52 | * Technical remarks and questions can be addressed to: 53 | * 54 | */ 55 | 56 | #ifndef MSHABAL_H__ 57 | #define MSHABAL_H__ 58 | 59 | #include 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * We need an integer type with width 32-bit or more (preferably, with 67 | * a width of exactly 32 bits). 68 | */ 69 | #if defined __STDC__ && __STDC_VERSION__ >= 199901L 70 | #include 71 | #ifdef UINT32_MAX 72 | typedef uint32_t mshabal_u32; 73 | #else 74 | typedef uint_fast32_t mshabal_u32; 75 | #endif 76 | #else 77 | #if ((UINT_MAX >> 11) >> 11) >= 0x3FF 78 | typedef unsigned int mshabal_u32; 79 | #else 80 | typedef unsigned long mshabal_u32; 81 | #endif 82 | #endif 83 | 84 | #define MSHABAL512_VECTOR_SIZE 16 85 | 86 | /* 87 | * The context structure for a Shabal computation. Contents are 88 | * private. Such a structure should be allocated and released by 89 | * the caller, in any memory area. 90 | */ 91 | typedef struct { 92 | unsigned char buf0[64]; 93 | unsigned char buf1[64]; 94 | unsigned char buf2[64]; 95 | unsigned char buf3[64]; 96 | unsigned char buf4[64]; 97 | unsigned char buf5[64]; 98 | unsigned char buf6[64]; 99 | unsigned char buf7[64]; 100 | unsigned char buf8[64]; 101 | unsigned char buf9[64]; 102 | unsigned char buf10[64]; 103 | unsigned char buf11[64]; 104 | unsigned char buf12[64]; 105 | unsigned char buf13[64]; 106 | unsigned char buf14[64]; 107 | unsigned char buf15[64]; 108 | size_t ptr; 109 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; 110 | mshabal_u32 Whigh, Wlow; 111 | unsigned out_size; 112 | } mshabal512_context; 113 | 114 | #pragma pack(1) 115 | typedef struct { 116 | mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; 117 | mshabal_u32 Whigh, Wlow; 118 | unsigned out_size; 119 | } mshabal512_context_fast; 120 | #pragma pack() 121 | 122 | /* 123 | * Initialize a context structure. The output size must be a multiple 124 | * of 32, between 32 and 512 (inclusive). The output size is expressed 125 | * in bits. 126 | */ 127 | void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size); 128 | 129 | /* 130 | * Process some more data bytes; four chunks of data, pointed to by 131 | * data0, data1, data2 and data3, are processed. The four chunks have 132 | * the same length of "len" bytes. For efficiency, it is best if data is 133 | * processed by medium-sized chunks, e.g. a few kilobytes at a time. 134 | * 135 | * The "len" data bytes shall all be accessible. If "len" is zero, this 136 | * this function does nothing and ignores the data* arguments. 137 | * Otherwise, if one of the data* argument is NULL, then the 138 | * corresponding instance is deactivated (the final value obtained from 139 | * that instance is undefined). 140 | */ 141 | void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, 142 | const void *data4, const void *data5, const void *data6, const void *data7, const void *data8, const void *data9, 143 | const void *data10, const void *data11, const void *data12, const void *data13, const void *data14, 144 | const void *data15, size_t len); 145 | 146 | /* 147 | * Terminate the Shabal computation incarnated by the provided context 148 | * structure. "n" shall be a value between 0 and 7 (inclusive): this is 149 | * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and 150 | * append at the end of the input message for each of the four parallel 151 | * instances. Bits in "ub*" are taken in big-endian format: first bit is 152 | * the one of numerical value 128, second bit has numerical value 64, 153 | * and so on. Other bits in "ub*" are ignored. For most applications, 154 | * input messages will consist in sequence of bytes, and the "ub*" and 155 | * "n" parameters will be zero. 156 | * 157 | * The Shabal output for each of the parallel instances is written out 158 | * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. 159 | * These areas shall be wide enough to accomodate the result (result 160 | * size was specified as parameter to mshabal_init()). It is acceptable 161 | * to use NULL for any of those pointers, if the result from the 162 | * corresponding instance is not needed. 163 | * 164 | * After this call, the context structure is invalid. The caller shall 165 | * release it, or reinitialize it with mshabal_init(). The mshabal_close() 166 | * function does NOT imply a hidden call to mshabal_init(). 167 | */ 168 | void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, 169 | unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, 170 | unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12, 171 | unsigned ub13, unsigned ub14, unsigned ub15, unsigned n, void *dst0, 172 | void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, 173 | void *dst7, void *dst8, void *dst9, void *dst10, void *dst11, 174 | void *dst12, void *dst13, void *dst14, void *dst15); 175 | 176 | /* 177 | * optimised Shabal routine for PoC plotting and hashing 178 | */ 179 | void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, 180 | void *dst, unsigned len); 181 | 182 | /* 183 | * optimised Shabal routine for PoC mining 184 | */ 185 | void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0, 186 | void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, 187 | void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, 188 | void *dst11, void *dst12, void *dst13, void *dst14, 189 | void *dst15); 190 | 191 | #ifdef __cplusplus 192 | } 193 | #endif 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate clap; 3 | #[macro_use] 4 | extern crate cfg_if; 5 | 6 | mod cpu_hasher; 7 | #[cfg(feature = "opencl")] 8 | mod gpu_hasher; 9 | #[cfg(feature = "opencl")] 10 | mod ocl; 11 | mod plotter; 12 | mod poc_hashing; 13 | mod scheduler; 14 | mod shabal256; 15 | mod utils; 16 | mod writer; 17 | mod buffer; 18 | 19 | use crate::plotter::{Plotter, PlotterTask}; 20 | use crate::utils::set_low_prio; 21 | use clap::AppSettings::{ArgRequiredElseHelp, DeriveDisplayOrder, VersionlessSubcommands}; 22 | #[cfg(feature = "opencl")] 23 | use clap::ArgGroup; 24 | use clap::{App, Arg}; 25 | use std::cmp::min; 26 | 27 | fn main() { 28 | let arg = App::new("Engraver") 29 | .version(crate_version!()) 30 | .author(crate_authors!()) 31 | .about(crate_description!()) 32 | /* 33 | .setting(SubcommandRequiredElseHelp) 34 | */ 35 | .setting(ArgRequiredElseHelp) 36 | .setting(DeriveDisplayOrder) 37 | .setting(VersionlessSubcommands) 38 | .arg( 39 | Arg::with_name("disable direct i/o") 40 | .short("d") 41 | .long("ddio") 42 | .help("Disables direct i/o") 43 | .global(true), 44 | ).arg( 45 | Arg::with_name("disable async i/o") 46 | .short("a") 47 | .long("daio") 48 | .help("Disables async writing (single RAM buffer mode)") 49 | .global(true), 50 | ).arg( 51 | Arg::with_name("low priority") 52 | .short("l") 53 | .long("prio") 54 | .help("Runs engraver with low priority") 55 | .global(true), 56 | ).arg( 57 | Arg::with_name("non-verbosity") 58 | .short("q") 59 | .long("quiet") 60 | .help("Runs engraver in non-verbose mode") 61 | .global(true), 62 | ).arg( 63 | Arg::with_name("benchmark") 64 | .short("b") 65 | .long("bench") 66 | .help("Runs engraver in xPU benchmark mode") 67 | .global(true), 68 | ) 69 | /* 70 | .subcommand( 71 | SubCommand::with_name("plot") 72 | .about("Plots a PoC2 file for your account ID") 73 | .setting(ArgRequiredElseHelp) 74 | .setting(DeriveDisplayOrder) 75 | */.arg( 76 | Arg::with_name("numeric id") 77 | .short("i") 78 | .long("id") 79 | .value_name("numeric_ID") 80 | .help("your numeric Account ID") 81 | .takes_value(true) 82 | .required_unless("ocl-devices"), 83 | ).arg( 84 | Arg::with_name("start nonce") 85 | .short("s") 86 | .long("sn") 87 | .value_name("start_nonce") 88 | .help("where you want to start plotting") 89 | .takes_value(true) 90 | .required_unless("ocl-devices"), 91 | ).arg( 92 | Arg::with_name("nonces") 93 | .short("n") 94 | .long("n") 95 | .value_name("nonces") 96 | .help("how many nonces you want to plot") 97 | .takes_value(true) 98 | .required_unless("ocl-devices"), 99 | ).arg( 100 | Arg::with_name("path") 101 | .short("p") 102 | .long("path") 103 | .value_name("path") 104 | .help("target path for plotfile (optional)") 105 | .takes_value(true) 106 | .required(false), 107 | ).arg( 108 | Arg::with_name("memory") 109 | .short("m") 110 | .long("mem") 111 | .value_name("memory") 112 | .help("maximum memory usage (optional)") 113 | .takes_value(true) 114 | .required(false), 115 | ).args(&[ 116 | Arg::with_name("cpu") 117 | .short("c") 118 | .long("cpu") 119 | .value_name("threads") 120 | .help("maximum cpu threads you want to use (optional)") 121 | .required(false) 122 | .takes_value(true), 123 | #[cfg(feature = "opencl")] 124 | Arg::with_name("gpu") 125 | .short("g") 126 | .long("gpu") 127 | .value_name("platform_id:device_id:cores") 128 | .help("GPU(s) you want to use for plotting (optional)") 129 | .multiple(true) 130 | .takes_value(true), 131 | ]).groups(&[#[cfg(feature = "opencl")] 132 | ArgGroup::with_name("processing") 133 | .args(&["cpu", "gpu"]) 134 | .multiple(true)]) 135 | /* 136 | .arg( 137 | Arg::with_name("ssd buffer") 138 | .short("b") 139 | .long("ssd_cache") 140 | .value_name("ssd_cache") 141 | .help("*path to ssd cache for staging (optional)") 142 | .takes_value(true) 143 | .required(false), 144 | 145 | ), 146 | 147 | ).subcommand( 148 | SubCommand::with_name("encode") 149 | .about("*Individualizes a PoC3 reference file for your account ID") 150 | .display_order(2) 151 | .arg( 152 | Arg::with_name("numeric id") 153 | .short("i") 154 | .long("numeric_ID") 155 | .value_name("numeric ID") 156 | .help("numeric Account ID") 157 | .takes_value(true), 158 | ), 159 | ).subcommand( 160 | SubCommand::with_name("decode") 161 | .about("*Restores a PoC3 reference file from an individualized file") 162 | .display_order(3) 163 | .arg( 164 | Arg::with_name("numeric id") 165 | .short("i") 166 | .long("numeric_ID") 167 | .value_name("numeric ID") 168 | .help("numeric Account ID") 169 | .takes_value(true) 170 | .required(true), 171 | ), 172 | 173 | )*/; 174 | 175 | #[cfg(feature = "opencl")] 176 | let arg = arg 177 | .arg( 178 | Arg::with_name("ocl-devices") 179 | .short("o") 180 | .long("opencl") 181 | .help("Display OpenCL platforms and devices") 182 | .global(true), 183 | ) 184 | .arg( 185 | Arg::with_name("zero-copy") 186 | .short("z") 187 | .long("zcb") 188 | .help("Enables zero copy buffers for shared mem (integrated) gpus") 189 | .global(true), 190 | ); 191 | let matches = &arg.get_matches(); 192 | 193 | if matches.is_present("low priority") { 194 | set_low_prio(); 195 | } 196 | 197 | if matches.is_present("ocl-devices") { 198 | #[cfg(feature = "opencl")] 199 | ocl::platform_info(); 200 | return; 201 | } 202 | 203 | // plotting 204 | /* subcommand 205 | if let Some(matches) = matches.subcommand_matches("plot") { 206 | */ 207 | let numeric_id = value_t!(matches, "numeric id", u64).unwrap_or_else(|e| e.exit()); 208 | let start_nonce = value_t!(matches, "start nonce", u64).unwrap_or_else(|e| e.exit()); 209 | let nonces = value_t!(matches, "nonces", u64).unwrap_or_else(|e| e.exit()); 210 | let output_path = value_t!(matches, "path", String).unwrap_or_else(|_| { 211 | std::env::current_dir() 212 | .unwrap() 213 | .into_os_string() 214 | .into_string() 215 | .unwrap() 216 | }); 217 | let mem = value_t!(matches, "memory", String).unwrap_or_else(|_| "0B".to_owned()); 218 | let cpu_threads = value_t!(matches, "cpu", u8).unwrap_or(0u8); 219 | 220 | let gpus = if matches.occurrences_of("gpu") > 0 { 221 | let gpu = values_t!(matches, "gpu", String); 222 | Some(gpu.unwrap()) 223 | } else { 224 | None 225 | }; 226 | 227 | // work out number of cpu threads to use 228 | let cores = sys_info::cpu_num().unwrap() as u8; 229 | let cpu_threads = if cpu_threads == 0 { 230 | cores 231 | } else { 232 | min(2 * cores, cpu_threads) 233 | }; 234 | 235 | // special case: dont use cpu if only a gpu is defined 236 | #[cfg(feature = "opencl")] 237 | let cpu_threads = if matches.occurrences_of("gpu") > 0 && matches.occurrences_of("cpu") == 0 { 238 | 0u8 239 | } else { 240 | cpu_threads 241 | }; 242 | 243 | let p = Plotter::new(); 244 | p.run(PlotterTask { 245 | numeric_id, 246 | start_nonce, 247 | nonces, 248 | output_path, 249 | mem, 250 | cpu_threads, 251 | gpus, 252 | direct_io: !matches.is_present("disable direct i/o"), 253 | async_io: !matches.is_present("disable async i/o"), 254 | quiet: matches.is_present("non-verbosity"), 255 | benchmark: matches.is_present("benchmark"), 256 | zcb: matches.is_present("zero-copy"), 257 | }); 258 | } 259 | -------------------------------------------------------------------------------- /src/c/noncegen_128_avx.c: -------------------------------------------------------------------------------- 1 | #include "noncegen_128_avx.h" 2 | #include 3 | #include 4 | #include "common.h" 5 | #include "mshabal_128_avx.h" 6 | #include "sph_shabal.h" 7 | 8 | sph_shabal_context global_32; 9 | mshabal128_context global_128; 10 | mshabal128_context_fast global_128_fast; 11 | 12 | void init_shabal_avx() { 13 | sph_shabal256_init(&global_32); 14 | mshabal_init_avx(&global_128, 256); 15 | global_128_fast.out_size = global_128.out_size; 16 | for (int i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; 17 | global_128_fast.Whigh = global_128.Whigh; 18 | global_128_fast.Wlow = global_128.Wlow; 19 | } 20 | 21 | // cache: cache to save to 22 | // local_num: thread number 23 | // numeric_id: numeric account id 24 | // loc_startnonce nonce to start generation at 25 | // local_nonces: number of nonces to generate 26 | void noncegen_avx(char *cache, const size_t cache_size, const size_t chunk_offset, 27 | const uint64_t numeric_id, const uint64_t local_startnonce, 28 | const uint64_t local_nonces) { 29 | sph_shabal_context local_32; 30 | uint64_t nonce; 31 | size_t len; 32 | 33 | mshabal128_context_fast local_128_fast; 34 | uint64_t nonce1, nonce2, nonce3, nonce4; 35 | 36 | char seed[32]; // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero 37 | char term[32]; // 1bit 1, 255bit of zeros 38 | char zero[32]; // 256bit of zeros 39 | 40 | write_seed(seed, numeric_id); 41 | write_term(term); 42 | memset(&zero[0], 0, 32); 43 | 44 | //vars shared 45 | uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * NONCE_SIZE); 46 | uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * HASH_SIZE); 47 | 48 | // prepare smart SIMD aligned termination strings 49 | // creation could further be optimized, but not much in it as it only runs once per work package 50 | // creation could also be moved to plotter start 51 | union { 52 | mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; 53 | __m128i data[16]; 54 | } t1, t2, t3; 55 | 56 | for (int j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { 57 | size_t o = j; 58 | // t1 59 | t1.words[j + 0] = *(mshabal_u32 *)(seed + o); 60 | t1.words[j + 1] = *(mshabal_u32 *)(seed + o); 61 | t1.words[j + 2] = *(mshabal_u32 *)(seed + o); 62 | t1.words[j + 3] = *(mshabal_u32 *)(seed + o); 63 | t1.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o); 64 | t1.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o); 65 | t1.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o); 66 | t1.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o); 67 | // t2 68 | // (first 256bit skipped, will later be filled with data) 69 | t2.words[j + 0 + 32] = *(mshabal_u32 *)(seed + o); 70 | t2.words[j + 1 + 32] = *(mshabal_u32 *)(seed + o); 71 | t2.words[j + 2 + 32] = *(mshabal_u32 *)(seed + o); 72 | t2.words[j + 3 + 32] = *(mshabal_u32 *)(seed + o); 73 | // t3 74 | t3.words[j + 0] = *(mshabal_u32 *)(term + o); 75 | t3.words[j + 1] = *(mshabal_u32 *)(term + o); 76 | t3.words[j + 2] = *(mshabal_u32 *)(term + o); 77 | t3.words[j + 3] = *(mshabal_u32 *)(term + o); 78 | t3.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o); 79 | t3.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o); 80 | t3.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o); 81 | t3.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o); 82 | } 83 | 84 | for (uint64_t n = 0; n < local_nonces;) { 85 | // iterate nonces (4 per cycle - avx) 86 | // min 4 nonces left for avx processing, otherwise SISD 87 | if (n + 4 <= local_nonces) { 88 | // generate nonce numbers & change endianness 89 | nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0)); 90 | nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1)); 91 | nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2)); 92 | nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3)); 93 | 94 | // store nonce numbers in relevant termination strings 95 | for (int j = 8; j < 16; j += MSHABAL128_VECTOR_SIZE) { 96 | size_t o = j - 8; 97 | // t1 98 | t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o); 99 | t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o); 100 | t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o); 101 | t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o); 102 | t2.words[j + 0 + 32] = *(mshabal_u32 *)((char *)&nonce1 + o); 103 | t2.words[j + 1 + 32] = *(mshabal_u32 *)((char *)&nonce2 + o); 104 | t2.words[j + 2 + 32] = *(mshabal_u32 *)((char *)&nonce3 + o); 105 | t2.words[j + 3 + 32] = *(mshabal_u32 *)((char *)&nonce4 + o); 106 | } 107 | 108 | // start shabal rounds 109 | 110 | // 3 cases: first 128 rounds uses case 1 or 2, after that case 3 111 | // case 1: first 128 rounds, hashes are even: use termination string 1 112 | // case 2: first 128 rounds, hashes are odd: use termination string 2 113 | // case 3: round > 128: use termination string 3 114 | // round 1 115 | memcpy(&local_128_fast, &global_128_fast, 116 | sizeof(global_128_fast)); // fast initialize shabal 117 | 118 | mshabal_hash_fast_avx( 119 | &local_128_fast, NULL, &t1, 120 | &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6); 121 | 122 | // store first hash into smart termination string 2 (data is vectored and SIMD aligned) 123 | memcpy(&t2, &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 124 | MSHABAL128_VECTOR_SIZE * (HASH_SIZE)); 125 | 126 | // round 2 - 128 127 | for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) { 128 | // check if msg can be divided into 512bit packages without a 129 | // remainder 130 | if (i % 64 == 0) { 131 | // last msg = seed + termination 132 | mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], 133 | &t1, 134 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 135 | (NONCE_SIZE + 16 - i) >> 6); 136 | } else { 137 | // last msg = 256 bit data + seed + termination 138 | mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], 139 | &t2, 140 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 141 | (NONCE_SIZE + 16 - i) >> 6); 142 | } 143 | } 144 | 145 | // round 128-8192 146 | for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) { 147 | mshabal_hash_fast_avx(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], &t3, 148 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 149 | (HASH_CAP) >> 6); 150 | } 151 | 152 | // generate final hash 153 | mshabal_hash_fast_avx(&local_128_fast, &buffer[0], &t1, &final[0], 154 | (NONCE_SIZE + 16) >> 6); 155 | 156 | // XOR using SIMD 157 | // load final hash 158 | __m128i F[8]; 159 | for (int j = 0; j < 8; j++) F[j] = _mm_loadu_si128((__m128i *)final + j); 160 | // xor all hashes with final hash 161 | for (int j = 0; j < 8 * 2 * HASH_CAP; j++) 162 | _mm_storeu_si128( 163 | (__m128i *)buffer + j, 164 | _mm_xor_si128(_mm_loadu_si128((__m128i *)buffer + j), F[j % 8])); 165 | 166 | // todo: fork SIMD aligned plot file here 167 | // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache 168 | 169 | for (int i = 0; i < NUM_SCOOPS * 2; i++) { 170 | for (int j = 0; j < 32; j += 4) { 171 | for (int k = 0; k < MSHABAL128_VECTOR_SIZE; k += 1) { 172 | memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) * 173 | SCOOP_SIZE * cache_size + 174 | (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j], 175 | &buffer[(i * 32 + j) * MSHABAL128_VECTOR_SIZE + k * 4], 4); 176 | } 177 | } 178 | } 179 | 180 | n += 4; 181 | } else { 182 | // if less than 8 nonces left, use 1d-shabal 183 | int8_t *xv = (int8_t *)&numeric_id; 184 | 185 | for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i]; 186 | 187 | nonce = local_startnonce + n; 188 | xv = (int8_t *)&nonce; 189 | 190 | for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i]; 191 | 192 | for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) { 193 | memcpy(&local_32, &global_32, sizeof(global_32)); 194 | ; 195 | if (i < NONCE_SIZE + 16 - HASH_CAP) 196 | len = HASH_CAP; 197 | else 198 | len = NONCE_SIZE + 16 - i; 199 | 200 | sph_shabal256(&local_32, &buffer[i], len); 201 | sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]); 202 | } 203 | 204 | memcpy(&local_32, &global_32, sizeof(global_32)); 205 | sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE); 206 | sph_shabal256_close(&local_32, final); 207 | 208 | // XOR with final 209 | for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]); 210 | 211 | // Sort them PoC2: 212 | for (size_t i = 0; i < HASH_CAP; i++){ 213 | memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE); 214 | memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE); 215 | } 216 | n++; 217 | } 218 | } 219 | free(buffer); 220 | free(final); 221 | } 222 | -------------------------------------------------------------------------------- /src/c/noncegen_128_sse2.c: -------------------------------------------------------------------------------- 1 | #include "noncegen_128_avx.h" 2 | #include 3 | #include 4 | #include "common.h" 5 | #include "mshabal_128_sse2.h" 6 | #include "sph_shabal.h" 7 | 8 | sph_shabal_context global_32; 9 | mshabal128_context global_128; 10 | mshabal128_context_fast global_128_fast; 11 | 12 | void init_shabal_sse2() { 13 | sph_shabal256_init(&global_32); 14 | mshabal_init_sse2(&global_128, 256); 15 | global_128_fast.out_size = global_128.out_size; 16 | for (int i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; 17 | global_128_fast.Whigh = global_128.Whigh; 18 | global_128_fast.Wlow = global_128.Wlow; 19 | } 20 | 21 | // cache: cache to save to 22 | // local_num: thread number 23 | // numeric_id: numeric account id 24 | // loc_startnonce nonce to start generation at 25 | // local_nonces: number of nonces to generate 26 | void noncegen_sse2(char *cache, const size_t cache_size, const size_t chunk_offset, 27 | const uint64_t numeric_id, const uint64_t local_startnonce, 28 | const uint64_t local_nonces) { 29 | sph_shabal_context local_32; 30 | uint64_t nonce; 31 | size_t len; 32 | 33 | mshabal128_context_fast local_128_fast; 34 | uint64_t nonce1, nonce2, nonce3, nonce4; 35 | 36 | char seed[32]; // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero 37 | char term[32]; // 1bit 1, 255bit of zeros 38 | char zero[32]; // 256bit of zeros 39 | 40 | write_seed(seed, numeric_id); 41 | write_term(term); 42 | memset(&zero[0], 0, 32); 43 | 44 | //vars shared 45 | uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * NONCE_SIZE); 46 | uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL128_VECTOR_SIZE * HASH_SIZE); 47 | 48 | // prepare smart SIMD aligned termination strings 49 | // creation could further be optimized, but not much in it as it only runs once per work package 50 | // creation could also be moved to plotter start 51 | union { 52 | mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; 53 | __m128i data[16]; 54 | } t1, t2, t3; 55 | 56 | for (int j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { 57 | size_t o = j; 58 | // t1 59 | t1.words[j + 0] = *(mshabal_u32 *)(seed + o); 60 | t1.words[j + 1] = *(mshabal_u32 *)(seed + o); 61 | t1.words[j + 2] = *(mshabal_u32 *)(seed + o); 62 | t1.words[j + 3] = *(mshabal_u32 *)(seed + o); 63 | t1.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o); 64 | t1.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o); 65 | t1.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o); 66 | t1.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o); 67 | // t2 68 | // (first 256bit skipped, will later be filled with data) 69 | t2.words[j + 0 + 32] = *(mshabal_u32 *)(seed + o); 70 | t2.words[j + 1 + 32] = *(mshabal_u32 *)(seed + o); 71 | t2.words[j + 2 + 32] = *(mshabal_u32 *)(seed + o); 72 | t2.words[j + 3 + 32] = *(mshabal_u32 *)(seed + o); 73 | // t3 74 | t3.words[j + 0] = *(mshabal_u32 *)(term + o); 75 | t3.words[j + 1] = *(mshabal_u32 *)(term + o); 76 | t3.words[j + 2] = *(mshabal_u32 *)(term + o); 77 | t3.words[j + 3] = *(mshabal_u32 *)(term + o); 78 | t3.words[j + 0 + 32] = *(mshabal_u32 *)(zero + o); 79 | t3.words[j + 1 + 32] = *(mshabal_u32 *)(zero + o); 80 | t3.words[j + 2 + 32] = *(mshabal_u32 *)(zero + o); 81 | t3.words[j + 3 + 32] = *(mshabal_u32 *)(zero + o); 82 | } 83 | 84 | for (uint64_t n = 0; n < local_nonces;) { 85 | // iterate nonces (4 per cycle - sse) 86 | // min 4 nonces left for sse processing, otherwise SISD 87 | if (n + 4 <= local_nonces) { 88 | // generate nonce numbers & change endianness 89 | nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0)); 90 | nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1)); 91 | nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2)); 92 | nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3)); 93 | 94 | // store nonce numbers in relevant termination strings 95 | for (int j = 8; j < 16; j += MSHABAL128_VECTOR_SIZE) { 96 | size_t o = j - 8; 97 | // t1 98 | t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o); 99 | t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o); 100 | t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o); 101 | t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o); 102 | t2.words[j + 0 + 32] = *(mshabal_u32 *)((char *)&nonce1 + o); 103 | t2.words[j + 1 + 32] = *(mshabal_u32 *)((char *)&nonce2 + o); 104 | t2.words[j + 2 + 32] = *(mshabal_u32 *)((char *)&nonce3 + o); 105 | t2.words[j + 3 + 32] = *(mshabal_u32 *)((char *)&nonce4 + o); 106 | } 107 | 108 | // start shabal rounds 109 | 110 | // 3 cases: first 128 rounds uses case 1 or 2, after that case 3 111 | // case 1: first 128 rounds, hashes are even: use termination string 1 112 | // case 2: first 128 rounds, hashes are odd: use termination string 2 113 | // case 3: round > 128: use termination string 3 114 | // round 1 115 | memcpy(&local_128_fast, &global_128_fast, 116 | sizeof(global_128_fast)); // fast initialize shabal 117 | 118 | mshabal_hash_fast_sse2( 119 | &local_128_fast, NULL, &t1, 120 | &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6); 121 | 122 | // store first hash into smart termination string 2 (data is vectored and SIMD aligned) 123 | memcpy(&t2, &buffer[MSHABAL128_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 124 | MSHABAL128_VECTOR_SIZE * (HASH_SIZE)); 125 | 126 | // round 2 - 128 127 | for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) { 128 | // check if msg can be divided into 512bit packages without a 129 | // remainder 130 | if (i % 64 == 0) { 131 | // last msg = seed + termination 132 | mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], 133 | &t1, 134 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 135 | (NONCE_SIZE + 16 - i) >> 6); 136 | } else { 137 | // last msg = 256 bit data + seed + termination 138 | mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], 139 | &t2, 140 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 141 | (NONCE_SIZE + 16 - i) >> 6); 142 | } 143 | } 144 | 145 | // round 128-8192 146 | for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) { 147 | mshabal_hash_fast_sse2(&local_128_fast, &buffer[i * MSHABAL128_VECTOR_SIZE], &t3, 148 | &buffer[(i - HASH_SIZE) * MSHABAL128_VECTOR_SIZE], 149 | (HASH_CAP) >> 6); 150 | } 151 | 152 | // generate final hash 153 | mshabal_hash_fast_sse2(&local_128_fast, &buffer[0], &t1, &final[0], 154 | (NONCE_SIZE + 16) >> 6); 155 | 156 | // XOR using SIMD 157 | // load final hash 158 | __m128i F[8]; 159 | for (int j = 0; j < 8; j++) F[j] = _mm_loadu_si128((__m128i *)final + j); 160 | // xor all hashes with final hash 161 | for (int j = 0; j < 8 * 2 * HASH_CAP; j++) 162 | _mm_storeu_si128( 163 | (__m128i *)buffer + j, 164 | _mm_xor_si128(_mm_loadu_si128((__m128i *)buffer + j), F[j % 8])); 165 | 166 | // todo: fork SIMD aligned plot file here 167 | // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache 168 | 169 | for (int i = 0; i < NUM_SCOOPS * 2; i++) { 170 | for (int j = 0; j < 32; j += 4) { 171 | for (int k = 0; k < MSHABAL128_VECTOR_SIZE; k += 1) { 172 | memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) * 173 | SCOOP_SIZE * cache_size + 174 | (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j], 175 | &buffer[(i * 32 + j) * MSHABAL128_VECTOR_SIZE + k * 4], 4); 176 | } 177 | } 178 | } 179 | 180 | n += 4; 181 | } else { 182 | // if less than 8 nonces left, use 1d-shabal 183 | int8_t *xv = (int8_t *)&numeric_id; 184 | 185 | for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i]; 186 | 187 | nonce = local_startnonce + n; 188 | xv = (int8_t *)&nonce; 189 | 190 | for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i]; 191 | 192 | for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) { 193 | memcpy(&local_32, &global_32, sizeof(global_32)); 194 | ; 195 | if (i < NONCE_SIZE + 16 - HASH_CAP) 196 | len = HASH_CAP; 197 | else 198 | len = NONCE_SIZE + 16 - i; 199 | 200 | sph_shabal256(&local_32, &buffer[i], len); 201 | sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]); 202 | } 203 | 204 | memcpy(&local_32, &global_32, sizeof(global_32)); 205 | sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE); 206 | sph_shabal256_close(&local_32, final); 207 | 208 | // XOR with final 209 | for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]); 210 | 211 | // Sort them PoC2: 212 | for (size_t i = 0; i < HASH_CAP; i++){ 213 | memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE); 214 | memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE); 215 | } 216 | n++; 217 | } 218 | } 219 | free(buffer); 220 | free(final); 221 | } 222 | -------------------------------------------------------------------------------- /src/scheduler.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_hasher::{SimdExtension, hash_cpu, CpuTask, SafePointer}; 2 | use crate::buffer::PageAlignedByteBuffer; 3 | #[cfg(feature = "opencl")] 4 | use crate::gpu_hasher::{create_gpu_hasher_thread, GpuTask}; 5 | #[cfg(feature = "opencl")] 6 | use crate::ocl::gpu_init; 7 | use crate::plotter::{PlotterTask, NONCE_SIZE}; 8 | #[cfg(feature = "opencl")] 9 | use crossbeam_channel::unbounded; 10 | use crossbeam_channel::{Receiver, Sender}; 11 | use std::cmp::min; 12 | use std::sync::mpsc::channel; 13 | use std::sync::Arc; 14 | #[cfg(feature = "opencl")] 15 | use std::thread; 16 | 17 | const CPU_TASK_SIZE: u64 = 64; 18 | 19 | pub fn create_scheduler_thread( 20 | task: Arc, 21 | thread_pool: rayon::ThreadPool, 22 | mut nonces_hashed: u64, 23 | mut pb: Option>, 24 | rx_empty_buffers: Receiver, 25 | tx_buffers_to_writer: Sender, 26 | simd_ext: SimdExtension, 27 | ) -> impl FnOnce() { 28 | move || { 29 | // synchronisation chanel for all hashing devices (CPU+GPU) 30 | // message protocol: (hash_device_id: u8, message: u8, nonces processed: u64) 31 | // hash_device_id: 0=CPU, 1=GPU0, 2=GPU1... 32 | // message: 0 = data ready to write 33 | // 1 = device ready to compute next hashing batch 34 | // nonces_processed: nonces hashed / nonces writen to host buffer 35 | let (tx, rx) = channel(); 36 | 37 | // create gpu threads and channels 38 | #[cfg(feature = "opencl")] 39 | let gpu_contexts = match &task.gpus { 40 | Some(x) => Some(gpu_init(&x, task.zcb)), 41 | None => None, 42 | }; 43 | 44 | #[cfg(feature = "opencl")] 45 | let gpus = match gpu_contexts { 46 | Some(x) => x, 47 | None => Vec::new(), 48 | }; 49 | #[cfg(feature = "opencl")] 50 | let mut gpu_threads = Vec::new(); 51 | #[cfg(feature = "opencl")] 52 | let mut gpu_channels = Vec::new(); 53 | 54 | #[cfg(feature = "opencl")] 55 | for (i, gpu) in gpus.iter().enumerate() { 56 | gpu_channels.push(unbounded()); 57 | gpu_threads.push(thread::spawn({ 58 | create_gpu_hasher_thread( 59 | (i + 1) as u8, 60 | gpu.clone(), 61 | tx.clone(), 62 | gpu_channels.last().unwrap().1.clone(), 63 | ) 64 | })); 65 | } 66 | 67 | for buffer in rx_empty_buffers { 68 | let mut_bs = &buffer.get_buffer(); 69 | let mut bs = mut_bs.lock().unwrap(); 70 | let buffer_size = (*bs).len() as u64; 71 | let nonces_to_hash = min(buffer_size / NONCE_SIZE, task.nonces - nonces_hashed); 72 | 73 | let mut requested = 0u64; 74 | let mut processed = 0u64; 75 | 76 | // kickoff first gpu and cpu runs 77 | #[cfg(feature = "opencl")] 78 | for (i, gpu) in gpus.iter().enumerate() { 79 | // schedule next gpu task 80 | let gpu = gpu.lock().unwrap(); 81 | let task_size = min(gpu.worksize as u64, nonces_to_hash - requested); 82 | if task_size > 0 { 83 | gpu_channels[i] 84 | .0 85 | .send(Some(GpuTask { 86 | cache: SafePointer { 87 | ptr: bs.as_mut_ptr(), 88 | }, 89 | cache_size: buffer_size / NONCE_SIZE, 90 | chunk_offset: requested, 91 | numeric_id: task.numeric_id, 92 | local_startnonce: task.start_nonce + nonces_hashed + requested, 93 | local_nonces: task_size, 94 | })) 95 | .unwrap(); 96 | } 97 | requested += task_size; 98 | //println!("Debug: Device: {} started. {} nonces assigned. Total requested: {}\n\n\n",i+1,task_size,requested); 99 | } 100 | 101 | for _ in 0..task.cpu_threads { 102 | let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); 103 | if task_size > 0 { 104 | let task = hash_cpu( 105 | tx.clone(), 106 | CpuTask { 107 | cache: SafePointer { 108 | ptr: bs.as_mut_ptr(), 109 | }, 110 | cache_size: (buffer_size / NONCE_SIZE) as usize, 111 | chunk_offset: requested as usize, 112 | numeric_id: task.numeric_id, 113 | local_startnonce: task.start_nonce + nonces_hashed + requested, 114 | local_nonces: task_size, 115 | }, 116 | simd_ext.clone(), 117 | ); 118 | thread_pool.spawn(task); 119 | } 120 | requested += task_size; 121 | } 122 | 123 | // control loop 124 | let rx = ℞ 125 | for msg in rx { 126 | match msg.1 { 127 | // process a request for work: provide a task or signal completion 128 | 1 => { 129 | let task_size = match msg.0 { 130 | 0 => { 131 | // schedule next cpu task 132 | let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); 133 | if task_size > 0 { 134 | let task = hash_cpu( 135 | tx.clone(), 136 | CpuTask { 137 | cache: SafePointer { 138 | ptr: bs.as_mut_ptr(), 139 | }, 140 | cache_size: (buffer_size / NONCE_SIZE) as usize, 141 | chunk_offset: requested as usize, 142 | numeric_id: task.numeric_id, 143 | local_startnonce: task.start_nonce 144 | + nonces_hashed 145 | + requested, 146 | local_nonces: task_size, 147 | }, 148 | simd_ext.clone(), 149 | ); 150 | thread_pool.spawn(task); 151 | } 152 | task_size 153 | } 154 | _ => { 155 | // schedule next gpu task 156 | #[cfg(feature = "opencl")] 157 | let gpu = gpus[(msg.0 - 1) as usize].lock().unwrap(); 158 | #[cfg(feature = "opencl")] 159 | let task_size = 160 | min(gpu.worksize as u64, nonces_to_hash - requested); 161 | 162 | // optimisation: leave some work for cpu in dual mode 163 | #[cfg(feature = "opencl")] 164 | let task_size = if task_size < gpu.worksize as u64 165 | && task.cpu_threads > 0 166 | && task_size > CPU_TASK_SIZE 167 | { 168 | task_size / 2 169 | } else { 170 | task_size 171 | }; 172 | 173 | #[cfg(not(feature = "opencl"))] 174 | let task_size = 0; 175 | 176 | #[cfg(feature = "opencl")] 177 | gpu_channels[(msg.0 - 1) as usize] 178 | .0 179 | .send(Some(GpuTask { 180 | cache: SafePointer { 181 | ptr: bs.as_mut_ptr(), 182 | }, 183 | cache_size: buffer_size / NONCE_SIZE, 184 | chunk_offset: requested, 185 | numeric_id: task.numeric_id, 186 | local_startnonce: task.start_nonce 187 | + nonces_hashed 188 | + requested, 189 | local_nonces: task_size, 190 | })) 191 | .unwrap(); 192 | task_size 193 | } 194 | }; 195 | 196 | requested += task_size; 197 | //println!("Debug: Device: {} asked for work. {} nonces assigned. Total requested: {}\n\n\n",msg.0,task_size,requested); 198 | } 199 | // process work completed message 200 | 0 => { 201 | processed += msg.2; 202 | match &mut pb { 203 | Some(pb) => { 204 | pb.add(msg.2 * NONCE_SIZE); 205 | } 206 | None => (), 207 | } 208 | } 209 | _ => {} 210 | } 211 | if processed == nonces_to_hash { 212 | break; 213 | } 214 | } 215 | 216 | nonces_hashed += nonces_to_hash; 217 | 218 | // queue buffer for writing 219 | tx_buffers_to_writer.send(buffer).unwrap(); 220 | 221 | // thread end 222 | if task.nonces == nonces_hashed { 223 | match &mut pb { 224 | Some(pb) => { 225 | pb.finish_print("Hasher done."); 226 | } 227 | None => (), 228 | } 229 | // shutdown gpu threads 230 | #[cfg(feature = "opencl")] 231 | for gpu in &gpu_channels { 232 | gpu.0.send(None).unwrap(); 233 | } 234 | break; 235 | }; 236 | } 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /src/shabal256.rs: -------------------------------------------------------------------------------- 1 | use std::slice::from_raw_parts; 2 | 3 | const A_INIT: [u32; 12] = [ 4 | 0x52F84552, 0xE54B7999, 0x2D8EE3EC, 0xB9645191, 0xE0078B86, 0xBB7C44C9, 0xD2B5C1CA, 0xB0D2EB8C, 5 | 0x14CE5A45, 0x22AF50DC, 0xEFFDBC6B, 0xEB21B74A, 6 | ]; 7 | 8 | const B_INIT: [u32; 16] = [ 9 | 0xB555C6EE, 0x3E710596, 0xA72A652F, 0x9301515F, 0xDA28C1FA, 0x696FD868, 0x9CB6BF72, 0x0AFE4002, 10 | 0xA6E03615, 0x5138C1D4, 0xBE216306, 0xB38B8890, 0x3EA8B96B, 0x3299ACE4, 0x30924DD4, 0x55CB34A5, 11 | ]; 12 | 13 | const C_INIT: [u32; 16] = [ 14 | 0xB405F031, 0xC4233EBA, 0xB3733979, 0xC0DD9D55, 0xC51C28AE, 0xA327B8E1, 0x56C56167, 0xED614433, 15 | 0x88B59D60, 0x60E2CEBA, 0x758B4B8B, 0x83E82A7F, 0xBC968828, 0xE6E00BF7, 0xBA839E55, 0x9B491C60, 16 | ]; 17 | 18 | pub fn shabal256_fast(data: &[u8], term: &[u32; 16]) -> [u8; 32] { 19 | let mut a = A_INIT; 20 | let mut b = B_INIT; 21 | let mut c = C_INIT; 22 | let mut w_high = 0u32; 23 | let mut w_low = 1u32; 24 | let mut num = data.len() >> 6; 25 | let mut ptr = 0; 26 | let data_ptr = data.as_ptr() as *const u32; 27 | let data = unsafe { from_raw_parts(data_ptr, data.len() / 4) }; 28 | 29 | while num > 0 { 30 | input_block_add(&mut b, &data[ptr..]); 31 | xor_w(&mut a, w_low, w_high); 32 | apply_p(&mut a, &mut b, &c, &data[ptr..]); 33 | input_block_sub(&mut c, &data[ptr..]); 34 | swap_bc(&mut b, &mut c); 35 | incr_w(&mut w_low, &mut w_high); 36 | ptr = ptr.wrapping_add(16); 37 | num = num.wrapping_sub(1); 38 | } 39 | input_block_add(&mut b, term); 40 | xor_w(&mut a, w_low, w_high); 41 | apply_p(&mut a, &mut b, &c, term); 42 | for _ in 0..3 { 43 | swap_bc(&mut b, &mut c); 44 | xor_w(&mut a, w_low, w_high); 45 | apply_p(&mut a, &mut b, &c, term); 46 | } 47 | unsafe { *(b[8..16].as_ptr() as *const [u8; 32]) } 48 | } 49 | 50 | #[inline(always)] 51 | fn input_block_add(b: &mut [u32; 16], data: &[u32]) { 52 | for (element, data) in b.iter_mut().zip(data.iter()) { 53 | *element = element.wrapping_add(*data); 54 | } 55 | } 56 | 57 | #[inline(always)] 58 | fn input_block_sub(c: &mut [u32; 16], data: &[u32]) { 59 | for (element, data) in c.iter_mut().zip(data.iter()) { 60 | *element = element.wrapping_sub(*data); 61 | } 62 | } 63 | 64 | #[inline(always)] 65 | fn xor_w(a: &mut [u32; 12], w_low: u32, w_high: u32) { 66 | a[0] ^= w_low; 67 | a[1] ^= w_high; 68 | } 69 | 70 | #[inline(always)] 71 | fn apply_p(a: &mut [u32; 12], b: &mut [u32; 16], c: &[u32; 16], data: &[u32]) { 72 | for element in b.iter_mut() { 73 | *element = element.wrapping_shl(17) | element.wrapping_shr(15); 74 | } 75 | perm(a, b, c, data); 76 | a[0] = a[0] 77 | .wrapping_add(c[11]) 78 | .wrapping_add(c[15]) 79 | .wrapping_add(c[3]); 80 | a[1] = a[1] 81 | .wrapping_add(c[12]) 82 | .wrapping_add(c[0]) 83 | .wrapping_add(c[4]); 84 | a[2] = a[2] 85 | .wrapping_add(c[13]) 86 | .wrapping_add(c[1]) 87 | .wrapping_add(c[5]); 88 | a[3] = a[3] 89 | .wrapping_add(c[14]) 90 | .wrapping_add(c[2]) 91 | .wrapping_add(c[6]); 92 | a[4] = a[4] 93 | .wrapping_add(c[15]) 94 | .wrapping_add(c[3]) 95 | .wrapping_add(c[7]); 96 | a[5] = a[5] 97 | .wrapping_add(c[0]) 98 | .wrapping_add(c[4]) 99 | .wrapping_add(c[8]); 100 | a[6] = a[6] 101 | .wrapping_add(c[1]) 102 | .wrapping_add(c[5]) 103 | .wrapping_add(c[9]); 104 | a[7] = a[7] 105 | .wrapping_add(c[2]) 106 | .wrapping_add(c[6]) 107 | .wrapping_add(c[10]); 108 | a[8] = a[8] 109 | .wrapping_add(c[3]) 110 | .wrapping_add(c[7]) 111 | .wrapping_add(c[11]); 112 | a[9] = a[9] 113 | .wrapping_add(c[4]) 114 | .wrapping_add(c[8]) 115 | .wrapping_add(c[12]); 116 | a[10] = a[10] 117 | .wrapping_add(c[5]) 118 | .wrapping_add(c[9]) 119 | .wrapping_add(c[13]); 120 | a[11] = a[11] 121 | .wrapping_add(c[6]) 122 | .wrapping_add(c[10]) 123 | .wrapping_add(c[14]); 124 | } 125 | 126 | #[inline(always)] 127 | fn perm_elt( 128 | a: &mut [u32; 12], 129 | b: &mut [u32; 16], 130 | xa0: usize, 131 | xa1: usize, 132 | xb0: usize, 133 | xb1: usize, 134 | xb2: usize, 135 | xb3: usize, 136 | xc: u32, 137 | xm: u32, 138 | ) { 139 | unsafe { 140 | *a.get_unchecked_mut(xa0) = (a.get_unchecked(xa0) 141 | ^ ((a.get_unchecked(xa1).wrapping_shl(15u32) 142 | | a.get_unchecked(xa1).wrapping_shr(17u32)) 143 | .wrapping_mul(5u32)) 144 | ^ xc) 145 | .wrapping_mul(3u32) 146 | ^ b.get_unchecked(xb1) 147 | ^ (b.get_unchecked(xb2) & !b.get_unchecked(xb3)) 148 | ^ xm; 149 | *b.get_unchecked_mut(xb0) = !((b.get_unchecked(xb0).wrapping_shl(1) 150 | | b.get_unchecked(xb0).wrapping_shr(31)) 151 | ^ a.get_unchecked(xa0)); 152 | } 153 | } 154 | 155 | #[inline(always)] 156 | fn perm(a: &mut [u32; 12], b: &mut [u32; 16], c: &[u32; 16], data: &[u32]) { 157 | unsafe { 158 | perm_elt(a, b, 0, 11, 0, 13, 9, 6, c[8], *data.get_unchecked(0)); 159 | perm_elt(a, b, 1, 0, 1, 14, 10, 7, c[7], *data.get_unchecked(1)); 160 | perm_elt(a, b, 2, 1, 2, 15, 11, 8, c[6], *data.get_unchecked(2)); 161 | perm_elt(a, b, 3, 2, 3, 0, 12, 9, c[5], *data.get_unchecked(3)); 162 | perm_elt(a, b, 4, 3, 4, 1, 13, 10, c[4], *data.get_unchecked(4)); 163 | perm_elt(a, b, 5, 4, 5, 2, 14, 11, c[3], *data.get_unchecked(5)); 164 | perm_elt(a, b, 6, 5, 6, 3, 15, 12, c[2], *data.get_unchecked(6)); 165 | perm_elt(a, b, 7, 6, 7, 4, 0, 13, c[1], *data.get_unchecked(7)); 166 | perm_elt(a, b, 8, 7, 8, 5, 1, 14, c[0], *data.get_unchecked(8)); 167 | perm_elt(a, b, 9, 8, 9, 6, 2, 15, c[15], *data.get_unchecked(9)); 168 | perm_elt(a, b, 10, 9, 10, 7, 3, 0, c[14], *data.get_unchecked(10)); 169 | perm_elt(a, b, 11, 10, 11, 8, 4, 1, c[13], *data.get_unchecked(11)); 170 | perm_elt(a, b, 0, 11, 12, 9, 5, 2, c[12], *data.get_unchecked(12)); 171 | perm_elt(a, b, 1, 0, 13, 10, 6, 3, c[11], *data.get_unchecked(13)); 172 | perm_elt(a, b, 2, 1, 14, 11, 7, 4, c[10], *data.get_unchecked(14)); 173 | perm_elt(a, b, 3, 2, 15, 12, 8, 5, c[9], *data.get_unchecked(15)); 174 | perm_elt(a, b, 4, 3, 0, 13, 9, 6, c[8], *data.get_unchecked(0)); 175 | perm_elt(a, b, 5, 4, 1, 14, 10, 7, c[7], *data.get_unchecked(1)); 176 | perm_elt(a, b, 6, 5, 2, 15, 11, 8, c[6], *data.get_unchecked(2)); 177 | perm_elt(a, b, 7, 6, 3, 0, 12, 9, c[5], *data.get_unchecked(3)); 178 | perm_elt(a, b, 8, 7, 4, 1, 13, 10, c[4], *data.get_unchecked(4)); 179 | perm_elt(a, b, 9, 8, 5, 2, 14, 11, c[3], *data.get_unchecked(5)); 180 | perm_elt(a, b, 10, 9, 6, 3, 15, 12, c[2], *data.get_unchecked(6)); 181 | perm_elt(a, b, 11, 10, 7, 4, 0, 13, c[1], *data.get_unchecked(7)); 182 | perm_elt(a, b, 0, 11, 8, 5, 1, 14, c[0], *data.get_unchecked(8)); 183 | perm_elt(a, b, 1, 0, 9, 6, 2, 15, c[15], *data.get_unchecked(9)); 184 | perm_elt(a, b, 2, 1, 10, 7, 3, 0, c[14], *data.get_unchecked(10)); 185 | perm_elt(a, b, 3, 2, 11, 8, 4, 1, c[13], *data.get_unchecked(11)); 186 | perm_elt(a, b, 4, 3, 12, 9, 5, 2, c[12], *data.get_unchecked(12)); 187 | perm_elt(a, b, 5, 4, 13, 10, 6, 3, c[11], *data.get_unchecked(13)); 188 | perm_elt(a, b, 6, 5, 14, 11, 7, 4, c[10], *data.get_unchecked(14)); 189 | perm_elt(a, b, 7, 6, 15, 12, 8, 5, c[9], *data.get_unchecked(15)); 190 | perm_elt(a, b, 8, 7, 0, 13, 9, 6, c[8], *data.get_unchecked(0)); 191 | perm_elt(a, b, 9, 8, 1, 14, 10, 7, c[7], *data.get_unchecked(1)); 192 | perm_elt(a, b, 10, 9, 2, 15, 11, 8, c[6], *data.get_unchecked(2)); 193 | perm_elt(a, b, 11, 10, 3, 0, 12, 9, c[5], *data.get_unchecked(3)); 194 | perm_elt(a, b, 0, 11, 4, 1, 13, 10, c[4], *data.get_unchecked(4)); 195 | perm_elt(a, b, 1, 0, 5, 2, 14, 11, c[3], *data.get_unchecked(5)); 196 | perm_elt(a, b, 2, 1, 6, 3, 15, 12, c[2], *data.get_unchecked(6)); 197 | perm_elt(a, b, 3, 2, 7, 4, 0, 13, c[1], *data.get_unchecked(7)); 198 | perm_elt(a, b, 4, 3, 8, 5, 1, 14, c[0], *data.get_unchecked(8)); 199 | perm_elt(a, b, 5, 4, 9, 6, 2, 15, c[15], *data.get_unchecked(9)); 200 | perm_elt(a, b, 6, 5, 10, 7, 3, 0, c[14], *data.get_unchecked(10)); 201 | perm_elt(a, b, 7, 6, 11, 8, 4, 1, c[13], *data.get_unchecked(11)); 202 | perm_elt(a, b, 8, 7, 12, 9, 5, 2, c[12], *data.get_unchecked(12)); 203 | perm_elt(a, b, 9, 8, 13, 10, 6, 3, c[11], *data.get_unchecked(13)); 204 | perm_elt(a, b, 10, 9, 14, 11, 7, 4, c[10], *data.get_unchecked(14)); 205 | perm_elt(a, b, 11, 10, 15, 12, 8, 5, c[9], *data.get_unchecked(15)); 206 | } 207 | } 208 | 209 | #[inline(always)] 210 | fn swap_bc(b: &mut [u32; 16], c: &mut [u32; 16]) { 211 | std::mem::swap(b, c); 212 | } 213 | 214 | #[inline(always)] 215 | fn incr_w(w_low: &mut u32, w_high: &mut u32) { 216 | *w_low = w_low.wrapping_add(1); 217 | if *w_low == 0 { 218 | *w_high = w_high.wrapping_add(1); 219 | } 220 | } 221 | 222 | #[cfg(test)] 223 | mod test { 224 | use super::*; 225 | const TEST_A_RESULT: [u8; 32] = [ 226 | 0xDA, 0x8F, 0x08, 0xC0, 0x2A, 0x67, 0xBA, 0x9A, 0x56, 0xBD, 0xD0, 0x79, 0x8E, 0x48, 0xAE, 227 | 0x07, 0x14, 0x21, 0x5E, 0x09, 0x3B, 0x5B, 0x85, 0x06, 0x49, 0xA3, 0x77, 0x18, 0x99, 0x3F, 228 | 0x54, 0xA2, 229 | ]; 230 | const TEST_B_RESULT: [u8; 32] = [ 231 | 0xB4, 0x9F, 0x34, 0xBF, 0x51, 0x86, 0x4C, 0x30, 0x53, 0x3C, 0xC4, 0x6C, 0xC2, 0x54, 0x2B, 232 | 0xDE, 0xC2, 0xF9, 0x6F, 0xD0, 0x6F, 0x5C, 0x53, 0x9A, 0xFF, 0x6E, 0xAD, 0x58, 0x83, 0xF7, 233 | 0x32, 0x7A, 234 | ]; 235 | const TEST_B_M1: [u32; 16] = [ 236 | 0x64636261, 0x68676665, 0x6C6B6A69, 0x706F6E6D, 0x74737271, 0x78777675, 0x302D7A79, 237 | 0x34333231, 0x38373635, 0x42412D39, 0x46454443, 0x4A494847, 0x4E4D4C4B, 0x5251504F, 238 | 0x56555453, 0x5A595857, 239 | ]; 240 | const TEST_B_M2: [u32; 16] = [ 241 | 0x3231302D, 0x36353433, 0x2D393837, 0x64636261, 0x68676665, 0x6C6B6A69, 0x706F6E6D, 242 | 0x74737271, 0x78777675, 0x00807A79, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 243 | 0x00000000, 0x00000000, 244 | ]; 245 | 246 | #[test] 247 | fn shabal256() { 248 | // test message A 249 | let test_data = [0u8; 64]; 250 | let mut test_term = [0u32; 16]; 251 | test_term[0] = 0x80; 252 | let hash_a = shabal256_fast(&test_data, &test_term); 253 | assert_eq!(hash_a, TEST_A_RESULT); 254 | // test message B 255 | let hash_b = unsafe { 256 | shabal256_fast( 257 | &std::mem::transmute::<[u32; 16], [u8; 64]>(TEST_B_M1), 258 | &TEST_B_M2, 259 | ) 260 | }; 261 | assert_eq!(hash_b, TEST_B_RESULT); 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::fs::{File, OpenOptions}; 2 | use std::io; 3 | use std::path::Path; 4 | 5 | cfg_if! { 6 | if #[cfg(unix)] { 7 | #[cfg(linux)] 8 | extern crate thread_priority; 9 | use std::process::Command; 10 | use std::process; 11 | use std::os::unix::fs::OpenOptionsExt; 12 | use fs2::FileExt; 13 | #[cfg(linux)] 14 | use thread_priority::*; 15 | 16 | const O_DIRECT: i32 = 0o0_040_000; 17 | 18 | pub fn set_low_prio() { 19 | // todo: low prio for macos 20 | #[cfg(linux)] 21 | let thread_id = thread_native_id(); 22 | #[cfg(linux)] 23 | set_thread_priority( 24 | thread_id, 25 | ThreadPriority::Min, 26 | ThreadSchedulePolicy::Normal(NormalThreadSchedulePolicy::Normal) 27 | ).unwrap(); 28 | } 29 | 30 | pub fn open_using_direct_io>(path: P) -> io::Result { 31 | OpenOptions::new() 32 | .write(true) 33 | .create(true) 34 | .custom_flags(O_DIRECT) 35 | .open(path) 36 | } 37 | 38 | pub fn open>(path: P) -> io::Result { 39 | OpenOptions::new() 40 | .write(true) 41 | .create(true) 42 | .open(path) 43 | } 44 | 45 | pub fn open_r>(path: P) -> io::Result { 46 | OpenOptions::new() 47 | .read(true) 48 | .open(path) 49 | } 50 | // On unix, get the device id from 'df' command 51 | fn get_device_id_unix(path: &str) -> String { 52 | let output = Command::new("df") 53 | .arg(path) 54 | .output() 55 | .expect("failed to execute 'df --output=source'"); 56 | let source = String::from_utf8(output.stdout).expect("not utf8"); 57 | source.split('\n').collect::>()[1].split(' ').collect::>()[0].to_string() 58 | } 59 | 60 | // On macos, use df and 'diskutil info ' to get the Device Block Size line 61 | // and extract the size 62 | fn get_sector_size_macos(path: &str) -> u64 { 63 | let source = get_device_id_unix(path); 64 | let output = Command::new("diskutil") 65 | .arg("info") 66 | .arg(source) 67 | .output() 68 | .expect("failed to execute 'diskutil info'"); 69 | let source = String::from_utf8(output.stdout).expect("not utf8"); 70 | let mut sector_size: u64 = 0; 71 | for line in source.split('\n').collect::>() { 72 | if line.trim().starts_with("Device Block Size") { 73 | // e.g. in reverse: "Bytes 512 Size Block Device" 74 | let source = line.rsplit(' ').collect::>()[1]; 75 | 76 | sector_size = source.parse::().unwrap(); 77 | } 78 | } 79 | if sector_size == 0 { 80 | panic!("Abort: Unable to determine disk physical sector size from diskutil info") 81 | } 82 | sector_size 83 | } 84 | 85 | // On unix, use df and lsblk to extract the device sector size 86 | fn get_sector_size_unix(path: &str) -> u64 { 87 | let source = get_device_id_unix(path); 88 | let output = Command::new("lsblk") 89 | .arg(source) 90 | .arg("-o") 91 | .arg("PHY-SeC") // I'm strict here, LOG-SeC would do 92 | .output() 93 | .expect("failed to execute 'lsblk -o PHY-SeC'"); 94 | 95 | let sector_size = String::from_utf8(output.stdout).expect("not utf8"); 96 | let sector_size = sector_size.split('\n').collect::>().get(1).unwrap_or_else(|| { 97 | println!("failed to determine sector size, defaulting to 4096."); 98 | &"4096" 99 | }).trim(); 100 | 101 | sector_size.parse::().unwrap() 102 | } 103 | 104 | pub fn get_sector_size(path: &str) -> u64 { 105 | if cfg!(target_os = "macos") { 106 | get_sector_size_macos(path) 107 | } else { 108 | get_sector_size_unix(path) 109 | } 110 | } 111 | 112 | pub fn preallocate(file: &Path, size_in_bytes: u64, use_direct_io: bool) { 113 | let file = if use_direct_io { 114 | open_using_direct_io(&file) 115 | } else { 116 | open(&file) 117 | }; 118 | let file = file.unwrap(); 119 | match file.allocate(size_in_bytes) { 120 | Err(errno) => { 121 | // Exit if preallocate fails because write_resume_info() assumes 122 | // that the file isn't zero sized. 123 | println!("\n\nError: couldn't preallocate space for file. {}\n\ 124 | Probable causes are:\n \ 125 | * fallocate() is only supported on ext4 filesystems.\n \ 126 | * Insufficient space.\n", errno); 127 | process::exit(1); 128 | } 129 | Ok(_) => (), 130 | } 131 | } 132 | 133 | pub fn free_disk_space(path: &str) -> u64 { 134 | // I don't like the following code, but I had to. It's difficult to estimate the space available for a new file on ext4 due to overhead. 135 | // Therefor I enforce a 2MB cushion assuming this is sufficient. 136 | fs2::available_space(Path::new(&path)).unwrap().saturating_sub(2097152) 137 | } 138 | 139 | } else { 140 | use std::ffi::CString; 141 | use std::ptr::null_mut; 142 | use std::iter::once; 143 | use std::ffi::OsStr; 144 | use std::os::windows::io::AsRawHandle; 145 | use std::os::windows::ffi::OsStrExt; 146 | use std::os::windows::fs::OpenOptionsExt; 147 | use core::mem::size_of_val; 148 | use winapi::um::errhandlingapi::GetLastError; 149 | use winapi::um::fileapi::{GetDiskFreeSpaceA,SetFileValidData}; 150 | use winapi::um::handleapi::CloseHandle; 151 | use winapi::um::processthreadsapi::{SetThreadIdealProcessor,GetCurrentThread,OpenProcessToken,GetCurrentProcess,SetPriorityClass}; 152 | use winapi::um::securitybaseapi::AdjustTokenPrivileges; 153 | use winapi::um::winbase::LookupPrivilegeValueW; 154 | use winapi::um::winnt::{LUID,TOKEN_ADJUST_PRIVILEGES,TOKEN_PRIVILEGES,LUID_AND_ATTRIBUTES,SE_PRIVILEGE_ENABLED,SE_MANAGE_VOLUME_NAME}; 155 | 156 | const FILE_FLAG_NO_BUFFERING: u32 = 0x2000_0000; 157 | const FILE_FLAG_WRITE_THROUGH: u32 = 0x8000_0000; 158 | const BELOW_NORMAL_PRIORITY_CLASS: u32 = 0x0000_4000; 159 | 160 | pub fn open_using_direct_io>(path: P) -> io::Result { 161 | OpenOptions::new() 162 | .write(true) 163 | .create(true) 164 | .custom_flags(FILE_FLAG_NO_BUFFERING) 165 | .open(path) 166 | } 167 | 168 | pub fn open>(path: P) -> io::Result { 169 | OpenOptions::new() 170 | .write(true) 171 | .create(true) 172 | .custom_flags(FILE_FLAG_WRITE_THROUGH) 173 | .open(path) 174 | } 175 | 176 | pub fn open_r>(path: P) -> io::Result { 177 | OpenOptions::new() 178 | .read(true) 179 | .open(path) 180 | } 181 | 182 | pub fn preallocate(file: &Path, size_in_bytes: u64, use_direct_io: bool) { 183 | let mut result = true; 184 | result &= obtain_priviledge(); 185 | 186 | let file = if use_direct_io { 187 | open_using_direct_io(&file) 188 | } else { 189 | open(&file) 190 | }; 191 | let file = file.unwrap(); 192 | 193 | file.set_len(size_in_bytes).unwrap(); 194 | 195 | if result { 196 | let handle = file.as_raw_handle(); 197 | unsafe{ 198 | let temp = SetFileValidData(handle, size_in_bytes as i64); 199 | result &= temp == 1; 200 | } 201 | } 202 | 203 | if !result { 204 | println!("FAILED, administrative rights missing"); 205 | print!("Slow file pre-allocation..."); 206 | } 207 | } 208 | 209 | pub fn obtain_priviledge() -> bool { 210 | let mut result = true; 211 | 212 | let privilege_encoded: Vec = OsStr::new(SE_MANAGE_VOLUME_NAME) 213 | .encode_wide() 214 | .chain(once(0)) 215 | .collect(); 216 | 217 | let luid = LUID{ 218 | HighPart: 0i32, 219 | LowPart: 0u32 220 | 221 | }; 222 | 223 | unsafe { 224 | let mut htoken = null_mut(); 225 | let mut tp = TOKEN_PRIVILEGES{ 226 | PrivilegeCount: 1, 227 | Privileges: [LUID_AND_ATTRIBUTES{ 228 | Luid: luid, 229 | Attributes: SE_PRIVILEGE_ENABLED, 230 | }] 231 | }; 232 | 233 | let temp = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &mut htoken); 234 | result &= temp == 1; 235 | 236 | let temp = LookupPrivilegeValueW(null_mut(), privilege_encoded.as_ptr(), &mut tp.Privileges[0].Luid); 237 | result &= temp == 1; 238 | 239 | let temp = AdjustTokenPrivileges(htoken, 0, &mut tp, size_of_val(&tp) as u32, null_mut(), null_mut()); 240 | 241 | CloseHandle(htoken); 242 | result &= temp == 1; 243 | result &= 244 | GetLastError() == 0u32 245 | } 246 | result 247 | } 248 | 249 | pub fn get_sector_size(path: &str) -> u64 { 250 | let path_encoded = Path::new(path); 251 | let parent_path_encoded = CString::new(path_encoded.to_str().unwrap()).unwrap(); 252 | let mut sectors_per_cluster = 0u32; 253 | let mut bytes_per_sector = 0u32; 254 | let mut number_of_free_cluster = 0u32; 255 | let mut total_number_of_cluster = 0u32; 256 | if unsafe { 257 | GetDiskFreeSpaceA( 258 | parent_path_encoded.as_ptr(), 259 | &mut sectors_per_cluster, 260 | &mut bytes_per_sector, 261 | &mut number_of_free_cluster, 262 | &mut total_number_of_cluster 263 | ) 264 | } == 0 { 265 | panic!("get sector size, filename={}",path); 266 | }; 267 | u64::from(bytes_per_sector) 268 | } 269 | 270 | pub fn set_thread_ideal_processor(id: usize){ 271 | // Set core affinity for current thread. 272 | unsafe { 273 | SetThreadIdealProcessor( 274 | GetCurrentThread(), 275 | id as u32 276 | ); 277 | } 278 | } 279 | pub fn set_low_prio() { 280 | unsafe{ 281 | SetPriorityClass(GetCurrentProcess(),BELOW_NORMAL_PRIORITY_CLASS); 282 | } 283 | } 284 | pub fn free_disk_space(path: &str) -> u64 { 285 | fs2::available_space(Path::new(&path)).unwrap() 286 | } 287 | } 288 | } 289 | 290 | 291 | -------------------------------------------------------------------------------- /src/c/noncegen_256_avx2.c: -------------------------------------------------------------------------------- 1 | #include "noncegen_256_avx2.h" 2 | #include 3 | #include 4 | #include "common.h" 5 | #include "mshabal_256_avx2.h" 6 | #include "sph_shabal.h" 7 | 8 | sph_shabal_context global_32; 9 | mshabal256_context global_256; 10 | mshabal256_context_fast global_256_fast; 11 | 12 | void init_shabal_avx2() { 13 | sph_shabal256_init(&global_32); 14 | mshabal_init_avx2(&global_256, 256); 15 | global_256_fast.out_size = global_256.out_size; 16 | for (int i = 0; i < 352; i++) global_256_fast.state[i] = global_256.state[i]; 17 | global_256_fast.Whigh = global_256.Whigh; 18 | global_256_fast.Wlow = global_256.Wlow; 19 | } 20 | 21 | // cache: cache to save to 22 | // local_num: thread number 23 | // numeric_id: numeric account id 24 | // loc_startnonce nonce to start generation at 25 | // local_nonces: number of nonces to generate 26 | void noncegen_avx2(char *cache, const size_t cache_size, const size_t chunk_offset, 27 | const uint64_t numeric_id, const uint64_t local_startnonce, 28 | const uint64_t local_nonces) { 29 | sph_shabal_context local_32; 30 | uint64_t nonce; 31 | size_t len; 32 | 33 | mshabal256_context_fast local_256_fast; 34 | uint64_t nonce1, nonce2, nonce3, nonce4, nonce5, nonce6, nonce7, nonce8; 35 | 36 | char seed[32]; // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero 37 | char term[32]; // 1bit 1, 255bit of zeros 38 | char zero[32]; // 256bit of zeros 39 | 40 | write_seed(seed, numeric_id); 41 | write_term(term); 42 | memset(&zero[0], 0, 32); 43 | 44 | //vars shared 45 | uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL256_VECTOR_SIZE * NONCE_SIZE); 46 | uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL256_VECTOR_SIZE * HASH_SIZE); 47 | 48 | // prepare smart SIMD aligned termination strings 49 | // creation could further be optimized, but not much in it as it only runs once per work package 50 | // creation could also be moved to plotter start 51 | union { 52 | mshabal_u32 words[16 * MSHABAL256_VECTOR_SIZE]; 53 | __m256i data[16]; 54 | } t1, t2, t3; 55 | 56 | for (int j = 0; j < 16 * MSHABAL256_VECTOR_SIZE / 2; j += MSHABAL256_VECTOR_SIZE) { 57 | size_t o = j / 2; 58 | // t1 59 | t1.words[j + 0] = *(mshabal_u32 *)(seed + o); 60 | t1.words[j + 1] = *(mshabal_u32 *)(seed + o); 61 | t1.words[j + 2] = *(mshabal_u32 *)(seed + o); 62 | t1.words[j + 3] = *(mshabal_u32 *)(seed + o); 63 | t1.words[j + 4] = *(mshabal_u32 *)(seed + o); 64 | t1.words[j + 5] = *(mshabal_u32 *)(seed + o); 65 | t1.words[j + 6] = *(mshabal_u32 *)(seed + o); 66 | t1.words[j + 7] = *(mshabal_u32 *)(seed + o); 67 | t1.words[j + 0 + 64] = *(mshabal_u32 *)(zero + o); 68 | t1.words[j + 1 + 64] = *(mshabal_u32 *)(zero + o); 69 | t1.words[j + 2 + 64] = *(mshabal_u32 *)(zero + o); 70 | t1.words[j + 3 + 64] = *(mshabal_u32 *)(zero + o); 71 | t1.words[j + 4 + 64] = *(mshabal_u32 *)(zero + o); 72 | t1.words[j + 5 + 64] = *(mshabal_u32 *)(zero + o); 73 | t1.words[j + 6 + 64] = *(mshabal_u32 *)(zero + o); 74 | t1.words[j + 7 + 64] = *(mshabal_u32 *)(zero + o); 75 | // t2 76 | // (first 256bit skipped, will later be filled with data) 77 | t2.words[j + 0 + 64] = *(mshabal_u32 *)(seed + o); 78 | t2.words[j + 1 + 64] = *(mshabal_u32 *)(seed + o); 79 | t2.words[j + 2 + 64] = *(mshabal_u32 *)(seed + o); 80 | t2.words[j + 3 + 64] = *(mshabal_u32 *)(seed + o); 81 | t2.words[j + 4 + 64] = *(mshabal_u32 *)(seed + o); 82 | t2.words[j + 5 + 64] = *(mshabal_u32 *)(seed + o); 83 | t2.words[j + 6 + 64] = *(mshabal_u32 *)(seed + o); 84 | t2.words[j + 7 + 64] = *(mshabal_u32 *)(seed + o); 85 | // t3 86 | t3.words[j + 0] = *(mshabal_u32 *)(term + o); 87 | t3.words[j + 1] = *(mshabal_u32 *)(term + o); 88 | t3.words[j + 2] = *(mshabal_u32 *)(term + o); 89 | t3.words[j + 3] = *(mshabal_u32 *)(term + o); 90 | t3.words[j + 4] = *(mshabal_u32 *)(term + o); 91 | t3.words[j + 5] = *(mshabal_u32 *)(term + o); 92 | t3.words[j + 6] = *(mshabal_u32 *)(term + o); 93 | t3.words[j + 7] = *(mshabal_u32 *)(term + o); 94 | t3.words[j + 0 + 64] = *(mshabal_u32 *)(zero + o); 95 | t3.words[j + 1 + 64] = *(mshabal_u32 *)(zero + o); 96 | t3.words[j + 2 + 64] = *(mshabal_u32 *)(zero + o); 97 | t3.words[j + 3 + 64] = *(mshabal_u32 *)(zero + o); 98 | t3.words[j + 4 + 64] = *(mshabal_u32 *)(zero + o); 99 | t3.words[j + 5 + 64] = *(mshabal_u32 *)(zero + o); 100 | t3.words[j + 6 + 64] = *(mshabal_u32 *)(zero + o); 101 | t3.words[j + 7 + 64] = *(mshabal_u32 *)(zero + o); 102 | } 103 | 104 | for (uint64_t n = 0; n < local_nonces;) { 105 | // iterate nonces (8 per cycle - avx2) 106 | // min 8 nonces left for avx 2 processing, otherwise SISD 107 | if (n + 8 <= local_nonces) { 108 | // generate nonce numbers & change endianness 109 | nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0)); 110 | nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1)); 111 | nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2)); 112 | nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3)); 113 | nonce5 = bswap_64((uint64_t)(local_startnonce + n + 4)); 114 | nonce6 = bswap_64((uint64_t)(local_startnonce + n + 5)); 115 | nonce7 = bswap_64((uint64_t)(local_startnonce + n + 6)); 116 | nonce8 = bswap_64((uint64_t)(local_startnonce + n + 7)); 117 | 118 | // store nonce numbers in relevant termination strings 119 | for (int j = 16; j < 16 * MSHABAL256_VECTOR_SIZE / 4; j += MSHABAL256_VECTOR_SIZE) { 120 | size_t o = j / 2 - 8; 121 | // t1 122 | t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o); 123 | t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o); 124 | t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o); 125 | t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o); 126 | t1.words[j + 4] = *(mshabal_u32 *)((char *)&nonce5 + o); 127 | t1.words[j + 5] = *(mshabal_u32 *)((char *)&nonce6 + o); 128 | t1.words[j + 6] = *(mshabal_u32 *)((char *)&nonce7 + o); 129 | t1.words[j + 7] = *(mshabal_u32 *)((char *)&nonce8 + o); 130 | t2.words[j + 0 + 64] = *(mshabal_u32 *)((char *)&nonce1 + o); 131 | t2.words[j + 1 + 64] = *(mshabal_u32 *)((char *)&nonce2 + o); 132 | t2.words[j + 2 + 64] = *(mshabal_u32 *)((char *)&nonce3 + o); 133 | t2.words[j + 3 + 64] = *(mshabal_u32 *)((char *)&nonce4 + o); 134 | t2.words[j + 4 + 64] = *(mshabal_u32 *)((char *)&nonce5 + o); 135 | t2.words[j + 5 + 64] = *(mshabal_u32 *)((char *)&nonce6 + o); 136 | t2.words[j + 6 + 64] = *(mshabal_u32 *)((char *)&nonce7 + o); 137 | t2.words[j + 7 + 64] = *(mshabal_u32 *)((char *)&nonce8 + o); 138 | } 139 | 140 | // start shabal round 141 | 142 | // 3 cases: first 128 rounds uses case 1 or 2, after that case 3 143 | // case 1: first 128 rounds, hashes are even: use termination string 1 144 | // case 2: first 128 rounds, hashes are odd: use termination string 2 145 | // case 3: round > 128: use termination string 3 146 | 147 | // round 1 148 | memcpy(&local_256_fast, &global_256_fast, 149 | sizeof(global_256_fast)); // fast initialize shabal 150 | 151 | mshabal_hash_fast_avx2( 152 | &local_256_fast, NULL, &t1, 153 | &buffer[MSHABAL256_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6); 154 | 155 | // store first hash into smart termination string 2 (data is vectored and SIMD aligned) 156 | memcpy(&t2, &buffer[MSHABAL256_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 157 | MSHABAL256_VECTOR_SIZE * (HASH_SIZE)); 158 | 159 | // round 2 - 128 160 | for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) { 161 | // check if msg can be divided into 512bit packages without a 162 | // remainder 163 | if (i % 64 == 0) { 164 | // last msg = seed + termination 165 | mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE], 166 | &t1, 167 | &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE], 168 | (NONCE_SIZE + 16 - i) >> 6); 169 | } else { 170 | // last msg = 256 bit data + seed + termination 171 | mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE], 172 | &t2, 173 | &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE], 174 | (NONCE_SIZE + 16 - i) >> 6); 175 | } 176 | } 177 | 178 | // round 128-8192 179 | for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) { 180 | mshabal_hash_fast_avx2(&local_256_fast, &buffer[i * MSHABAL256_VECTOR_SIZE], &t3, 181 | &buffer[(i - HASH_SIZE) * MSHABAL256_VECTOR_SIZE], 182 | (HASH_CAP) >> 6); 183 | } 184 | 185 | // generate final hash 186 | mshabal_hash_fast_avx2(&local_256_fast, &buffer[0], &t1, &final[0], 187 | (NONCE_SIZE + 16) >> 6); 188 | 189 | // XOR using SIMD 190 | // load final hash 191 | __m256i F[8]; 192 | for (int j = 0; j < 8; j++) F[j] = _mm256_loadu_si256((__m256i *)final + j); 193 | // xor all hashes with final hash 194 | for (int j = 0; j < 8 * 2 * HASH_CAP; j++) 195 | _mm256_storeu_si256( 196 | (__m256i *)buffer + j, 197 | _mm256_xor_si256(_mm256_loadu_si256((__m256i *)buffer + j), F[j % 8])); 198 | 199 | // todo: fork SIMD aligned plot file here 200 | 201 | // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache 202 | for (int i = 0; i < NUM_SCOOPS * 2; i++) { 203 | for (int j = 0; j < 32; j += 4) { 204 | for (int k = 0; k < MSHABAL256_VECTOR_SIZE; k += 1) { 205 | memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) * 206 | SCOOP_SIZE * cache_size + 207 | (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j], 208 | &buffer[(i * 32 + j) * MSHABAL256_VECTOR_SIZE + k * 4], 4); 209 | } 210 | } 211 | } 212 | 213 | n += 8; 214 | } else { 215 | // if less than 8 nonces left, use 1d-shabal 216 | int8_t *xv = (int8_t *)&numeric_id; 217 | 218 | for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i]; 219 | 220 | nonce = local_startnonce + n; 221 | xv = (int8_t *)&nonce; 222 | 223 | for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i]; 224 | 225 | for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) { 226 | memcpy(&local_32, &global_32, sizeof(global_32)); 227 | ; 228 | if (i < NONCE_SIZE + 16 - HASH_CAP) 229 | len = HASH_CAP; 230 | else 231 | len = NONCE_SIZE + 16 - i; 232 | 233 | sph_shabal256(&local_32, &buffer[i], len); 234 | sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]); 235 | } 236 | 237 | memcpy(&local_32, &global_32, sizeof(global_32)); 238 | sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE); 239 | sph_shabal256_close(&local_32, final); 240 | 241 | // XOR with final 242 | for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]); 243 | 244 | // Sort them PoC2: 245 | for (size_t i = 0; i < HASH_CAP; i++){ 246 | memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE); 247 | memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE); 248 | } 249 | n++; 250 | } 251 | } 252 | free(buffer); 253 | free(final); 254 | } 255 | -------------------------------------------------------------------------------- /src/plotter.rs: -------------------------------------------------------------------------------- 1 | use humanize_rs::bytes::Bytes; 2 | use pbr::{MultiBar, Units}; 3 | use raw_cpuid::CpuId; 4 | 5 | use crate::cpu_hasher::{SimdExtension,init_simd}; 6 | use crate::buffer::PageAlignedByteBuffer; 7 | #[cfg(feature = "opencl")] 8 | use crate::ocl::gpu_get_info; 9 | use crate::scheduler::create_scheduler_thread; 10 | #[cfg(windows)] 11 | use crate::utils::set_thread_ideal_processor; 12 | use crate::utils::{free_disk_space, get_sector_size, preallocate}; 13 | use crate::writer::{create_writer_thread, read_resume_info, write_resume_info}; 14 | use core_affinity; 15 | use crossbeam_channel::bounded; 16 | use std::cmp::{max, min}; 17 | use std::path::Path; 18 | use std::process; 19 | use std::sync::Arc; 20 | use std::thread; 21 | use stopwatch::Stopwatch; 22 | 23 | pub const SCOOP_SIZE: u64 = 64; 24 | pub const NUM_SCOOPS: u64 = 4096; 25 | pub const NONCE_SIZE: u64 = SCOOP_SIZE * NUM_SCOOPS; 26 | 27 | pub struct Plotter {} 28 | 29 | pub struct PlotterTask { 30 | pub numeric_id: u64, 31 | pub start_nonce: u64, 32 | pub nonces: u64, 33 | pub output_path: String, 34 | pub mem: String, 35 | pub cpu_threads: u8, 36 | pub gpus: Option>, 37 | pub direct_io: bool, 38 | pub async_io: bool, 39 | pub quiet: bool, 40 | pub benchmark: bool, 41 | pub zcb: bool, 42 | } 43 | 44 | impl Plotter { 45 | pub fn new() -> Plotter { 46 | Plotter {} 47 | } 48 | 49 | pub fn run(self, mut task: PlotterTask) { 50 | let cpuid = CpuId::new(); 51 | let cpu_name = cpuid.get_extended_function_info().unwrap(); 52 | let cpu_name = cpu_name.processor_brand_string().unwrap().trim(); 53 | let cores = sys_info::cpu_num().unwrap(); 54 | let memory = sys_info::mem_info().unwrap(); 55 | 56 | let simd_ext = init_simd(); 57 | 58 | if !task.quiet { 59 | println!("Engraver {} - PoC2 Plotter\n", crate_version!()); 60 | } 61 | 62 | if !task.quiet && task.benchmark { 63 | println!("*BENCHMARK MODE*\n"); 64 | } 65 | 66 | if !task.quiet { 67 | println!( 68 | "CPU: {} [using {} of {} cores{}{:?}]", 69 | cpu_name, 70 | task.cpu_threads, 71 | cores, 72 | if let SimdExtension::None = &simd_ext { "" } else { " + " }, 73 | &simd_ext 74 | ); 75 | } 76 | 77 | #[cfg(not(feature = "opencl"))] 78 | let gpu_mem_needed = 0u64; 79 | #[cfg(feature = "opencl")] 80 | let gpu_mem_needed = match &task.gpus { 81 | Some(x) => gpu_get_info(&x, task.quiet), 82 | None => 0, 83 | }; 84 | 85 | #[cfg(feature = "opencl")] 86 | let gpu_mem_needed = if task.zcb { 87 | gpu_mem_needed 88 | } else { 89 | gpu_mem_needed / 2 90 | }; 91 | 92 | // use all avaiblable disk space if nonce parameter has been omitted 93 | let free_disk_space = free_disk_space(&task.output_path); 94 | if task.nonces == 0 { 95 | task.nonces = free_disk_space / NONCE_SIZE; 96 | } 97 | 98 | let gpu = task.gpus.is_some(); 99 | 100 | // align number of nonces with sector size if direct i/o 101 | let mut rounded_nonces_to_sector_size = false; 102 | let mut nonces_per_sector = 1; 103 | if task.direct_io { 104 | let sector_size = get_sector_size(&task.output_path); 105 | nonces_per_sector = sector_size / SCOOP_SIZE; 106 | if task.nonces % nonces_per_sector > 0 { 107 | rounded_nonces_to_sector_size = true; 108 | task.nonces /= nonces_per_sector; 109 | task.nonces *= nonces_per_sector; 110 | } 111 | } 112 | 113 | let plotsize = task.nonces * NONCE_SIZE; 114 | 115 | let file = Path::new(&task.output_path).join(format!( 116 | "{}_{}_{}", 117 | task.numeric_id, task.start_nonce, task.nonces 118 | )); 119 | 120 | if !file.parent().unwrap().exists() { 121 | println!( 122 | "Error: specified target path does not exist, path={}", 123 | &task.output_path 124 | ); 125 | println!("Shutting down..."); 126 | return; 127 | } 128 | 129 | // check available disk space 130 | if free_disk_space < plotsize && !file.exists() && !task.benchmark { 131 | println!( 132 | "Error: insufficient disk space, MiB_required={:.2}, MiB_available={:.2}", 133 | plotsize as f64 / 1024.0 / 1024.0, 134 | free_disk_space as f64 / 1024.0 / 1024.0 135 | ); 136 | println!("Shutting down..."); 137 | return; 138 | } 139 | 140 | // calculate memory usage 141 | let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector, gpu, gpu_mem_needed) 142 | { 143 | Ok(x) => x, 144 | Err(_) => return, 145 | }; 146 | 147 | if !task.quiet { 148 | println!( 149 | "RAM: Total={:.2} GiB, Free={:.2} GiB, Usage={:.2} GiB", 150 | memory.total as f64 / 1024.0 / 1024.0, 151 | get_avail_mem(&memory) as f64 / 1024.0 / 1024.0, 152 | (mem + gpu_mem_needed) as f64 / 1024.0 / 1024.0 / 1024.0 153 | ); 154 | 155 | #[cfg(feature = "opencl")] 156 | println!( 157 | " HDDcache={:.2} GiB, GPUcache={:.2} GiB,\n", 158 | mem as f64 / 1024.0 / 1024.0 / 1024.0, 159 | gpu_mem_needed as f64 / 1024.0 / 1024.0 / 1024.0 160 | ); 161 | 162 | println!("Numeric ID: {}", task.numeric_id); 163 | println!("Start Nonce: {}", task.start_nonce); 164 | println!( 165 | "Nonces: {}{}", 166 | task.nonces, 167 | if rounded_nonces_to_sector_size { 168 | &" (rounded to sector size for fast direct i/o)" 169 | } else { 170 | &"" 171 | } 172 | ); 173 | } 174 | 175 | if !task.quiet { 176 | println!("Output File: {}\n", file.display()); 177 | } 178 | let mut progress = 0; 179 | if file.exists() { 180 | if !task.quiet { 181 | println!("File already exists, reading resume info..."); 182 | } 183 | let resume_info = read_resume_info(&file); 184 | match resume_info { 185 | Ok(x) => progress = x, 186 | Err(_) => { 187 | println!("Error: couldn't read resume info from file '{}'", file.display()); 188 | println!("If you are sure that this file is incomplete \ 189 | or corrupted, then delete it before continuing."); 190 | println!("Shutting Down..."); 191 | return; 192 | } 193 | } 194 | if !task.quiet { 195 | println!("OK"); 196 | } 197 | } else { 198 | if !task.quiet { 199 | print!("Fast file pre-allocation..."); 200 | } 201 | if !task.benchmark { 202 | preallocate(&file, plotsize, task.direct_io); 203 | if write_resume_info(&file, 0u64).is_err() { 204 | println!("Error: couldn't write resume info"); 205 | } 206 | } 207 | if !task.quiet { 208 | println!("OK"); 209 | } 210 | } 211 | 212 | if !task.quiet { 213 | if progress == 0 { 214 | println!("Starting plotting...\n"); 215 | } else { 216 | println!("Resuming plotting from nonce offset {}...\n", progress); 217 | } 218 | } 219 | 220 | // determine buffer size 221 | let num_buffer = if task.async_io { 2 } else { 1 }; 222 | let buffer_size = mem / num_buffer; 223 | let (tx_empty_buffers, rx_empty_buffers) = bounded(num_buffer as usize); 224 | let (tx_full_buffers, rx_full_buffers) = bounded(num_buffer as usize); 225 | 226 | for _ in 0..num_buffer { 227 | let buffer = PageAlignedByteBuffer::new(buffer_size as usize); 228 | tx_empty_buffers.send(buffer).unwrap(); 229 | } 230 | 231 | let mb = MultiBar::new(); 232 | 233 | let p1x = if !task.quiet { 234 | let mut p1 = mb.create_bar(plotsize - progress * NONCE_SIZE); 235 | p1.format("│██░│"); 236 | p1.set_units(Units::Bytes); 237 | p1.message("Hashing: "); 238 | p1.show_counter = false; 239 | p1.set(0); 240 | Some(p1) 241 | } else { 242 | None 243 | }; 244 | 245 | let p2x = if !task.quiet { 246 | let mut p2 = mb.create_bar(plotsize - progress * NONCE_SIZE); 247 | p2.format("│██░│"); 248 | p2.set_units(Units::Bytes); 249 | p2.message("Writing: "); 250 | p2.show_counter = false; 251 | p2.set(0); 252 | Some(p2) 253 | } else { 254 | None 255 | }; 256 | 257 | let sw = Stopwatch::start_new(); 258 | let task = Arc::new(task); 259 | 260 | // hi bold! might make this optional in future releases. 261 | let thread_pinning = true; 262 | let core_ids = if thread_pinning { 263 | core_affinity::get_core_ids().unwrap() 264 | } else { 265 | Vec::new() 266 | }; 267 | 268 | let hasher = thread::spawn({ 269 | create_scheduler_thread( 270 | task.clone(), 271 | rayon::ThreadPoolBuilder::new() 272 | .num_threads(task.cpu_threads as usize) 273 | .start_handler(move |id| { 274 | if thread_pinning { 275 | #[cfg(not(windows))] 276 | let core_id = core_ids[id % core_ids.len()]; 277 | #[cfg(not(windows))] 278 | core_affinity::set_for_current(core_id); 279 | #[cfg(windows)] 280 | set_thread_ideal_processor(id % core_ids.len()); 281 | } 282 | }) 283 | .build() 284 | .unwrap(), 285 | progress, 286 | p1x, 287 | rx_empty_buffers.clone(), 288 | tx_full_buffers.clone(), 289 | simd_ext, 290 | ) 291 | }); 292 | 293 | let writer = thread::spawn({ 294 | create_writer_thread( 295 | task.clone(), 296 | progress, 297 | p2x, 298 | rx_full_buffers.clone(), 299 | tx_empty_buffers.clone(), 300 | ) 301 | }); 302 | 303 | if !task.quiet { 304 | mb.listen(); 305 | } 306 | writer.join().unwrap(); 307 | hasher.join().unwrap(); 308 | 309 | let elapsed = sw.elapsed_ms() as u64; 310 | let hours = elapsed / 1000 / 60 / 60; 311 | let minutes = elapsed / 1000 / 60 - hours * 60; 312 | let seconds = elapsed / 1000 - hours * 60 * 60 - minutes * 60; 313 | 314 | if !task.quiet { 315 | println!( 316 | "\nGenerated {} nonces in {}h{:02}m{:02}s, {:.2} MiB/s, {:.0} nonces/m.", 317 | task.nonces - progress, 318 | hours, 319 | minutes, 320 | seconds, 321 | (task.nonces - progress) as f64 * 1000.0 / (elapsed as f64 + 1.0) / 4.0, 322 | (task.nonces - progress) as f64 * 1000.0 / (elapsed as f64 + 1.0) * 60.0 323 | ); 324 | } 325 | } 326 | } 327 | 328 | fn calculate_mem_to_use( 329 | task: &PlotterTask, 330 | memory: &sys_info::MemInfo, 331 | nonces_per_sector: u64, 332 | gpu: bool, 333 | gpu_mem_needed: u64, 334 | ) -> Result { 335 | let plotsize = task.nonces * NONCE_SIZE; 336 | 337 | let mut mem = match task.mem.parse::() { 338 | Ok(x) => x.size() as u64, 339 | Err(_) => { 340 | println!( 341 | "Error: Can't parse memory limit parameter, input={}", 342 | task.mem, 343 | ); 344 | println!("\nPlease specify a number followed by a unit. If no unit is provided, bytes will be assumed."); 345 | println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB"); 346 | println!("Example: --mem 10GiB\n"); 347 | println!("Shutting down..."); 348 | return Err("invalid unit"); 349 | } 350 | }; 351 | if gpu && mem > 0 && mem < gpu_mem_needed + nonces_per_sector * NONCE_SIZE { 352 | println!("Error: Insufficient host memory for GPU plotting!"); 353 | println!("Shutting down..."); 354 | process::exit(0); 355 | } 356 | 357 | if gpu && mem > 0 { 358 | mem -= gpu_mem_needed; 359 | } 360 | 361 | if mem == 0 { 362 | mem = plotsize; 363 | } 364 | mem = min(mem, plotsize + gpu_mem_needed); 365 | 366 | // opencl requires buffer to be a multiple of 16 (data coalescence magic) 367 | let nonces_per_sector = if gpu { 368 | max(16, nonces_per_sector) 369 | } else { 370 | nonces_per_sector 371 | }; 372 | 373 | // don't exceed free memory and leave some elbow room 1-1000/1024 374 | mem = min(mem, get_avail_mem(&memory) * 1000 - gpu_mem_needed); 375 | 376 | // rounding single/double buffer 377 | let num_buffer = if task.async_io { 2 } else { 1 }; 378 | mem /= num_buffer * NONCE_SIZE * nonces_per_sector; 379 | mem *= num_buffer * NONCE_SIZE * nonces_per_sector; 380 | 381 | // ensure a minimum buffer 382 | mem = max(mem, num_buffer * NONCE_SIZE * nonces_per_sector); 383 | Ok(mem) 384 | } 385 | 386 | // sys_info ex, displays 0 avail on win 387 | #[cfg(not(windows))] 388 | fn get_avail_mem(memory: &sys_info::MemInfo) -> u64 { 389 | memory.avail 390 | } 391 | 392 | #[cfg(windows)] 393 | fn get_avail_mem(memory: &sys_info::MemInfo) -> u64 { 394 | memory.free 395 | } 396 | -------------------------------------------------------------------------------- /src/c/noncegen_512_avx512f.c: -------------------------------------------------------------------------------- 1 | #include "noncegen_512_avx512f.h" 2 | #include 3 | #include 4 | #include "common.h" 5 | #include "mshabal_512_avx512f.h" 6 | #include "sph_shabal.h" 7 | 8 | sph_shabal_context global_32; 9 | mshabal512_context global_512; 10 | mshabal512_context_fast global_512_fast; 11 | 12 | void init_shabal_avx512f() { 13 | sph_shabal256_init(&global_32); 14 | mshabal_init_avx512f(&global_512, 256); 15 | global_512_fast.out_size = global_512.out_size; 16 | for (int i = 0; i < 704; i++) global_512_fast.state[i] = global_512.state[i]; 17 | global_512_fast.Whigh = global_512.Whigh; 18 | global_512_fast.Wlow = global_512.Wlow; 19 | } 20 | 21 | // cache: cache to save to 22 | // local_num: thread number 23 | // numeric_id: numeric account id 24 | // loc_startnonce nonce to start generation at 25 | // local_nonces: number of nonces to generate 26 | void noncegen_avx512(char *cache, const size_t cache_size, const size_t chunk_offset, 27 | const uint64_t numeric_id, const uint64_t local_startnonce, 28 | const uint64_t local_nonces) { 29 | sph_shabal_context local_32; 30 | uint64_t nonce; 31 | size_t len; 32 | 33 | mshabal512_context_fast local_512_fast; 34 | uint64_t nonce1, nonce2, nonce3, nonce4, nonce5, nonce6, nonce7, nonce8, nonce9, nonce10, nonce11, nonce12, nonce13, nonce14, nonce15, nonce16; 35 | 36 | char seed[32]; // 64bit numeric account ID, 64bit nonce (blank), 1bit termination, 127 bits zero 37 | char term[32]; // 1bit 1, 255bit of zeros 38 | char zero[32]; // 256bit of zeros 39 | 40 | //vars shared 41 | uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL512_VECTOR_SIZE * NONCE_SIZE); 42 | uint8_t* final = (uint8_t*)malloc(sizeof(uint8_t) * MSHABAL512_VECTOR_SIZE * HASH_SIZE); 43 | 44 | write_seed(seed, numeric_id); 45 | write_term(term); 46 | memset(&zero[0], 0, 32); 47 | 48 | // prepare smart SIMD aligned termination strings 49 | // creation could further be optimized, but not much in it as it only runs once per work package 50 | // creation could also be moved to plotter start 51 | union { 52 | mshabal_u32 words[16 * MSHABAL512_VECTOR_SIZE]; 53 | __m512i data[16]; 54 | } t1, t2, t3; 55 | 56 | for (int j = 0; j < 16 * MSHABAL512_VECTOR_SIZE / 2; j += MSHABAL512_VECTOR_SIZE) { 57 | size_t o = j / 4; 58 | // t1 59 | t1.words[j + 0] = *(mshabal_u32 *)(seed + o); 60 | t1.words[j + 1] = *(mshabal_u32 *)(seed + o); 61 | t1.words[j + 2] = *(mshabal_u32 *)(seed + o); 62 | t1.words[j + 3] = *(mshabal_u32 *)(seed + o); 63 | t1.words[j + 4] = *(mshabal_u32 *)(seed + o); 64 | t1.words[j + 5] = *(mshabal_u32 *)(seed + o); 65 | t1.words[j + 6] = *(mshabal_u32 *)(seed + o); 66 | t1.words[j + 7] = *(mshabal_u32 *)(seed + o); 67 | t1.words[j + 8] = *(mshabal_u32 *)(seed + o); 68 | t1.words[j + 9] = *(mshabal_u32 *)(seed + o); 69 | t1.words[j + 10] = *(mshabal_u32 *)(seed + o); 70 | t1.words[j + 11] = *(mshabal_u32 *)(seed + o); 71 | t1.words[j + 12] = *(mshabal_u32 *)(seed + o); 72 | t1.words[j + 13] = *(mshabal_u32 *)(seed + o); 73 | t1.words[j + 14] = *(mshabal_u32 *)(seed + o); 74 | t1.words[j + 15] = *(mshabal_u32 *)(seed + o); 75 | t1.words[j + 0 + 128] = *(mshabal_u32 *)(zero + o); 76 | t1.words[j + 1 + 128] = *(mshabal_u32 *)(zero + o); 77 | t1.words[j + 2 + 128] = *(mshabal_u32 *)(zero + o); 78 | t1.words[j + 3 + 128] = *(mshabal_u32 *)(zero + o); 79 | t1.words[j + 4 + 128] = *(mshabal_u32 *)(zero + o); 80 | t1.words[j + 5 + 128] = *(mshabal_u32 *)(zero + o); 81 | t1.words[j + 6 + 128] = *(mshabal_u32 *)(zero + o); 82 | t1.words[j + 7 + 128] = *(mshabal_u32 *)(zero + o); 83 | t1.words[j + 8 + 128] = *(mshabal_u32 *)(zero + o); 84 | t1.words[j + 9 + 128] = *(mshabal_u32 *)(zero + o); 85 | t1.words[j + 10 + 128] = *(mshabal_u32 *)(zero + o); 86 | t1.words[j + 11 + 128] = *(mshabal_u32 *)(zero + o); 87 | t1.words[j + 12 + 128] = *(mshabal_u32 *)(zero + o); 88 | t1.words[j + 13 + 128] = *(mshabal_u32 *)(zero + o); 89 | t1.words[j + 14 + 128] = *(mshabal_u32 *)(zero + o); 90 | t1.words[j + 15 + 128] = *(mshabal_u32 *)(zero + o); 91 | // t2 92 | // (first 256bit skipped, will later be filled with data) 93 | t2.words[j + 0 + 128] = *(mshabal_u32 *)(seed + o); 94 | t2.words[j + 1 + 128] = *(mshabal_u32 *)(seed + o); 95 | t2.words[j + 2 + 128] = *(mshabal_u32 *)(seed + o); 96 | t2.words[j + 3 + 128] = *(mshabal_u32 *)(seed + o); 97 | t2.words[j + 4 + 128] = *(mshabal_u32 *)(seed + o); 98 | t2.words[j + 5 + 128] = *(mshabal_u32 *)(seed + o); 99 | t2.words[j + 6 + 128] = *(mshabal_u32 *)(seed + o); 100 | t2.words[j + 7 + 128] = *(mshabal_u32 *)(seed + o); 101 | t2.words[j + 8 + 128] = *(mshabal_u32 *)(seed + o); 102 | t2.words[j + 9 + 128] = *(mshabal_u32 *)(seed + o); 103 | t2.words[j + 10 + 128] = *(mshabal_u32 *)(seed + o); 104 | t2.words[j + 11 + 128] = *(mshabal_u32 *)(seed + o); 105 | t2.words[j + 12 + 128] = *(mshabal_u32 *)(seed + o); 106 | t2.words[j + 13 + 128] = *(mshabal_u32 *)(seed + o); 107 | t2.words[j + 14 + 128] = *(mshabal_u32 *)(seed + o); 108 | t2.words[j + 15 + 128] = *(mshabal_u32 *)(seed + o); 109 | // t3 110 | t3.words[j + 0] = *(mshabal_u32 *)(term + o); 111 | t3.words[j + 1] = *(mshabal_u32 *)(term + o); 112 | t3.words[j + 2] = *(mshabal_u32 *)(term + o); 113 | t3.words[j + 3] = *(mshabal_u32 *)(term + o); 114 | t3.words[j + 4] = *(mshabal_u32 *)(term + o); 115 | t3.words[j + 5] = *(mshabal_u32 *)(term + o); 116 | t3.words[j + 6] = *(mshabal_u32 *)(term + o); 117 | t3.words[j + 7] = *(mshabal_u32 *)(term + o); 118 | t3.words[j + 8] = *(mshabal_u32 *)(term + o); 119 | t3.words[j + 9] = *(mshabal_u32 *)(term + o); 120 | t3.words[j + 10] = *(mshabal_u32 *)(term + o); 121 | t3.words[j + 11] = *(mshabal_u32 *)(term + o); 122 | t3.words[j + 12] = *(mshabal_u32 *)(term + o); 123 | t3.words[j + 13] = *(mshabal_u32 *)(term + o); 124 | t3.words[j + 14] = *(mshabal_u32 *)(term + o); 125 | t3.words[j + 15] = *(mshabal_u32 *)(term + o); 126 | 127 | t3.words[j + 0 + 128] = *(mshabal_u32 *)(zero + o); 128 | t3.words[j + 1 + 128] = *(mshabal_u32 *)(zero + o); 129 | t3.words[j + 2 + 128] = *(mshabal_u32 *)(zero + o); 130 | t3.words[j + 3 + 128] = *(mshabal_u32 *)(zero + o); 131 | t3.words[j + 4 + 128] = *(mshabal_u32 *)(zero + o); 132 | t3.words[j + 5 + 128] = *(mshabal_u32 *)(zero + o); 133 | t3.words[j + 6 + 128] = *(mshabal_u32 *)(zero + o); 134 | t3.words[j + 7 + 128] = *(mshabal_u32 *)(zero + o); 135 | t3.words[j + 8 + 128] = *(mshabal_u32 *)(zero + o); 136 | t3.words[j + 9 + 128] = *(mshabal_u32 *)(zero + o); 137 | t3.words[j + 10 + 128] = *(mshabal_u32 *)(zero + o); 138 | t3.words[j + 11 + 128] = *(mshabal_u32 *)(zero + o); 139 | t3.words[j + 12 + 128] = *(mshabal_u32 *)(zero + o); 140 | t3.words[j + 13 + 128] = *(mshabal_u32 *)(zero + o); 141 | t3.words[j + 14 + 128] = *(mshabal_u32 *)(zero + o); 142 | t3.words[j + 15 + 128] = *(mshabal_u32 *)(zero + o); 143 | } 144 | 145 | for (uint64_t n = 0; n < local_nonces;) { 146 | // iterate nonces (16 per cycle - avx512) 147 | // min 16 nonces left for avx512 processing, otherwise SISD 148 | if (n + 16 <= local_nonces) { 149 | // generate nonce numbers & change endianness 150 | nonce1 = bswap_64((uint64_t)(local_startnonce + n + 0)); 151 | nonce2 = bswap_64((uint64_t)(local_startnonce + n + 1)); 152 | nonce3 = bswap_64((uint64_t)(local_startnonce + n + 2)); 153 | nonce4 = bswap_64((uint64_t)(local_startnonce + n + 3)); 154 | nonce5 = bswap_64((uint64_t)(local_startnonce + n + 4)); 155 | nonce6 = bswap_64((uint64_t)(local_startnonce + n + 5)); 156 | nonce7 = bswap_64((uint64_t)(local_startnonce + n + 6)); 157 | nonce8 = bswap_64((uint64_t)(local_startnonce + n + 7)); 158 | nonce9 = bswap_64((uint64_t)(local_startnonce + n + 8)); 159 | nonce10 = bswap_64((uint64_t)(local_startnonce + n + 9)); 160 | nonce11 = bswap_64((uint64_t)(local_startnonce + n + 10)); 161 | nonce12 = bswap_64((uint64_t)(local_startnonce + n + 11)); 162 | nonce13 = bswap_64((uint64_t)(local_startnonce + n + 12)); 163 | nonce14 = bswap_64((uint64_t)(local_startnonce + n + 13)); 164 | nonce15 = bswap_64((uint64_t)(local_startnonce + n + 14)); 165 | nonce16 = bswap_64((uint64_t)(local_startnonce + n + 15)); 166 | 167 | // store nonce numbers in relevant termination strings 168 | for (int j = 32; j < 16 * MSHABAL512_VECTOR_SIZE / 4; j += MSHABAL512_VECTOR_SIZE) { 169 | size_t o = j / 4 - 8; 170 | // t1 171 | t1.words[j + 0] = *(mshabal_u32 *)((char *)&nonce1 + o); 172 | t1.words[j + 1] = *(mshabal_u32 *)((char *)&nonce2 + o); 173 | t1.words[j + 2] = *(mshabal_u32 *)((char *)&nonce3 + o); 174 | t1.words[j + 3] = *(mshabal_u32 *)((char *)&nonce4 + o); 175 | t1.words[j + 4] = *(mshabal_u32 *)((char *)&nonce5 + o); 176 | t1.words[j + 5] = *(mshabal_u32 *)((char *)&nonce6 + o); 177 | t1.words[j + 6] = *(mshabal_u32 *)((char *)&nonce7 + o); 178 | t1.words[j + 7] = *(mshabal_u32 *)((char *)&nonce8 + o); 179 | t1.words[j + 8] = *(mshabal_u32 *)((char *)&nonce9 + o); 180 | t1.words[j + 9] = *(mshabal_u32 *)((char *)&nonce10 + o); 181 | t1.words[j + 10] = *(mshabal_u32 *)((char *)&nonce11 + o); 182 | t1.words[j + 11] = *(mshabal_u32 *)((char *)&nonce12 + o); 183 | t1.words[j + 12] = *(mshabal_u32 *)((char *)&nonce13 + o); 184 | t1.words[j + 13] = *(mshabal_u32 *)((char *)&nonce14 + o); 185 | t1.words[j + 14] = *(mshabal_u32 *)((char *)&nonce15 + o); 186 | t1.words[j + 15] = *(mshabal_u32 *)((char *)&nonce16 + o); 187 | 188 | t2.words[j + 0 + 128] = *(mshabal_u32 *)((char *)&nonce1 + o); 189 | t2.words[j + 1 + 128] = *(mshabal_u32 *)((char *)&nonce2 + o); 190 | t2.words[j + 2 + 128] = *(mshabal_u32 *)((char *)&nonce3 + o); 191 | t2.words[j + 3 + 128] = *(mshabal_u32 *)((char *)&nonce4 + o); 192 | t2.words[j + 4 + 128] = *(mshabal_u32 *)((char *)&nonce5 + o); 193 | t2.words[j + 5 + 128] = *(mshabal_u32 *)((char *)&nonce6 + o); 194 | t2.words[j + 6 + 128] = *(mshabal_u32 *)((char *)&nonce7 + o); 195 | t2.words[j + 7 + 128] = *(mshabal_u32 *)((char *)&nonce8 + o); 196 | t2.words[j + 8 + 128] = *(mshabal_u32 *)((char *)&nonce9 + o); 197 | t2.words[j + 9 + 128] = *(mshabal_u32 *)((char *)&nonce10 + o); 198 | t2.words[j + 10 + 128] = *(mshabal_u32 *)((char *)&nonce11 + o); 199 | t2.words[j + 11 + 128] = *(mshabal_u32 *)((char *)&nonce12 + o); 200 | t2.words[j + 12 + 128] = *(mshabal_u32 *)((char *)&nonce13 + o); 201 | t2.words[j + 13 + 128] = *(mshabal_u32 *)((char *)&nonce14 + o); 202 | t2.words[j + 14 + 128] = *(mshabal_u32 *)((char *)&nonce15 + o); 203 | t2.words[j + 15 + 128] = *(mshabal_u32 *)((char *)&nonce16 + o); 204 | } 205 | 206 | // start shabal round 207 | 208 | // 3 cases: first 128 rounds uses case 1 or 2, after that case 3 209 | // case 1: first 128 rounds, hashes are even: use termination string 1 210 | // case 2: first 128 rounds, hashes are odd: use termination string 2 211 | // case 3: round > 128: use termination string 3 212 | 213 | // round 1 214 | memcpy(&local_512_fast, &global_512_fast, 215 | sizeof(global_512_fast)); // fast initialize shabal 216 | 217 | mshabal_hash_fast_avx512f( 218 | &local_512_fast, NULL, &t1, 219 | &buffer[MSHABAL512_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 16 >> 6); 220 | 221 | // store first hash into smart termination string 2 (data is vectored and SIMD aligned) 222 | memcpy(&t2, &buffer[MSHABAL512_VECTOR_SIZE * (NONCE_SIZE - HASH_SIZE)], 223 | MSHABAL512_VECTOR_SIZE * (HASH_SIZE)); 224 | 225 | // round 2 - 128 226 | for (size_t i = NONCE_SIZE - HASH_SIZE; i > (NONCE_SIZE - HASH_CAP); i -= HASH_SIZE) { 227 | // check if msg can be divided into 512bit packages without a 228 | // remainder 229 | if (i % 64 == 0) { 230 | // last msg = seed + termination 231 | mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE], 232 | &t1, 233 | &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE], 234 | (NONCE_SIZE + 16 - i) >> 6); 235 | } else { 236 | // last msg = 256 bit data + seed + termination 237 | mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE], 238 | &t2, 239 | &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE], 240 | (NONCE_SIZE + 16 - i) >> 6); 241 | } 242 | } 243 | 244 | // round 128-8192 245 | for (size_t i = NONCE_SIZE - HASH_CAP; i > 0; i -= HASH_SIZE) { 246 | mshabal_hash_fast_avx512f(&local_512_fast, &buffer[i * MSHABAL512_VECTOR_SIZE], &t3, 247 | &buffer[(i - HASH_SIZE) * MSHABAL512_VECTOR_SIZE], 248 | (HASH_CAP) >> 6); 249 | } 250 | 251 | // generate final hash 252 | mshabal_hash_fast_avx512f(&local_512_fast, &buffer[0], &t1, &final[0], 253 | (NONCE_SIZE + 16) >> 6); 254 | 255 | // XOR using SIMD 256 | // load final hash 257 | __m512i F[8]; 258 | for (int j = 0; j < 8; j++) F[j] = _mm512_loadu_si512((__m512i *)final + j); 259 | // xor all hashes with final hash 260 | for (int j = 0; j < 8 * 2 * HASH_CAP; j++) 261 | _mm512_storeu_si512( 262 | (__m512i *)buffer + j, 263 | _mm512_xor_si512(_mm512_loadu_si512((__m512i *)buffer + j), F[j % 8])); 264 | 265 | // todo: fork SIMD aligned plot file here 266 | // simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache 267 | for (int i = 0; i < NUM_SCOOPS * 2; i++) { 268 | for (int j = 0; j < 32; j += 4) { 269 | for (int k = 0; k < MSHABAL512_VECTOR_SIZE; k += 1) { 270 | memcpy(&cache[((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) * 271 | SCOOP_SIZE * cache_size + 272 | (n + k + chunk_offset) * SCOOP_SIZE + (i & 1) * 32 + j], 273 | &buffer[(i * 32 + j) * MSHABAL512_VECTOR_SIZE + k * 4], 4); 274 | } 275 | } 276 | } 277 | 278 | n += 16; 279 | } else { 280 | // if less than 16 nonces left, use 1d-shabal 281 | int8_t *xv = (int8_t *)&numeric_id; 282 | 283 | for (size_t i = 0; i < 8; i++) buffer[NONCE_SIZE + i] = xv[7 - i]; 284 | 285 | nonce = local_startnonce + n; 286 | xv = (int8_t *)&nonce; 287 | 288 | for (size_t i = 8; i < 16; i++) buffer[NONCE_SIZE + i] = xv[15 - i]; 289 | 290 | for (size_t i = NONCE_SIZE; i > 0; i -= HASH_SIZE) { 291 | memcpy(&local_32, &global_32, sizeof(global_32)); 292 | ; 293 | if (i < NONCE_SIZE + 16 - HASH_CAP) 294 | len = HASH_CAP; 295 | else 296 | len = NONCE_SIZE + 16 - i; 297 | 298 | sph_shabal256(&local_32, &buffer[i], len); 299 | sph_shabal256_close(&local_32, &buffer[i - HASH_SIZE]); 300 | } 301 | 302 | memcpy(&local_32, &global_32, sizeof(global_32)); 303 | sph_shabal256(&local_32, buffer, 16 + NONCE_SIZE); 304 | sph_shabal256_close(&local_32, final); 305 | 306 | // XOR with final 307 | for (size_t i = 0; i < NONCE_SIZE; i++) buffer[i] ^= (final[i % HASH_SIZE]); 308 | 309 | // Sort them PoC2: 310 | for (size_t i = 0; i < HASH_CAP; i++){ 311 | memmove(&cache[i * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE], &buffer[i * SCOOP_SIZE], HASH_SIZE); 312 | memmove(&cache[(4095-i) * cache_size * SCOOP_SIZE + (n + chunk_offset) * SCOOP_SIZE + 32], &buffer[i * SCOOP_SIZE + 32], HASH_SIZE); 313 | } 314 | n++; 315 | } 316 | } 317 | free(buffer); 318 | free(final); 319 | } 320 | -------------------------------------------------------------------------------- /src/ocl/kernel.cl: -------------------------------------------------------------------------------- 1 | #ifdef cl_clang_storage_class_specifiers 2 | #pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable 3 | #endif 4 | typedef unsigned int sph_u32; 5 | 6 | #define SPH_C32(x) ((sph_u32)(x ## U)) 7 | #define SPH_T32(x) (as_uint(x)) 8 | #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) 9 | #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) 10 | 11 | #define SPH_C64(x) ((sph_u64)(x ## UL)) 12 | #define SPH_T64(x) (as_ulong(x)) 13 | #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) 14 | #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) 15 | 16 | /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ 17 | /* 18 | * Shabal implementation. 19 | * 20 | * ==========================(LICENSE BEGIN)============================ 21 | * 22 | * Copyright (c) 2007-2010 Projet RNRT SAPHIR 23 | * 24 | * Permission is hereby granted, free of charge, to any person obtaining 25 | * a copy of this software and associated documentation files (the 26 | * "Software"), to deal in the Software without restriction, including 27 | * without limitation the rights to use, copy, modify, merge, publish, 28 | * distribute, sublicense, and/or sell copies of the Software, and to 29 | * permit persons to whom the Software is furnished to do so, subject to 30 | * the following conditions: 31 | * 32 | * The above copyright notice and this permission notice shall be 33 | * included in all copies or substantial portions of the Software. 34 | * 35 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 36 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 37 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 38 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 39 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 40 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 41 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 42 | * 43 | * ===========================(LICENSE END)============================= 44 | * 45 | * @author Thomas Pornin 46 | */ 47 | 48 | /* 49 | * Part of this code was automatically generated (the part between 50 | * the "BEGIN" and "END" markers). 51 | */ 52 | 53 | #define sM 16 54 | 55 | #define C32 SPH_C32 56 | #define T32 SPH_T32 57 | 58 | #define O1 13 59 | #define O2 9 60 | #define O3 6 61 | 62 | /* 63 | * We copy the state into local variables, so that the compiler knows 64 | * that it can optimize them at will. 65 | */ 66 | 67 | /* BEGIN -- automatically generated code. */ 68 | 69 | #define INPUT_BLOCK_ADD do { \ 70 | B0 = T32(B0 + M0); \ 71 | B1 = T32(B1 + M1); \ 72 | B2 = T32(B2 + M2); \ 73 | B3 = T32(B3 + M3); \ 74 | B4 = T32(B4 + M4); \ 75 | B5 = T32(B5 + M5); \ 76 | B6 = T32(B6 + M6); \ 77 | B7 = T32(B7 + M7); \ 78 | B8 = T32(B8 + M8); \ 79 | B9 = T32(B9 + M9); \ 80 | BA = T32(BA + MA); \ 81 | BB = T32(BB + MB); \ 82 | BC = T32(BC + MC); \ 83 | BD = T32(BD + MD); \ 84 | BE = T32(BE + ME); \ 85 | BF = T32(BF + MF); \ 86 | } while (0) 87 | 88 | #define INPUT_BLOCK_SUB do { \ 89 | C0 = T32(C0 - M0); \ 90 | C1 = T32(C1 - M1); \ 91 | C2 = T32(C2 - M2); \ 92 | C3 = T32(C3 - M3); \ 93 | C4 = T32(C4 - M4); \ 94 | C5 = T32(C5 - M5); \ 95 | C6 = T32(C6 - M6); \ 96 | C7 = T32(C7 - M7); \ 97 | C8 = T32(C8 - M8); \ 98 | C9 = T32(C9 - M9); \ 99 | CA = T32(CA - MA); \ 100 | CB = T32(CB - MB); \ 101 | CC = T32(CC - MC); \ 102 | CD = T32(CD - MD); \ 103 | CE = T32(CE - ME); \ 104 | CF = T32(CF - MF); \ 105 | } while (0) 106 | 107 | #define XOR_W do { \ 108 | A00 ^= Wlow; \ 109 | A01 ^= Whigh; \ 110 | } while (0) 111 | 112 | #define SWAP(v1, v2) do { \ 113 | sph_u32 tmp = (v1); \ 114 | (v1) = (v2); \ 115 | (v2) = tmp; \ 116 | } while (0) 117 | 118 | #define SWAP_BC do { \ 119 | SWAP(B0, C0); \ 120 | SWAP(B1, C1); \ 121 | SWAP(B2, C2); \ 122 | SWAP(B3, C3); \ 123 | SWAP(B4, C4); \ 124 | SWAP(B5, C5); \ 125 | SWAP(B6, C6); \ 126 | SWAP(B7, C7); \ 127 | SWAP(B8, C8); \ 128 | SWAP(B9, C9); \ 129 | SWAP(BA, CA); \ 130 | SWAP(BB, CB); \ 131 | SWAP(BC, CC); \ 132 | SWAP(BD, CD); \ 133 | SWAP(BE, CE); \ 134 | SWAP(BF, CF); \ 135 | } while (0) 136 | 137 | #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ 138 | xa0 = T32((xa0 \ 139 | ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ 140 | ^ xc) * 3U) \ 141 | ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ 142 | xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ 143 | } while (0) 144 | 145 | #define PERM_STEP_0 do { \ 146 | PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ 147 | PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ 148 | PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ 149 | PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ 150 | PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ 151 | PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ 152 | PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ 153 | PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ 154 | PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ 155 | PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ 156 | PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ 157 | PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ 158 | PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ 159 | PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ 160 | PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ 161 | PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ 162 | } while (0) 163 | 164 | #define PERM_STEP_1 do { \ 165 | PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ 166 | PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ 167 | PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ 168 | PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ 169 | PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ 170 | PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ 171 | PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ 172 | PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ 173 | PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ 174 | PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ 175 | PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ 176 | PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ 177 | PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ 178 | PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ 179 | PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ 180 | PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ 181 | } while (0) 182 | 183 | #define PERM_STEP_2 do { \ 184 | PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ 185 | PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ 186 | PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ 187 | PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ 188 | PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ 189 | PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ 190 | PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ 191 | PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ 192 | PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ 193 | PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ 194 | PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ 195 | PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ 196 | PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ 197 | PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ 198 | PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ 199 | PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ 200 | } while (0) 201 | 202 | #define APPLY_P do { \ 203 | B0 = T32(B0 << 17) | (B0 >> 15); \ 204 | B1 = T32(B1 << 17) | (B1 >> 15); \ 205 | B2 = T32(B2 << 17) | (B2 >> 15); \ 206 | B3 = T32(B3 << 17) | (B3 >> 15); \ 207 | B4 = T32(B4 << 17) | (B4 >> 15); \ 208 | B5 = T32(B5 << 17) | (B5 >> 15); \ 209 | B6 = T32(B6 << 17) | (B6 >> 15); \ 210 | B7 = T32(B7 << 17) | (B7 >> 15); \ 211 | B8 = T32(B8 << 17) | (B8 >> 15); \ 212 | B9 = T32(B9 << 17) | (B9 >> 15); \ 213 | BA = T32(BA << 17) | (BA >> 15); \ 214 | BB = T32(BB << 17) | (BB >> 15); \ 215 | BC = T32(BC << 17) | (BC >> 15); \ 216 | BD = T32(BD << 17) | (BD >> 15); \ 217 | BE = T32(BE << 17) | (BE >> 15); \ 218 | BF = T32(BF << 17) | (BF >> 15); \ 219 | PERM_STEP_0; \ 220 | PERM_STEP_1; \ 221 | PERM_STEP_2; \ 222 | A0B = T32(A0B + C6); \ 223 | A0A = T32(A0A + C5); \ 224 | A09 = T32(A09 + C4); \ 225 | A08 = T32(A08 + C3); \ 226 | A07 = T32(A07 + C2); \ 227 | A06 = T32(A06 + C1); \ 228 | A05 = T32(A05 + C0); \ 229 | A04 = T32(A04 + CF); \ 230 | A03 = T32(A03 + CE); \ 231 | A02 = T32(A02 + CD); \ 232 | A01 = T32(A01 + CC); \ 233 | A00 = T32(A00 + CB); \ 234 | A0B = T32(A0B + CA); \ 235 | A0A = T32(A0A + C9); \ 236 | A09 = T32(A09 + C8); \ 237 | A08 = T32(A08 + C7); \ 238 | A07 = T32(A07 + C6); \ 239 | A06 = T32(A06 + C5); \ 240 | A05 = T32(A05 + C4); \ 241 | A04 = T32(A04 + C3); \ 242 | A03 = T32(A03 + C2); \ 243 | A02 = T32(A02 + C1); \ 244 | A01 = T32(A01 + C0); \ 245 | A00 = T32(A00 + CF); \ 246 | A0B = T32(A0B + CE); \ 247 | A0A = T32(A0A + CD); \ 248 | A09 = T32(A09 + CC); \ 249 | A08 = T32(A08 + CB); \ 250 | A07 = T32(A07 + CA); \ 251 | A06 = T32(A06 + C9); \ 252 | A05 = T32(A05 + C8); \ 253 | A04 = T32(A04 + C7); \ 254 | A03 = T32(A03 + C6); \ 255 | A02 = T32(A02 + C5); \ 256 | A01 = T32(A01 + C4); \ 257 | A00 = T32(A00 + C3); \ 258 | } while (0) 259 | 260 | #define INCR_W do { \ 261 | if ((Wlow = T32(Wlow + 1)) == 0) \ 262 | Whigh = T32(Whigh + 1); \ 263 | } while (0) 264 | 265 | __constant static const sph_u32 A_init_192[] = { 266 | C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), 267 | C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), 268 | C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) 269 | }; 270 | 271 | __constant static const sph_u32 B_init_192[] = { 272 | C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), 273 | C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), 274 | C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), 275 | C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) 276 | }; 277 | 278 | __constant static const sph_u32 C_init_192[] = { 279 | C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), 280 | C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), 281 | C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), 282 | C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) 283 | }; 284 | 285 | __constant static const sph_u32 A_init_224[] = { 286 | C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), 287 | C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), 288 | C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) 289 | }; 290 | 291 | __constant static const sph_u32 B_init_224[] = { 292 | C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), 293 | C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), 294 | C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), 295 | C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) 296 | }; 297 | 298 | __constant static const sph_u32 C_init_224[] = { 299 | C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), 300 | C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), 301 | C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), 302 | C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) 303 | }; 304 | 305 | __constant static const sph_u32 A_init_256[] = { 306 | C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), 307 | C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), 308 | C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) 309 | }; 310 | 311 | __constant static const sph_u32 B_init_256[] = { 312 | C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), 313 | C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), 314 | C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), 315 | C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) 316 | }; 317 | 318 | __constant static const sph_u32 C_init_256[] = { 319 | C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), 320 | C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), 321 | C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), 322 | C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) 323 | }; 324 | 325 | __constant static const sph_u32 A_init_384[] = { 326 | C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), 327 | C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), 328 | C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) 329 | }; 330 | 331 | __constant static const sph_u32 B_init_384[] = { 332 | C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), 333 | C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), 334 | C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), 335 | C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) 336 | }; 337 | 338 | __constant static const sph_u32 C_init_384[] = { 339 | C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), 340 | C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), 341 | C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), 342 | C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) 343 | }; 344 | 345 | __constant static const sph_u32 A_init_512[] = { 346 | C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), 347 | C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), 348 | C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) 349 | }; 350 | 351 | __constant static const sph_u32 B_init_512[] = { 352 | C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), 353 | C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), 354 | C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), 355 | C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) 356 | }; 357 | 358 | __constant static const sph_u32 C_init_512[] = { 359 | C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), 360 | C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), 361 | C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), 362 | C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) 363 | }; 364 | 365 | /* END -- automatically generated code. */ 366 | 367 | #define NONCES_VECTOR 16 368 | #define NONCES_VECTOR_LOG2 4 369 | #define MESSAGE_CAP 64 370 | #define NUM_HASHES 8192 371 | #define HASH_SIZE_WORDS 8 372 | #define NONCE_SIZE_WORDS HASH_SIZE_WORDS * NUM_HASHES 373 | 374 | #define EndianSwap(n) (rotate(n & 0x00FF00FF, 24UL)|(rotate(n, 8UL) & 0x00FF00FF)) 375 | 376 | #define EndianSwap64(n) bitselect( \ 377 | bitselect(rotate(n, 24UL), \ 378 | rotate(n, 8UL), 0x000000FF000000FFUL), \ 379 | bitselect(rotate(n, 56UL), \ 380 | rotate(n, 40UL), 0x00FF000000FF0000UL), \ 381 | 0xFFFF0000FFFF0000UL) 382 | 383 | #define Address(nonce,hash,word) ((nonce >> NONCES_VECTOR_LOG2) * NONCES_VECTOR * NONCE_SIZE_WORDS + (hash) * NONCES_VECTOR * HASH_SIZE_WORDS + word * NONCES_VECTOR + (nonce & (NONCES_VECTOR-1))) 384 | //#define Address(nonce,hash,word) (nonce * NONCE_SIZE_WORDS + (hash) * HASH_SIZE_WORDS + word) 385 | 386 | /* Johnny's optimised nonce calculation kernel 387 | * based on the implementation found in BRS 388 | */ 389 | __kernel void calculate_nonces(__global unsigned char* buffer, unsigned long startnonce, unsigned long numeric_id_be, int start, int end, unsigned long nonces) { 390 | //if (gid==0) {printf("\n\nOCL 2 %lu\n\n",startnonce);} DEBUG 391 | int gid = get_global_id(0); 392 | 393 | if (gid >= nonces) 394 | return; 395 | // number of shabal message round 396 | int num; 397 | // buffer for final hash 398 | sph_u32 B8,B9,BA,BB,BC,BD,BE,BF; 399 | // init 400 | unsigned long nonce_be = EndianSwap64(startnonce + gid); 401 | // run 8192 rounds + final round 402 | for (int hash = NUM_HASHES - start; hash > -1 + NUM_HASHES - end; hash -= 1) { 403 | // calculate number of shabal messages excl. final message 404 | num = (NUM_HASHES - hash) >> 1; 405 | if (hash != 0) { 406 | num = (num > MESSAGE_CAP) ? MESSAGE_CAP : num; 407 | } 408 | 409 | // init shabal 410 | sph_u32 411 | A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], 412 | A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], 413 | A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; 414 | sph_u32 415 | B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], 416 | B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7]; 417 | B8 = B_init_256[8]; B9 = B_init_256[9]; BA = B_init_256[10]; BB = B_init_256[11]; 418 | BC = B_init_256[12]; BD = B_init_256[13]; BE = B_init_256[14]; BF = B_init_256[15]; 419 | sph_u32 420 | C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], 421 | C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], 422 | C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], 423 | CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; 424 | sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; 425 | sph_u32 Wlow = 1, Whigh = 0; 426 | 427 | for (int i = 0; i < 2 * num; i+=2){ 428 | M0 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 0)]; 429 | M1 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 1)]; 430 | M2 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 2)]; 431 | M3 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 3)]; 432 | M4 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 4)]; 433 | M5 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 5)]; 434 | M6 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 6)]; 435 | M7 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 7)]; 436 | M8 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 0)]; 437 | M9 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 1)]; 438 | MA = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 2)]; 439 | MB = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 3)]; 440 | MC = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 4)]; 441 | MD = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 5)]; 442 | ME = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 6)]; 443 | MF = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 7)]; 444 | 445 | INPUT_BLOCK_ADD; 446 | XOR_W; 447 | APPLY_P; 448 | INPUT_BLOCK_SUB; 449 | SWAP_BC; 450 | INCR_W; 451 | } 452 | 453 | // final message determination 454 | if (num == MESSAGE_CAP) { 455 | M0 = 0x80; 456 | M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; 457 | } 458 | else if((hash & 1) == 0) { 459 | M0 = ((unsigned int*)&numeric_id_be)[0]; 460 | M1 = ((unsigned int*)&numeric_id_be)[1]; 461 | M2 = ((unsigned int*)&nonce_be)[0]; 462 | M3 = ((unsigned int*)&nonce_be)[1]; 463 | M4 = 0x80; 464 | M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; 465 | } 466 | else if((hash & 1) == 1) { 467 | M0 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 0)]; 468 | M1 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 1)]; 469 | M2 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 2)]; 470 | M3 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 3)]; 471 | M4 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 4)]; 472 | M5 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 5)]; 473 | M6 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 6)]; 474 | M7 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 7)]; 475 | M8 = ((unsigned int*)&numeric_id_be)[0]; 476 | M9 = ((unsigned int*)&numeric_id_be)[1]; 477 | MA = ((unsigned int*)&nonce_be)[0]; 478 | MB = ((unsigned int*)&nonce_be)[1]; 479 | MC = 0x80; 480 | MD = ME = MF = 0; 481 | } 482 | 483 | INPUT_BLOCK_ADD; 484 | XOR_W; 485 | APPLY_P; 486 | for (int i = 0; i < 3; i ++) { 487 | SWAP_BC; 488 | XOR_W; 489 | APPLY_P; 490 | } 491 | 492 | if (hash > 0){ 493 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 0)] = B8; 494 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 1)] = B9; 495 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 2)] = BA; 496 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 3)] = BB; 497 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 4)] = BC; 498 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 5)] = BD; 499 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 6)] = BE; 500 | ((__global unsigned int*)buffer)[Address(gid, hash-1, 7)] = BF; 501 | } 502 | } 503 | 504 | // final xor 505 | if(end==8192){ 506 | for (size_t i = 0; i < NUM_HASHES; i++){ 507 | ((__global unsigned int*)buffer)[Address(gid, i, 0)] ^= B8; 508 | ((__global unsigned int*)buffer)[Address(gid, i, 1)] ^= B9; 509 | ((__global unsigned int*)buffer)[Address(gid, i, 2)] ^= BA; 510 | ((__global unsigned int*)buffer)[Address(gid, i, 3)] ^= BB; 511 | ((__global unsigned int*)buffer)[Address(gid, i, 4)] ^= BC; 512 | ((__global unsigned int*)buffer)[Address(gid, i, 5)] ^= BD; 513 | ((__global unsigned int*)buffer)[Address(gid, i, 6)] ^= BE; 514 | ((__global unsigned int*)buffer)[Address(gid, i, 7)] ^= BF; 515 | } 516 | } 517 | } -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | [[package]] 4 | name = "addr2line" 5 | version = "0.15.2" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "e7a2e47a1fbe209ee101dd6d61285226744c6c8d3c21c8dc878ba6cb9f467f3a" 8 | dependencies = [ 9 | "gimli", 10 | ] 11 | 12 | [[package]] 13 | name = "adler" 14 | version = "1.0.2" 15 | source = "registry+https://github.com/rust-lang/crates.io-index" 16 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 17 | 18 | [[package]] 19 | name = "aligned_alloc" 20 | version = "0.1.3" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "9dcebfb002ccde769c15bc841d0d5548a90e80fcd2ffed5131339e8074746f0a" 23 | dependencies = [ 24 | "kernel32-sys", 25 | "libc", 26 | "winapi 0.2.8", 27 | ] 28 | 29 | [[package]] 30 | name = "ansi_term" 31 | version = "0.11.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" 34 | dependencies = [ 35 | "winapi 0.3.9", 36 | ] 37 | 38 | [[package]] 39 | name = "atty" 40 | version = "0.2.14" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 43 | dependencies = [ 44 | "hermit-abi", 45 | "libc", 46 | "winapi 0.3.9", 47 | ] 48 | 49 | [[package]] 50 | name = "autocfg" 51 | version = "1.0.1" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 54 | 55 | [[package]] 56 | name = "backtrace" 57 | version = "0.3.60" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "b7815ea54e4d821e791162e078acbebfd6d8c8939cd559c9335dceb1c8ca7282" 60 | dependencies = [ 61 | "addr2line", 62 | "cc", 63 | "cfg-if 1.0.0", 64 | "libc", 65 | "miniz_oxide", 66 | "object", 67 | "rustc-demangle", 68 | ] 69 | 70 | [[package]] 71 | name = "bitflags" 72 | version = "1.2.1" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 75 | 76 | [[package]] 77 | name = "cc" 78 | version = "1.0.68" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" 81 | 82 | [[package]] 83 | name = "cfg-if" 84 | version = "0.1.10" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 87 | 88 | [[package]] 89 | name = "cfg-if" 90 | version = "1.0.0" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 93 | 94 | [[package]] 95 | name = "cl-sys" 96 | version = "0.4.2" 97 | source = "registry+https://github.com/rust-lang/crates.io-index" 98 | checksum = "e8573fa3ff8acd6c49e8e113296c54277e82376b96c6ca6307848632cce38e44" 99 | dependencies = [ 100 | "libc", 101 | ] 102 | 103 | [[package]] 104 | name = "clap" 105 | version = "2.33.3" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" 108 | dependencies = [ 109 | "ansi_term", 110 | "atty", 111 | "bitflags", 112 | "strsim", 113 | "textwrap", 114 | "unicode-width", 115 | "vec_map", 116 | ] 117 | 118 | [[package]] 119 | name = "core_affinity" 120 | version = "0.5.10" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "7f8a03115cc34fb0d7c321dd154a3914b3ca082ccc5c11d91bf7117dbbe7171f" 123 | dependencies = [ 124 | "kernel32-sys", 125 | "libc", 126 | "num_cpus", 127 | "winapi 0.2.8", 128 | ] 129 | 130 | [[package]] 131 | name = "crossbeam-channel" 132 | version = "0.3.9" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "c8ec7fcd21571dc78f96cc96243cab8d8f035247c3efd16c687be154c3fa9efa" 135 | dependencies = [ 136 | "crossbeam-utils 0.6.6", 137 | ] 138 | 139 | [[package]] 140 | name = "crossbeam-channel" 141 | version = "0.5.1" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 144 | dependencies = [ 145 | "cfg-if 1.0.0", 146 | "crossbeam-utils 0.8.5", 147 | ] 148 | 149 | [[package]] 150 | name = "crossbeam-deque" 151 | version = "0.8.0" 152 | source = "registry+https://github.com/rust-lang/crates.io-index" 153 | checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" 154 | dependencies = [ 155 | "cfg-if 1.0.0", 156 | "crossbeam-epoch", 157 | "crossbeam-utils 0.8.5", 158 | ] 159 | 160 | [[package]] 161 | name = "crossbeam-epoch" 162 | version = "0.9.5" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 165 | dependencies = [ 166 | "cfg-if 1.0.0", 167 | "crossbeam-utils 0.8.5", 168 | "lazy_static", 169 | "memoffset", 170 | "scopeguard", 171 | ] 172 | 173 | [[package]] 174 | name = "crossbeam-utils" 175 | version = "0.6.6" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" 178 | dependencies = [ 179 | "cfg-if 0.1.10", 180 | "lazy_static", 181 | ] 182 | 183 | [[package]] 184 | name = "crossbeam-utils" 185 | version = "0.8.5" 186 | source = "registry+https://github.com/rust-lang/crates.io-index" 187 | checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" 188 | dependencies = [ 189 | "cfg-if 1.0.0", 190 | "lazy_static", 191 | ] 192 | 193 | [[package]] 194 | name = "either" 195 | version = "1.6.1" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 198 | 199 | [[package]] 200 | name = "engraver" 201 | version = "2.5.0" 202 | dependencies = [ 203 | "aligned_alloc", 204 | "cc", 205 | "cfg-if 0.1.10", 206 | "clap", 207 | "core_affinity", 208 | "crossbeam-channel 0.3.9", 209 | "fs2", 210 | "humanize-rs", 211 | "libc", 212 | "ocl-core", 213 | "page_size", 214 | "pbr", 215 | "raw-cpuid", 216 | "rayon", 217 | "rust-crypto", 218 | "stopwatch", 219 | "sys-info", 220 | "thread-priority", 221 | "winapi 0.3.9", 222 | ] 223 | 224 | [[package]] 225 | name = "enum_primitive" 226 | version = "0.1.1" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "be4551092f4d519593039259a9ed8daedf0da12e5109c5280338073eaeb81180" 229 | dependencies = [ 230 | "num-traits 0.1.43", 231 | ] 232 | 233 | [[package]] 234 | name = "failure" 235 | version = "0.1.8" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" 238 | dependencies = [ 239 | "backtrace", 240 | "failure_derive", 241 | ] 242 | 243 | [[package]] 244 | name = "failure_derive" 245 | version = "0.1.8" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" 248 | dependencies = [ 249 | "proc-macro2", 250 | "quote", 251 | "syn", 252 | "synstructure", 253 | ] 254 | 255 | [[package]] 256 | name = "fs2" 257 | version = "0.4.3" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" 260 | dependencies = [ 261 | "libc", 262 | "winapi 0.3.9", 263 | ] 264 | 265 | [[package]] 266 | name = "fuchsia-cprng" 267 | version = "0.1.1" 268 | source = "registry+https://github.com/rust-lang/crates.io-index" 269 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" 270 | 271 | [[package]] 272 | name = "gcc" 273 | version = "0.3.55" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" 276 | 277 | [[package]] 278 | name = "gimli" 279 | version = "0.24.0" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "0e4075386626662786ddb0ec9081e7c7eeb1ba31951f447ca780ef9f5d568189" 282 | 283 | [[package]] 284 | name = "hermit-abi" 285 | version = "0.1.18" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c" 288 | dependencies = [ 289 | "libc", 290 | ] 291 | 292 | [[package]] 293 | name = "humanize-rs" 294 | version = "0.1.5" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "016b02deb8b0c415d8d56a6f0ab265e50c22df61194e37f9be75ed3a722de8a6" 297 | 298 | [[package]] 299 | name = "kernel32-sys" 300 | version = "0.2.2" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" 303 | dependencies = [ 304 | "winapi 0.2.8", 305 | "winapi-build", 306 | ] 307 | 308 | [[package]] 309 | name = "lazy_static" 310 | version = "1.4.0" 311 | source = "registry+https://github.com/rust-lang/crates.io-index" 312 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 313 | 314 | [[package]] 315 | name = "libc" 316 | version = "0.2.96" 317 | source = "registry+https://github.com/rust-lang/crates.io-index" 318 | checksum = "5600b4e6efc5421841a2138a6b082e07fe12f9aaa12783d50e5d13325b26b4fc" 319 | 320 | [[package]] 321 | name = "memchr" 322 | version = "2.4.0" 323 | source = "registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" 325 | 326 | [[package]] 327 | name = "memoffset" 328 | version = "0.6.4" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 331 | dependencies = [ 332 | "autocfg", 333 | ] 334 | 335 | [[package]] 336 | name = "miniz_oxide" 337 | version = "0.4.4" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 340 | dependencies = [ 341 | "adler", 342 | "autocfg", 343 | ] 344 | 345 | [[package]] 346 | name = "num" 347 | version = "0.1.42" 348 | source = "registry+https://github.com/rust-lang/crates.io-index" 349 | checksum = "4703ad64153382334aa8db57c637364c322d3372e097840c72000dabdcf6156e" 350 | dependencies = [ 351 | "num-bigint", 352 | "num-complex", 353 | "num-integer", 354 | "num-iter", 355 | "num-rational", 356 | "num-traits 0.2.14", 357 | ] 358 | 359 | [[package]] 360 | name = "num-bigint" 361 | version = "0.1.44" 362 | source = "registry+https://github.com/rust-lang/crates.io-index" 363 | checksum = "e63899ad0da84ce718c14936262a41cee2c79c981fc0a0e7c7beb47d5a07e8c1" 364 | dependencies = [ 365 | "num-integer", 366 | "num-traits 0.2.14", 367 | "rand 0.4.6", 368 | "rustc-serialize", 369 | ] 370 | 371 | [[package]] 372 | name = "num-complex" 373 | version = "0.1.43" 374 | source = "registry+https://github.com/rust-lang/crates.io-index" 375 | checksum = "b288631d7878aaf59442cffd36910ea604ecd7745c36054328595114001c9656" 376 | dependencies = [ 377 | "num-traits 0.2.14", 378 | "rustc-serialize", 379 | ] 380 | 381 | [[package]] 382 | name = "num-integer" 383 | version = "0.1.44" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" 386 | dependencies = [ 387 | "autocfg", 388 | "num-traits 0.2.14", 389 | ] 390 | 391 | [[package]] 392 | name = "num-iter" 393 | version = "0.1.42" 394 | source = "registry+https://github.com/rust-lang/crates.io-index" 395 | checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" 396 | dependencies = [ 397 | "autocfg", 398 | "num-integer", 399 | "num-traits 0.2.14", 400 | ] 401 | 402 | [[package]] 403 | name = "num-rational" 404 | version = "0.1.42" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "ee314c74bd753fc86b4780aa9475da469155f3848473a261d2d18e35245a784e" 407 | dependencies = [ 408 | "num-bigint", 409 | "num-integer", 410 | "num-traits 0.2.14", 411 | "rustc-serialize", 412 | ] 413 | 414 | [[package]] 415 | name = "num-traits" 416 | version = "0.1.43" 417 | source = "registry+https://github.com/rust-lang/crates.io-index" 418 | checksum = "92e5113e9fd4cc14ded8e499429f396a20f98c772a47cc8622a736e1ec843c31" 419 | dependencies = [ 420 | "num-traits 0.2.14", 421 | ] 422 | 423 | [[package]] 424 | name = "num-traits" 425 | version = "0.2.14" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 428 | dependencies = [ 429 | "autocfg", 430 | ] 431 | 432 | [[package]] 433 | name = "num_cpus" 434 | version = "1.13.0" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 437 | dependencies = [ 438 | "hermit-abi", 439 | "libc", 440 | ] 441 | 442 | [[package]] 443 | name = "object" 444 | version = "0.25.2" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "f8bc1d42047cf336f0f939c99e97183cf31551bf0f2865a2ec9c8d91fd4ffb5e" 447 | dependencies = [ 448 | "memchr", 449 | ] 450 | 451 | [[package]] 452 | name = "ocl-core" 453 | version = "0.11.2" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "81bc628faf959b5e07b1251252926dfe0dd1b3f2709cef8998c97936ddbdaa74" 456 | dependencies = [ 457 | "bitflags", 458 | "cl-sys", 459 | "enum_primitive", 460 | "failure", 461 | "num-complex", 462 | "num-traits 0.2.14", 463 | "ocl-core-vector", 464 | "rustc_version 0.1.7", 465 | ] 466 | 467 | [[package]] 468 | name = "ocl-core-vector" 469 | version = "0.1.0" 470 | source = "registry+https://github.com/rust-lang/crates.io-index" 471 | checksum = "b4072920739958adeec5abedec51af70febc58f7fff0601aaa0827c1f3c8fefd" 472 | dependencies = [ 473 | "num", 474 | ] 475 | 476 | [[package]] 477 | name = "page_size" 478 | version = "0.4.2" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" 481 | dependencies = [ 482 | "libc", 483 | "winapi 0.3.9", 484 | ] 485 | 486 | [[package]] 487 | name = "pbr" 488 | version = "1.0.4" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2" 491 | dependencies = [ 492 | "crossbeam-channel 0.5.1", 493 | "libc", 494 | "time", 495 | "winapi 0.3.9", 496 | ] 497 | 498 | [[package]] 499 | name = "proc-macro2" 500 | version = "1.0.27" 501 | source = "registry+https://github.com/rust-lang/crates.io-index" 502 | checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" 503 | dependencies = [ 504 | "unicode-xid", 505 | ] 506 | 507 | [[package]] 508 | name = "quote" 509 | version = "1.0.9" 510 | source = "registry+https://github.com/rust-lang/crates.io-index" 511 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" 512 | dependencies = [ 513 | "proc-macro2", 514 | ] 515 | 516 | [[package]] 517 | name = "rand" 518 | version = "0.3.23" 519 | source = "registry+https://github.com/rust-lang/crates.io-index" 520 | checksum = "64ac302d8f83c0c1974bf758f6b041c6c8ada916fbb44a609158ca8b064cc76c" 521 | dependencies = [ 522 | "libc", 523 | "rand 0.4.6", 524 | ] 525 | 526 | [[package]] 527 | name = "rand" 528 | version = "0.4.6" 529 | source = "registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" 531 | dependencies = [ 532 | "fuchsia-cprng", 533 | "libc", 534 | "rand_core 0.3.1", 535 | "rdrand", 536 | "winapi 0.3.9", 537 | ] 538 | 539 | [[package]] 540 | name = "rand_core" 541 | version = "0.3.1" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" 544 | dependencies = [ 545 | "rand_core 0.4.2", 546 | ] 547 | 548 | [[package]] 549 | name = "rand_core" 550 | version = "0.4.2" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" 553 | 554 | [[package]] 555 | name = "raw-cpuid" 556 | version = "6.1.0" 557 | source = "registry+https://github.com/rust-lang/crates.io-index" 558 | checksum = "30a9d219c32c9132f7be513c18be77c9881c7107d2ab5569d205a6a0f0e6dc7d" 559 | dependencies = [ 560 | "bitflags", 561 | "cc", 562 | "rustc_version 0.2.3", 563 | ] 564 | 565 | [[package]] 566 | name = "rayon" 567 | version = "1.5.1" 568 | source = "registry+https://github.com/rust-lang/crates.io-index" 569 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 570 | dependencies = [ 571 | "autocfg", 572 | "crossbeam-deque", 573 | "either", 574 | "rayon-core", 575 | ] 576 | 577 | [[package]] 578 | name = "rayon-core" 579 | version = "1.9.1" 580 | source = "registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 582 | dependencies = [ 583 | "crossbeam-channel 0.5.1", 584 | "crossbeam-deque", 585 | "crossbeam-utils 0.8.5", 586 | "lazy_static", 587 | "num_cpus", 588 | ] 589 | 590 | [[package]] 591 | name = "rdrand" 592 | version = "0.4.0" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" 595 | dependencies = [ 596 | "rand_core 0.3.1", 597 | ] 598 | 599 | [[package]] 600 | name = "rust-crypto" 601 | version = "0.2.36" 602 | source = "registry+https://github.com/rust-lang/crates.io-index" 603 | checksum = "f76d05d3993fd5f4af9434e8e436db163a12a9d40e1a58a726f27a01dfd12a2a" 604 | dependencies = [ 605 | "gcc", 606 | "libc", 607 | "rand 0.3.23", 608 | "rustc-serialize", 609 | "time", 610 | ] 611 | 612 | [[package]] 613 | name = "rustc-demangle" 614 | version = "0.1.19" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "410f7acf3cb3a44527c5d9546bad4bf4e6c460915d5f9f2fc524498bfe8f70ce" 617 | 618 | [[package]] 619 | name = "rustc-serialize" 620 | version = "0.3.24" 621 | source = "registry+https://github.com/rust-lang/crates.io-index" 622 | checksum = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda" 623 | 624 | [[package]] 625 | name = "rustc_version" 626 | version = "0.1.7" 627 | source = "registry+https://github.com/rust-lang/crates.io-index" 628 | checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" 629 | dependencies = [ 630 | "semver 0.1.20", 631 | ] 632 | 633 | [[package]] 634 | name = "rustc_version" 635 | version = "0.2.3" 636 | source = "registry+https://github.com/rust-lang/crates.io-index" 637 | checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" 638 | dependencies = [ 639 | "semver 0.9.0", 640 | ] 641 | 642 | [[package]] 643 | name = "scopeguard" 644 | version = "1.1.0" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 647 | 648 | [[package]] 649 | name = "semver" 650 | version = "0.1.20" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" 653 | 654 | [[package]] 655 | name = "semver" 656 | version = "0.9.0" 657 | source = "registry+https://github.com/rust-lang/crates.io-index" 658 | checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" 659 | dependencies = [ 660 | "semver-parser", 661 | ] 662 | 663 | [[package]] 664 | name = "semver-parser" 665 | version = "0.7.0" 666 | source = "registry+https://github.com/rust-lang/crates.io-index" 667 | checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" 668 | 669 | [[package]] 670 | name = "stopwatch" 671 | version = "0.0.7" 672 | source = "registry+https://github.com/rust-lang/crates.io-index" 673 | checksum = "3d04b5ebc78da44d3a456319d8bc2783e7d8cc7ccbb5cb4dc3f54afbd93bf728" 674 | dependencies = [ 675 | "num", 676 | ] 677 | 678 | [[package]] 679 | name = "strsim" 680 | version = "0.8.0" 681 | source = "registry+https://github.com/rust-lang/crates.io-index" 682 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 683 | 684 | [[package]] 685 | name = "syn" 686 | version = "1.0.73" 687 | source = "registry+https://github.com/rust-lang/crates.io-index" 688 | checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" 689 | dependencies = [ 690 | "proc-macro2", 691 | "quote", 692 | "unicode-xid", 693 | ] 694 | 695 | [[package]] 696 | name = "synstructure" 697 | version = "0.12.4" 698 | source = "registry+https://github.com/rust-lang/crates.io-index" 699 | checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" 700 | dependencies = [ 701 | "proc-macro2", 702 | "quote", 703 | "syn", 704 | "unicode-xid", 705 | ] 706 | 707 | [[package]] 708 | name = "sys-info" 709 | version = "0.5.6" 710 | source = "registry+https://github.com/rust-lang/crates.io-index" 711 | checksum = "617f594d3869801871433390254b4a79f2a18176d7f4ad5784fa990bc8c12986" 712 | dependencies = [ 713 | "cc", 714 | "libc", 715 | ] 716 | 717 | [[package]] 718 | name = "textwrap" 719 | version = "0.11.0" 720 | source = "registry+https://github.com/rust-lang/crates.io-index" 721 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 722 | dependencies = [ 723 | "unicode-width", 724 | ] 725 | 726 | [[package]] 727 | name = "thread-priority" 728 | version = "0.1.1" 729 | source = "registry+https://github.com/rust-lang/crates.io-index" 730 | checksum = "52c084e908948709a7f7f6d44b5368e0134aa322e0e569431a92c989bf855188" 731 | dependencies = [ 732 | "libc", 733 | ] 734 | 735 | [[package]] 736 | name = "time" 737 | version = "0.1.44" 738 | source = "registry+https://github.com/rust-lang/crates.io-index" 739 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 740 | dependencies = [ 741 | "libc", 742 | "wasi", 743 | "winapi 0.3.9", 744 | ] 745 | 746 | [[package]] 747 | name = "unicode-width" 748 | version = "0.1.8" 749 | source = "registry+https://github.com/rust-lang/crates.io-index" 750 | checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" 751 | 752 | [[package]] 753 | name = "unicode-xid" 754 | version = "0.2.2" 755 | source = "registry+https://github.com/rust-lang/crates.io-index" 756 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 757 | 758 | [[package]] 759 | name = "vec_map" 760 | version = "0.8.2" 761 | source = "registry+https://github.com/rust-lang/crates.io-index" 762 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 763 | 764 | [[package]] 765 | name = "wasi" 766 | version = "0.10.0+wasi-snapshot-preview1" 767 | source = "registry+https://github.com/rust-lang/crates.io-index" 768 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 769 | 770 | [[package]] 771 | name = "winapi" 772 | version = "0.2.8" 773 | source = "registry+https://github.com/rust-lang/crates.io-index" 774 | checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" 775 | 776 | [[package]] 777 | name = "winapi" 778 | version = "0.3.9" 779 | source = "registry+https://github.com/rust-lang/crates.io-index" 780 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 781 | dependencies = [ 782 | "winapi-i686-pc-windows-gnu", 783 | "winapi-x86_64-pc-windows-gnu", 784 | ] 785 | 786 | [[package]] 787 | name = "winapi-build" 788 | version = "0.1.1" 789 | source = "registry+https://github.com/rust-lang/crates.io-index" 790 | checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" 791 | 792 | [[package]] 793 | name = "winapi-i686-pc-windows-gnu" 794 | version = "0.4.0" 795 | source = "registry+https://github.com/rust-lang/crates.io-index" 796 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 797 | 798 | [[package]] 799 | name = "winapi-x86_64-pc-windows-gnu" 800 | version = "0.4.0" 801 | source = "registry+https://github.com/rust-lang/crates.io-index" 802 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 803 | --------------------------------------------------------------------------------