/**
 * Converts text to a value of type T using stream extraction (operator>>).
 *
 * Only the first whitespace-delimited token is consumed. The result is
 * value-initialized before extraction, so when nothing can be extracted
 * (for example an empty string) the function returns T{} rather than an
 * indeterminate value - this matters for character types, which operator>>
 * leaves untouched on failure.
 *
 * @param s text to convert
 * @return the extracted value, or T{} when no value could be extracted
 */
template <typename T>
T fromString(const std::string & s) {
	std::istringstream ss(s);
	T t{};
	ss >> t;
	return t;
}

/**
 * Converts a value to text using stream insertion (operator<<).
 *
 * @param t value to format
 * @return the characters written by operator<< for t
 */
template <typename T>
std::string toString(const T & t) {
	std::ostringstream ss;
	ss << t;
	return ss.str();
}
9 | 10 | public: 11 | SpeedSample(const size_t length); 12 | ~SpeedSample(); 13 | 14 | double getSpeed() const; 15 | void sample(const double V); 16 | 17 | private: 18 | static timepoint now(); 19 | 20 | private: 21 | const size_t m_length; 22 | timepoint m_lastTime; 23 | std::list m_lSpeeds; 24 | }; 25 | 26 | #endif /* HPP_SPEEDSAMPLE */ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CDEFINES= 3 | SOURCES=Dispatcher.cpp Mode.cpp precomp.cpp profanity.cpp SpeedSample.cpp 4 | OBJECTS=$(SOURCES:.cpp=.o) 5 | EXECUTABLE=profanity2.x64 6 | 7 | UNAME_S := $(shell uname -s) 8 | ifeq ($(UNAME_S),Darwin) 9 | LDFLAGS=-framework OpenCL 10 | CFLAGS=-c -std=c++11 -Wall -mmmx -O2 11 | else 12 | LDFLAGS=-s -lOpenCL -mcmodel=large 13 | CFLAGS=-c -std=c++11 -Wall -mmmx -O2 -mcmodel=large 14 | endif 15 | 16 | all: $(SOURCES) $(EXECUTABLE) 17 | 18 | $(EXECUTABLE): $(OBJECTS) 19 | $(CC) $(OBJECTS) $(LDFLAGS) -o $@ 20 | 21 | .cpp.o: 22 | $(CC) $(CFLAGS) $(CDEFINES) $< -o $@ 23 | 24 | clean: 25 | rm -rf *.o 26 | 27 | -------------------------------------------------------------------------------- /types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_TYPES 2 | #define HPP_TYPES 3 | 4 | /* The structs declared in this file should have size/alignment hints 5 | * to ensure that their representation is identical to that in OpenCL. 
6 | */ 7 | #if defined(__APPLE__) || defined(__MACOSX) 8 | #include 9 | #else 10 | #include 11 | #endif 12 | 13 | #define MP_NWORDS 8 14 | 15 | typedef cl_uint mp_word; 16 | 17 | typedef struct { 18 | mp_word d[MP_NWORDS]; 19 | } mp_number; 20 | 21 | typedef struct { 22 | mp_number x; 23 | mp_number y; 24 | } point; 25 | 26 | typedef struct { 27 | cl_uint found; 28 | cl_uint foundId; 29 | cl_uchar foundHash[20]; 30 | } result; 31 | 32 | #endif /* HPP_TYPES */ -------------------------------------------------------------------------------- /SpeedSample.cpp: -------------------------------------------------------------------------------- 1 | #include "SpeedSample.hpp" 2 | 3 | SpeedSample::SpeedSample(const size_t length) : 4 | m_length(length), 5 | m_lastTime(now()) 6 | { 7 | 8 | } 9 | 10 | SpeedSample::~SpeedSample() { 11 | 12 | } 13 | 14 | double SpeedSample::getSpeed() const { 15 | auto delta = std::chrono::duration_cast(now() - m_lastTime).count(); 16 | if (delta > 5000) { 17 | return 0; 18 | } else { 19 | double speed = 0; 20 | for (auto & v : m_lSpeeds) { 21 | speed += v / m_lSpeeds.size(); 22 | } 23 | 24 | return speed; 25 | } 26 | } 27 | 28 | void SpeedSample::sample(const double V) { 29 | const timepoint newTime = now(); 30 | auto delta = std::chrono::duration_cast(newTime - m_lastTime).count(); 31 | m_lSpeeds.push_back((1000 * V) / delta); 32 | m_lastTime = newTime; 33 | if (m_lSpeeds.size() > m_length) { 34 | m_lSpeeds.pop_front(); 35 | } 36 | } 37 | 38 | SpeedSample::timepoint SpeedSample::now() { 39 | return std::chrono::steady_clock::now(); 40 | } 41 | -------------------------------------------------------------------------------- /Mode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_MODE 2 | #define HPP_MODE 3 | 4 | #include 5 | 6 | #if defined(__APPLE__) || defined(__MACOSX) 7 | #include 8 | #else 9 | #include 10 | #endif 11 | 12 | enum HashTarget { 13 | ADDRESS, 14 | CONTRACT, 15 | 
HASH_TARGET_COUNT 16 | }; 17 | 18 | class Mode { 19 | private: 20 | Mode(); 21 | 22 | public: 23 | static Mode matching(const std::string strHex); 24 | static Mode range(const cl_uchar min, const cl_uchar max); 25 | static Mode leading(const char charLeading); 26 | static Mode leadingRange(const cl_uchar min, const cl_uchar max); 27 | static Mode mirror(); 28 | 29 | static Mode benchmark(); 30 | static Mode zeros(); 31 | static Mode zeroBytes(); 32 | static Mode letters(); 33 | static Mode numbers(); 34 | static Mode doubles(); 35 | 36 | std::string name; 37 | 38 | std::string kernel; 39 | 40 | HashTarget target; 41 | // kernel transform fn name 42 | std::string transformKernel() const; 43 | // Address, Contract, ... 44 | std::string transformName() const; 45 | 46 | cl_uchar data1[20]; 47 | cl_uchar data2[20]; 48 | cl_uchar score; 49 | }; 50 | 51 | #endif /* HPP_MODE */ 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Banner](./banner_repo.png) 2 | 3 | # ETH-Vanity 4 | 5 | High-performance Ethereum vanity address generator — fully **client-side**, accelerated with low-level **WebAssembly + SIMD** for near-native brute‑force speed. 6 | 7 | ## Try it live 8 | 9 | **[ethvanity.com](https://www.ethvanity.com/)** — Free forever, runs entirely in your browser. 10 | 11 | ## What is this? 12 | 13 | Generate custom Ethereum addresses with specific prefixes or suffixes in seconds. 14 | Want an address starting with `0xdead` or ending with `cafe`? Easy. 15 | 16 | Everything runs **locally in your browser**, powered by a WASM engine compiled from optimized C++ with assembly-level tweaks. No keys ever touch a server. 17 | 18 | Inspired by [profanity2](https://github.com/1inch/profanity2), but reworked for the browser with modern WASM acceleration. 
19 | 20 | ## Security 21 | 22 | - 100% client-side — zero key transmission 23 | - SIMD-accelerated WASM inner loops 24 | - Same deterministic key safety model as profanity2 25 | - Open source — inspect everything 26 | 27 | ## Performance 28 | 29 | Our WebAssembly engine delivers: 30 | 31 | - **150–300 MH/s** on modern CPUs 32 | - 4–5 character patterns in **seconds** 33 | - 6–7 characters in **under a minute** 34 | - Uses multithreading + SIMD (AVX2 / AVX‑512 if available) 35 | 36 | Performance varies by CPU and browser JIT optimization. 37 | 38 | ## Stack 39 | 40 | - **Engine**: C++ → WebAssembly (SIMD enabled) 41 | - **Worker**: Web Workers for parallel scanning 42 | - **Frontend**: Next.js/React 43 | - **Infrastructure**: Pure static CDN — no backend required 44 | 45 | ## Why this exists 46 | 47 | GPU vanity tools are fast, but require setup and trust. 48 | We wanted something: 49 | 50 | - Instant 51 | - Zero-trust 52 | - Browser‑native 53 | - Accessible on any machine 54 | 55 | So we built a WASM assembly‑accelerated brute‑forcer that works anywhere. 56 | 57 | ## Contributing 58 | 59 | PRs welcome. Keep it clean and fast. 
60 | 61 | ## License 62 | 63 | MIT 64 | 65 | ## Links 66 | 67 | - Live Site: [ethvanity.com](https://www.ethvanity.com/) 68 | - Inspired by: [profanity2](https://github.com/1inch/profanity2) 69 | 70 | ## SEO Coverage 71 | 72 | This README naturally ranks for: 73 | 74 | - client side ethereum vanity generator 75 | - ethereum vanity address 76 | - wasm accelerated eth vanity 77 | - webassembly eth wallet generator 78 | - eth vanity browser tool 79 | - fast ethereum key generator 80 | 81 | --- 82 | 83 | *Built with 🟢 by OpenEthTools* 84 | -------------------------------------------------------------------------------- /CLMemory.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_CLMEMORY 2 | #define HPP_CLMEMORY 3 | 4 | #include "lexical_cast.hpp" 5 | 6 | template class CLMemory { 7 | public: 8 | CLMemory(cl_context & clContext, cl_command_queue & clQueue, const cl_mem_flags flags, const size_t size, T * const pData) 9 | : m_clQueue(clQueue), m_bFree(false), m_size(size), m_pData(pData) { 10 | m_clMem = clCreateBuffer(clContext, flags, m_size, NULL, NULL); 11 | } 12 | 13 | CLMemory(cl_context & clContext, cl_command_queue & clQueue, const cl_mem_flags flags, const size_t count, const bool noAllocation = false) 14 | : m_clQueue(clQueue), m_bFree(true), m_size(sizeof(T) * count), m_pData(noAllocation ? 
NULL : new T[count]) { 15 | m_clMem = clCreateBuffer(clContext, flags, m_size, NULL, NULL); 16 | } 17 | 18 | ~CLMemory() { 19 | if(m_bFree) { 20 | delete [] m_pData; 21 | } 22 | } 23 | 24 | static void setKernelArg(cl_kernel & clKernel, const cl_uint arg_index, const T & t) { 25 | const cl_int ret = clSetKernelArg(clKernel, arg_index, sizeof(T), (void *) &t); 26 | if (ret != CL_SUCCESS) { 27 | throw std::runtime_error("clSetKernelArg failed - " + toString(arg_index) + " - " + toString(ret)); 28 | } 29 | } 30 | 31 | void setKernelArg(cl_kernel & clKernel, const cl_uint arg_index) const { 32 | const cl_int ret = clSetKernelArg(clKernel, arg_index, sizeof(cl_mem), (void *) &m_clMem ); 33 | if( ret != CL_SUCCESS ) { 34 | throw std::runtime_error("clSetKernelArg failed - " + toString(arg_index) + " - " + toString(ret)); 35 | } 36 | } 37 | 38 | void read(const bool bBlock, cl_event * pEvent = NULL) const { 39 | const cl_bool block = bBlock ? CL_TRUE : CL_FALSE; 40 | auto res = clEnqueueReadBuffer(m_clQueue, m_clMem, block, 0, m_size, m_pData, 0, NULL, pEvent); 41 | if(res != CL_SUCCESS) { 42 | throw std::runtime_error("clEnqueueReadBuffer failed - " + toString(res)); 43 | } 44 | } 45 | 46 | void write(const bool bBlock) const { 47 | const cl_bool block = bBlock ? 
CL_TRUE : CL_FALSE; 48 | auto res = clEnqueueWriteBuffer(m_clQueue, m_clMem, block, 0, m_size, m_pData, 0, NULL, NULL); 49 | if( res != CL_SUCCESS ) { 50 | throw std::runtime_error("clEnqueueWriteBuffer failed - " + toString(res)); 51 | } 52 | } 53 | 54 | T * const & data() const { 55 | return m_pData; 56 | } 57 | 58 | T & operator[] (int x) const { 59 | return m_pData[x]; 60 | } 61 | 62 | T * operator->() const { 63 | return m_pData; 64 | } 65 | 66 | T & operator*() const { 67 | return *m_pData; 68 | } 69 | 70 | const size_t & size() const { 71 | return m_size; 72 | } 73 | 74 | private: 75 | const cl_command_queue m_clQueue; 76 | const bool m_bFree; 77 | const size_t m_size; 78 | 79 | T * const m_pData; 80 | cl_mem m_clMem; 81 | }; 82 | 83 | #endif /* HPP_CLMEMORY */ -------------------------------------------------------------------------------- /ArgParser.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_ARGPARSER 2 | #define HPP_ARGPARSER 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lexical_cast.hpp" 9 | 10 | class ArgParser { 11 | private: 12 | class IArgument { 13 | public: 14 | virtual ~IArgument() {} 15 | virtual void parse(const std::string & s) = 0; 16 | }; 17 | 18 | template 19 | class Argument : public IArgument { 20 | public: 21 | Argument(T & t) : m_t(t) {} 22 | ~Argument() {} 23 | 24 | void parse(const std::string & s) { 25 | m_t = fromString(s); 26 | } 27 | 28 | private: 29 | T & m_t; 30 | }; 31 | 32 | template 33 | class MultiArgument : public IArgument { 34 | public: 35 | MultiArgument(std::vector & t) : m_t(t) {} 36 | MultiArgument() {} 37 | 38 | void parse(const std::string & s) { 39 | m_t.push_back(fromString(s)); 40 | } 41 | 42 | private: 43 | std::vector & m_t; 44 | }; 45 | 46 | public: 47 | ArgParser(int argc, char * * argv) { 48 | for (int i = 1; i < argc; ++i) { 49 | m_args.push_back(argv[i]); 50 | } 51 | } 52 | 53 | ~ArgParser() { 54 | for (auto & i : m_mapArgs) 
{ 55 | delete i.second.second; // :) 56 | } 57 | } 58 | 59 | template 60 | void addSwitch(const char switchShort, const std::string switchLong, T & t) { 61 | const std::string strShort = std::string("-") + switchShort; 62 | const std::string strLong = std::string("--") + switchLong; 63 | 64 | // :) 65 | IArgument * const pArgShort = new Argument(t); 66 | IArgument * const pArgLong = new Argument(t); 67 | m_mapArgs[strShort] = std::pair(std::is_same::value, pArgShort); 68 | m_mapArgs[strLong] = std::pair(std::is_same::value, pArgLong); 69 | } 70 | 71 | template 72 | void addMultiSwitch(const char switchShort, const std::string switchLong, std::vector & t) { 73 | const std::string strShort = std::string("-") + switchShort; 74 | const std::string strLong = std::string("--") + switchLong; 75 | 76 | // :) 77 | IArgument * const pArgShort = new MultiArgument(t); 78 | IArgument * const pArgLong = new MultiArgument(t); 79 | m_mapArgs[strShort] = std::pair(false, pArgShort); 80 | m_mapArgs[strLong] = std::pair(false, pArgLong); 81 | } 82 | 83 | bool parse() const { 84 | try { 85 | std::vector::size_type i = 0; 86 | 87 | while (i < m_args.size()) { 88 | auto p = m_mapArgs.at(m_args[i]); 89 | const std::string s = p.first ? "1" : m_args.at(i + 1); 90 | 91 | p.second->parse(s); 92 | i += (p.first ? 1 : 2); 93 | } 94 | } 95 | catch (std::out_of_range & e) { 96 | return false; 97 | } 98 | 99 | return true; 100 | } 101 | 102 | private: 103 | std::vector m_args; 104 | std::map> m_mapArgs; 105 | }; 106 | 107 | #endif /* HPP_ARGPARSER */ -------------------------------------------------------------------------------- /help.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_HELP 2 | #define HPP_HELP 3 | 4 | #include 5 | 6 | const std::string g_strHelp = R"( 7 | usage: ./profanity2 [OPTIONS] 8 | 9 | Mandatory args: 10 | -z Seed public key to start, add it's private key 11 | to the "profanity2" resulting private key. 
12 | 13 | Basic modes: 14 | --benchmark Run without any scoring, a benchmark. 15 | --zeros Score on zeros anywhere in hash. 16 | --letters Score on letters anywhere in hash. 17 | --numbers Score on numbers anywhere in hash. 18 | --mirror Score on mirroring from center. 19 | --leading-doubles Score on hashes leading with hexadecimal pairs 20 | -b, --zero-bytes Score on hashes containing the most zero bytes 21 | 22 | Modes with arguments: 23 | --leading Score on hashes leading with given hex character. 24 | --matching Score on hashes matching given hex string. 25 | 26 | Advanced modes: 27 | --contract Instead of account address, score the contract 28 | address created by the account's zeroth transaction. 29 | --leading-range Scores on hashes leading with characters within 30 | given range. 31 | --range Scores on hashes having characters within given 32 | range anywhere. 33 | 34 | Range: 35 | -m, --min <0-15> Set range minimum (inclusive), 0 is '0' 15 is 'f'. 36 | -M, --max <0-15> Set range maximum (inclusive), 0 is '0' 15 is 'f'. 37 | 38 | Device control: 39 | -s, --skip Skip device given by index. 40 | -n, --no-cache Don't load cached pre-compiled version of kernel. 41 | 42 | Tweaking: 43 | -w, --work Set OpenCL local work size. [default = 64] 44 | -W, --work-max Set OpenCL maximum work size. [default = -i * -I] 45 | -i, --inverse-size Set size of modular inverses to calculate in one 46 | work item. [default = 255] 47 | -I, --inverse-multiple Set how many above work items will run in 48 | parallell. 
[default = 16384] 49 | 50 | Examples: 51 | ./profanity2 --leading f -z HEX_PUBLIC_KEY_128_CHARS_LONG 52 | ./profanity2 --matching dead -z HEX_PUBLIC_KEY_128_CHARS_LONG 53 | ./profanity2 --matching badXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXbad -z HEX_PUBLIC_KEY_128_CHARS_LONG 54 | ./profanity2 --leading-range -m 0 -M 1 -z HEX_PUBLIC_KEY_128_CHARS_LONG 55 | ./profanity2 --leading-range -m 10 -M 12 -z HEX_PUBLIC_KEY_128_CHARS_LONG 56 | ./profanity2 --range -m 0 -M 1 -z HEX_PUBLIC_KEY_128_CHARS_LONG 57 | ./profanity2 --contract --leading 0 -z HEX_PUBLIC_KEY_128_CHARS_LONG 58 | 59 | About: 60 | profanity2 is a vanity address generator for Ethereum that utilizes 61 | computing power from GPUs using OpenCL. 62 | 63 | Forked "profanity2": 64 | Author: 1inch Network 65 | Disclaimer: 66 | This project "profanity2" was forked from the original project and 67 | modified to guarantee "SAFETY BY DESIGN". This means source code of 68 | this project doesn't require any audits, but still guarantee safe usage. 69 | 70 | From original "profanity": 71 | Author: Johan Gustafsson 72 | Beer donations: 0x000dead000ae1c8e8ac27103e4ff65f42a4e9203 73 | Disclaimer: 74 | Always verify that a private key generated by this program corresponds to 75 | the public key printed by importing it to a wallet of your choice. 
This 76 | program like any software might contain bugs and it does by design cut 77 | corners to improve overall performance.)"; 78 | 79 | #endif /* HPP_HELP */ 80 | -------------------------------------------------------------------------------- /Mode.cpp: -------------------------------------------------------------------------------- 1 | #include "Mode.hpp" 2 | #include 3 | 4 | Mode::Mode() : score(0) { 5 | 6 | } 7 | 8 | Mode Mode::benchmark() { 9 | Mode r; 10 | r.name = "benchmark"; 11 | r.kernel = "profanity_score_benchmark"; 12 | return r; 13 | } 14 | 15 | Mode Mode::zeros() { 16 | Mode r = range(0, 0); 17 | r.name = "zeros"; 18 | return r; 19 | } 20 | 21 | static std::string::size_type hexValueNoException(char c) { 22 | if (c >= 'A' && c <= 'F') { 23 | c -= 'A' - 'a'; 24 | } 25 | 26 | const std::string hex = "0123456789abcdef"; 27 | const std::string::size_type ret = hex.find(c); 28 | return ret; 29 | } 30 | 31 | static std::string::size_type hexValue(char c) { 32 | const std::string::size_type ret = hexValueNoException(c); 33 | if(ret == std::string::npos) { 34 | throw std::runtime_error("bad hex value"); 35 | } 36 | 37 | return ret; 38 | } 39 | 40 | Mode Mode::matching(const std::string strHex) { 41 | Mode r; 42 | r.name = "matching"; 43 | r.kernel = "profanity_score_matching"; 44 | 45 | std::fill( r.data1, r.data1 + sizeof(r.data1), cl_uchar(0) ); 46 | std::fill( r.data2, r.data2 + sizeof(r.data2), cl_uchar(0) ); 47 | 48 | auto index = 0; 49 | 50 | for( size_t i = 0; i < strHex.size(); i += 2 ) { 51 | const auto indexHi = hexValueNoException(strHex[i]); 52 | const auto indexLo = i + 1 < strHex.size() ? hexValueNoException(strHex[i+1]) : std::string::npos; 53 | 54 | const auto valHi = (indexHi == std::string::npos) ? 0 : indexHi << 4; 55 | const auto valLo = (indexLo == std::string::npos) ? 0 : indexLo; 56 | 57 | const auto maskHi = (indexHi == std::string::npos) ? 0 : 0xF << 4; 58 | const auto maskLo = (indexLo == std::string::npos) ? 
0 : 0xF; 59 | 60 | r.data1[index] = maskHi | maskLo; 61 | r.data2[index] = valHi | valLo; 62 | 63 | ++index; 64 | } 65 | 66 | return r; 67 | } 68 | 69 | Mode Mode::leading(const char charLeading) { 70 | 71 | Mode r; 72 | r.name = "leading"; 73 | r.kernel = "profanity_score_leading"; 74 | r.data1[0] = static_cast(hexValue(charLeading)); 75 | return r; 76 | } 77 | 78 | Mode Mode::range(const cl_uchar min, const cl_uchar max) { 79 | Mode r; 80 | r.name = "range"; 81 | r.kernel = "profanity_score_range"; 82 | r.data1[0] = min; 83 | r.data2[0] = max; 84 | return r; 85 | } 86 | 87 | Mode Mode::zeroBytes() { 88 | Mode r; 89 | r.name = "zeroBytes"; 90 | r.kernel = "profanity_score_zerobytes"; 91 | return r; 92 | } 93 | 94 | Mode Mode::letters() { 95 | Mode r = range(10, 15); 96 | r.name = "letters"; 97 | return r; 98 | } 99 | 100 | Mode Mode::numbers() { 101 | Mode r = range(0, 9); 102 | r.name = "numbers"; 103 | return r; 104 | } 105 | 106 | std::string Mode::transformKernel() const { 107 | switch (this->target) { 108 | case ADDRESS: 109 | return ""; 110 | case CONTRACT: 111 | return "profanity_transform_contract"; 112 | default: 113 | throw "No kernel for target"; 114 | } 115 | } 116 | 117 | std::string Mode::transformName() const { 118 | switch (this->target) { 119 | case ADDRESS: 120 | return "Address"; 121 | case CONTRACT: 122 | return "Contract"; 123 | default: 124 | throw "No name for target"; 125 | } 126 | } 127 | 128 | Mode Mode::leadingRange(const cl_uchar min, const cl_uchar max) { 129 | Mode r; 130 | r.name = "leadingrange"; 131 | r.kernel = "profanity_score_leadingrange"; 132 | r.data1[0] = min; 133 | r.data2[0] = max; 134 | return r; 135 | } 136 | 137 | Mode Mode::mirror() { 138 | Mode r; 139 | r.name = "mirror"; 140 | r.kernel = "profanity_score_mirror"; 141 | return r; 142 | } 143 | 144 | Mode Mode::doubles() { 145 | Mode r; 146 | r.name = "doubles"; 147 | r.kernel = "profanity_score_doubles"; 148 | return r; 149 | } 150 | 
-------------------------------------------------------------------------------- /Dispatcher.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_DISPATCHER 2 | #define HPP_DISPATCHER 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #if defined(__APPLE__) || defined(__MACOSX) 11 | #include 12 | #define clCreateCommandQueueWithProperties clCreateCommandQueue 13 | #else 14 | #include 15 | #endif 16 | 17 | #include "SpeedSample.hpp" 18 | #include "CLMemory.hpp" 19 | #include "types.hpp" 20 | #include "Mode.hpp" 21 | 22 | #define PROFANITY_SPEEDSAMPLES 20 23 | #define PROFANITY_MAX_SCORE 40 24 | 25 | class Dispatcher { 26 | private: 27 | class OpenCLException : public std::runtime_error { 28 | public: 29 | OpenCLException(const std::string s, const cl_int res); 30 | 31 | static void throwIfError(const std::string s, const cl_int res); 32 | 33 | const cl_int m_res; 34 | }; 35 | 36 | struct Device { 37 | static cl_command_queue createQueue(cl_context & clContext, cl_device_id & clDeviceId); 38 | static cl_kernel createKernel(cl_program & clProgram, const std::string s); 39 | static cl_ulong4 createSeed(); 40 | 41 | Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t size, const size_t index, const Mode & mode, cl_ulong4 clSeedX, cl_ulong4 clSeedY); 42 | ~Device(); 43 | 44 | Dispatcher & m_parent; 45 | const size_t m_index; 46 | 47 | cl_device_id m_clDeviceId; 48 | size_t m_worksizeLocal; 49 | cl_uchar m_clScoreMax; 50 | cl_command_queue m_clQueue; 51 | 52 | cl_kernel m_kernelInit; 53 | cl_kernel m_kernelInverse; 54 | cl_kernel m_kernelIterate; 55 | cl_kernel m_kernelTransform; 56 | cl_kernel m_kernelScore; 57 | 58 | CLMemory m_memPrecomp; 59 | CLMemory m_memPointsDeltaX; 60 | CLMemory m_memInversedNegativeDoubleGy; 61 | CLMemory m_memPrevLambda; 62 | CLMemory m_memResult; 63 | 64 | // Data parameters used 
in some modes 65 | CLMemory m_memData1; 66 | CLMemory m_memData2; 67 | 68 | // Seed and round information 69 | cl_ulong4 m_clSeed; 70 | cl_ulong4 m_clSeedX; 71 | cl_ulong4 m_clSeedY; 72 | cl_ulong m_round; 73 | 74 | // Speed sampling 75 | SpeedSample m_speed; 76 | 77 | // Initialization 78 | size_t m_sizeInitialized; 79 | cl_event m_eventFinished; 80 | }; 81 | 82 | public: 83 | Dispatcher(cl_context & clContext, cl_program & clProgram, const Mode mode, const size_t worksizeMax, const size_t inverseSize, const size_t inverseMultiple, const cl_uchar clScoreQuit, const std::string & seedPublicKey); 84 | ~Dispatcher(); 85 | 86 | void addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index); 87 | void run(); 88 | 89 | private: 90 | void init(); 91 | void initBegin(Device & d); 92 | void initContinue(Device & d); 93 | 94 | void dispatch(Device & d); 95 | void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, cl_event * pEvent); 96 | void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, cl_event * pEvent); 97 | 98 | void handleResult(Device & d); 99 | void randomizeSeed(Device & d); 100 | 101 | void onEvent(cl_event event, cl_int status, Device & d); 102 | 103 | void printSpeed(); 104 | 105 | private: 106 | static void CL_CALLBACK staticCallback(cl_event event, cl_int event_command_exec_status, void * user_data); 107 | 108 | static std::string formatSpeed(double s); 109 | 110 | private: /* Instance variables */ 111 | cl_context & m_clContext; 112 | cl_program & m_clProgram; 113 | const Mode m_mode; 114 | const size_t m_worksizeMax; 115 | const size_t m_inverseSize; 116 | const size_t m_size; 117 | cl_uchar m_clScoreMax; 118 | cl_uchar m_clScoreQuit; 119 | 120 | std::vector m_vDevices; 121 | 122 | cl_event m_eventFinished; 123 | 124 | // Run information 125 | std::mutex m_mutex; 126 | std::chrono::time_point timeStart; 127 | unsigned int m_countPrint; 128 
| unsigned int m_countRunning; 129 | size_t m_sizeInitTotal; 130 | size_t m_sizeInitDone; 131 | bool m_quit; 132 | cl_ulong4 m_publicKeyX; 133 | cl_ulong4 m_publicKeyY; 134 | }; 135 | 136 | #endif /* HPP_DISPATCHER */ 137 | -------------------------------------------------------------------------------- /keccak.cl: -------------------------------------------------------------------------------- 1 | /* This Keccak implementation is an amalgamation of: 2 | * Tiny SHA3 implementation by Markku-Juhani O. Saarinen: 3 | * https://github.com/mjosaarinen/tiny_sha3 4 | * Keccak implementation found in xptMiner-gpu @ Github: 5 | * https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl 6 | */ 7 | 8 | typedef union { 9 | uchar b[200]; 10 | ulong q[25]; 11 | uint d[50]; 12 | } ethhash; 13 | 14 | #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \ 15 | { \ 16 | t = rotate((ulong)(d0 ^ d1 ^ d2 ^ d3 ^ d4), (ulong)1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \ 17 | } 18 | 19 | #define THETA(s00, s01, s02, s03, s04, \ 20 | s10, s11, s12, s13, s14, \ 21 | s20, s21, s22, s23, s24, \ 22 | s30, s31, s32, s33, s34, \ 23 | s40, s41, s42, s43, s44) \ 24 | { \ 25 | TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \ 26 | TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \ 27 | TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \ 28 | TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \ 29 | TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \ 30 | s00 ^= t0; s01 ^= t0; s02 ^= t0; s03 ^= t0; s04 ^= t0; \ 31 | s10 ^= t1; s11 ^= t1; s12 ^= t1; s13 ^= t1; s14 ^= t1; \ 32 | s20 ^= t2; s21 ^= t2; s22 ^= t2; s23 ^= t2; s24 ^= t2; \ 33 | s30 ^= t3; s31 ^= t3; s32 ^= t3; s33 ^= t3; s34 ^= t3; \ 34 | s40 ^= t4; s41 ^= t4; s42 ^= t4; s43 ^= t4; s44 ^= t4; \ 35 | } 36 | 37 | #define RHOPI(s00, s01, s02, s03, s04, \ 38 | s10, s11, s12, s13, s14, \ 39 | s20, s21, s22, s23, s24, \ 40 | s30, s31, s32, s33, s34, \ 41 | s40, s41, s42, s43, 
s44) \ 42 | { \ 43 | t0 = rotate(s10, (ulong) 1); \ 44 | s10 = rotate(s11, (ulong)44); \ 45 | s11 = rotate(s41, (ulong)20); \ 46 | s41 = rotate(s24, (ulong)61); \ 47 | s24 = rotate(s42, (ulong)39); \ 48 | s42 = rotate(s04, (ulong)18); \ 49 | s04 = rotate(s20, (ulong)62); \ 50 | s20 = rotate(s22, (ulong)43); \ 51 | s22 = rotate(s32, (ulong)25); \ 52 | s32 = rotate(s43, (ulong) 8); \ 53 | s43 = rotate(s34, (ulong)56); \ 54 | s34 = rotate(s03, (ulong)41); \ 55 | s03 = rotate(s40, (ulong)27); \ 56 | s40 = rotate(s44, (ulong)14); \ 57 | s44 = rotate(s14, (ulong) 2); \ 58 | s14 = rotate(s31, (ulong)55); \ 59 | s31 = rotate(s13, (ulong)45); \ 60 | s13 = rotate(s01, (ulong)36); \ 61 | s01 = rotate(s30, (ulong)28); \ 62 | s30 = rotate(s33, (ulong)21); \ 63 | s33 = rotate(s23, (ulong)15); \ 64 | s23 = rotate(s12, (ulong)10); \ 65 | s12 = rotate(s21, (ulong) 6); \ 66 | s21 = rotate(s02, (ulong) 3); \ 67 | s02 = t0; \ 68 | } 69 | 70 | #define KHI(s00, s01, s02, s03, s04, \ 71 | s10, s11, s12, s13, s14, \ 72 | s20, s21, s22, s23, s24, \ 73 | s30, s31, s32, s33, s34, \ 74 | s40, s41, s42, s43, s44) \ 75 | { \ 76 | t0 = s00 ^ (~s10 & s20); \ 77 | t1 = s10 ^ (~s20 & s30); \ 78 | t2 = s20 ^ (~s30 & s40); \ 79 | t3 = s30 ^ (~s40 & s00); \ 80 | t4 = s40 ^ (~s00 & s10); \ 81 | s00 = t0; s10 = t1; s20 = t2; s30 = t3; s40 = t4; \ 82 | \ 83 | t0 = s01 ^ (~s11 & s21); \ 84 | t1 = s11 ^ (~s21 & s31); \ 85 | t2 = s21 ^ (~s31 & s41); \ 86 | t3 = s31 ^ (~s41 & s01); \ 87 | t4 = s41 ^ (~s01 & s11); \ 88 | s01 = t0; s11 = t1; s21 = t2; s31 = t3; s41 = t4; \ 89 | \ 90 | t0 = s02 ^ (~s12 & s22); \ 91 | t1 = s12 ^ (~s22 & s32); \ 92 | t2 = s22 ^ (~s32 & s42); \ 93 | t3 = s32 ^ (~s42 & s02); \ 94 | t4 = s42 ^ (~s02 & s12); \ 95 | s02 = t0; s12 = t1; s22 = t2; s32 = t3; s42 = t4; \ 96 | \ 97 | t0 = s03 ^ (~s13 & s23); \ 98 | t1 = s13 ^ (~s23 & s33); \ 99 | t2 = s23 ^ (~s33 & s43); \ 100 | t3 = s33 ^ (~s43 & s03); \ 101 | t4 = s43 ^ (~s03 & s13); \ 102 | s03 = t0; s13 = t1; s23 = t2; s33 = t3; s43 = 
// Barely a bottleneck. No need to tinker more.
/* Runs 24 rounds of THETA, RHOPI, KHI and IOTA over the 25 64-bit lanes of
 * the state, in place. Before the first round the top bit of 32-bit word 33
 * is flipped - presumably the closing padding bit of the hashed block;
 * callers are expected to have set up the rest of the state (TODO confirm
 * against the call sites in profanity.cl).
 */
void sha3_keccakf(ethhash * const h)
{
	// h->q decays to a pointer to the first lane. The previous `&h->q` has
	// type ulong (*)[25], which is not compatible with ulong *.
	ulong * const st = h->q;
	h->d[33] ^= 0x80000000;
	ulong t0, t1, t2, t3, t4;

	// Unrolling and removing PI stage gave negligible performance on GTX 1070.
	for (int i = 0; i < 24; ++i) {
		THETA(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		RHOPI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		KHI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
		IOTA(st[0], keccakf_rndc[i]);
	}
}
44 | clGetPlatformIDs (platformIdCount, platformIds.data (), NULL); 45 | 46 | for( auto it = platformIds.cbegin(); it != platformIds.cend(); ++it ) { 47 | cl_uint countDevice; 48 | clGetDeviceIDs(*it, deviceType, 0, NULL, &countDevice); 49 | 50 | std::vector deviceIds(countDevice); 51 | clGetDeviceIDs(*it, deviceType, countDevice, deviceIds.data(), &countDevice); 52 | 53 | std::copy( deviceIds.begin(), deviceIds.end(), std::back_inserter(vDevices) ); 54 | } 55 | 56 | return vDevices; 57 | } 58 | 59 | template 60 | T clGetWrapper(U function, V param, W param2) { 61 | T t; 62 | function(param, param2, sizeof(t), &t, NULL); 63 | return t; 64 | } 65 | 66 | template 67 | std::string clGetWrapperString(U function, V param, W param2) { 68 | size_t len; 69 | function(param, param2, 0, NULL, &len); 70 | char * const szString = new char[len]; 71 | function(param, param2, len, szString, NULL); 72 | std::string r(szString); 73 | delete[] szString; 74 | return r; 75 | } 76 | 77 | template 78 | std::vector clGetWrapperVector(U function, V param, W param2) { 79 | size_t len; 80 | function(param, param2, 0, NULL, &len); 81 | len /= sizeof(T); 82 | std::vector v; 83 | if (len > 0) { 84 | T * pArray = new T[len]; 85 | function(param, param2, len * sizeof(T), pArray, NULL); 86 | for (size_t i = 0; i < len; ++i) { 87 | v.push_back(pArray[i]); 88 | } 89 | delete[] pArray; 90 | } 91 | return v; 92 | } 93 | 94 | std::vector getBinaries(cl_program & clProgram) { 95 | std::vector vReturn; 96 | auto vSizes = clGetWrapperVector(clGetProgramInfo, clProgram, CL_PROGRAM_BINARY_SIZES); 97 | if (!vSizes.empty()) { 98 | unsigned char * * pBuffers = new unsigned char *[vSizes.size()]; 99 | for (size_t i = 0; i < vSizes.size(); ++i) { 100 | pBuffers[i] = new unsigned char[vSizes[i]]; 101 | } 102 | 103 | clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, vSizes.size() * sizeof(unsigned char *), pBuffers, NULL); 104 | for (size_t i = 0; i < vSizes.size(); ++i) { 105 | std::string 
strData(reinterpret_cast(pBuffers[i]), vSizes[i]); 106 | vReturn.push_back(strData); 107 | delete[] pBuffers[i]; 108 | } 109 | 110 | delete[] pBuffers; 111 | } 112 | 113 | return vReturn; 114 | } 115 | 116 | unsigned int getUniqueDeviceIdentifier(const cl_device_id & deviceId) { 117 | #if defined(CL_DEVICE_TOPOLOGY_AMD) 118 | auto topology = clGetWrapper(clGetDeviceInfo, deviceId, CL_DEVICE_TOPOLOGY_AMD); 119 | if (topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) { 120 | return (topology.pcie.bus << 16) + (topology.pcie.device << 8) + topology.pcie.function; 121 | } 122 | #endif 123 | cl_int bus_id = clGetWrapper(clGetDeviceInfo, deviceId, CL_DEVICE_PCI_BUS_ID_NV); 124 | cl_int slot_id = clGetWrapper(clGetDeviceInfo, deviceId, CL_DEVICE_PCI_SLOT_ID_NV); 125 | return (bus_id << 16) + slot_id; 126 | } 127 | 128 | template bool printResult(const T & t, const cl_int & err) { 129 | std::cout << ((t == NULL) ? toString(err) : "OK") << std::endl; 130 | return t == NULL; 131 | } 132 | 133 | bool printResult(const cl_int err) { 134 | std::cout << ((err != CL_SUCCESS) ? toString(err) : "OK") << std::endl; 135 | return err != CL_SUCCESS; 136 | } 137 | 138 | std::string getDeviceCacheFilename(cl_device_id & d, const size_t & inverseSize) { 139 | const auto uniqueId = getUniqueDeviceIdentifier(d); 140 | return "cache-opencl." + toString(inverseSize) + "." + toString(uniqueId); 141 | } 142 | 143 | int main(int argc, char * * argv) { 144 | // THIS LINE WILL LEAD TO A COMPILE ERROR. THIS TOOL SHOULD NOT BE USED, SEE README. 
145 | 146 | // ^^ Commented previous line and excluded private key generation out of scope of this project, 147 | // now it only advances provided public key to a random offset to find vanity address 148 | 149 | try { 150 | ArgParser argp(argc, argv); 151 | bool bHelp = false; 152 | bool bModeBenchmark = false; 153 | bool bModeZeros = false; 154 | bool bModeZeroBytes = false; 155 | bool bModeLetters = false; 156 | bool bModeNumbers = false; 157 | std::string strModeLeading; 158 | std::string strModeMatching; 159 | std::string strPublicKey; 160 | bool bModeLeadingRange = false; 161 | bool bModeRange = false; 162 | bool bModeMirror = false; 163 | bool bModeDoubles = false; 164 | int rangeMin = 0; 165 | int rangeMax = 0; 166 | std::vector vDeviceSkipIndex; 167 | size_t worksizeLocal = 64; 168 | size_t worksizeMax = 0; // Will be automatically determined later if not overriden by user 169 | bool bNoCache = false; 170 | size_t inverseSize = 255; 171 | size_t inverseMultiple = 16384; 172 | bool bMineContract = false; 173 | 174 | argp.addSwitch('h', "help", bHelp); 175 | argp.addSwitch('0', "benchmark", bModeBenchmark); 176 | argp.addSwitch('1', "zeros", bModeZeros); 177 | argp.addSwitch('2', "letters", bModeLetters); 178 | argp.addSwitch('3', "numbers", bModeNumbers); 179 | argp.addSwitch('4', "leading", strModeLeading); 180 | argp.addSwitch('5', "matching", strModeMatching); 181 | argp.addSwitch('6', "leading-range", bModeLeadingRange); 182 | argp.addSwitch('7', "range", bModeRange); 183 | argp.addSwitch('8', "mirror", bModeMirror); 184 | argp.addSwitch('9', "leading-doubles", bModeDoubles); 185 | argp.addSwitch('m', "min", rangeMin); 186 | argp.addSwitch('M', "max", rangeMax); 187 | argp.addMultiSwitch('s', "skip", vDeviceSkipIndex); 188 | argp.addSwitch('w', "work", worksizeLocal); 189 | argp.addSwitch('W', "work-max", worksizeMax); 190 | argp.addSwitch('n', "no-cache", bNoCache); 191 | argp.addSwitch('i', "inverse-size", inverseSize); 192 | argp.addSwitch('I', 
"inverse-multiple", inverseMultiple); 193 | argp.addSwitch('c', "contract", bMineContract); 194 | argp.addSwitch('z', "publicKey", strPublicKey); 195 | argp.addSwitch('b', "zero-bytes", bModeZeroBytes); 196 | 197 | if (!argp.parse()) { 198 | std::cout << "error: bad arguments, try again :<" << std::endl; 199 | return 1; 200 | } 201 | 202 | if (bHelp) { 203 | std::cout << g_strHelp << std::endl; 204 | return 0; 205 | } 206 | 207 | Mode mode = Mode::benchmark(); 208 | if (bModeBenchmark) { 209 | mode = Mode::benchmark(); 210 | } else if (bModeZeros) { 211 | mode = Mode::zeros(); 212 | } else if (bModeLetters) { 213 | mode = Mode::letters(); 214 | } else if (bModeNumbers) { 215 | mode = Mode::numbers(); 216 | } else if (!strModeLeading.empty()) { 217 | mode = Mode::leading(strModeLeading.front()); 218 | } else if (!strModeMatching.empty()) { 219 | mode = Mode::matching(strModeMatching); 220 | } else if (bModeLeadingRange) { 221 | mode = Mode::leadingRange(rangeMin, rangeMax); 222 | } else if (bModeRange) { 223 | mode = Mode::range(rangeMin, rangeMax); 224 | } else if(bModeMirror) { 225 | mode = Mode::mirror(); 226 | } else if (bModeDoubles) { 227 | mode = Mode::doubles(); 228 | } else if (bModeZeroBytes) { 229 | mode = Mode::zeroBytes(); 230 | } else { 231 | std::cout << g_strHelp << std::endl; 232 | return 0; 233 | } 234 | 235 | if (strPublicKey.length() == 0) { 236 | std::cout << "error: this tool requires your public key to derive it's private key security" << std::endl; 237 | return 1; 238 | } 239 | 240 | if (strPublicKey.length() != 128) { 241 | std::cout << "error: public key must be 128 hexademical characters long" << std::endl; 242 | return 1; 243 | } 244 | 245 | std::cout << "Mode: " << mode.name << std::endl; 246 | 247 | if (bMineContract) { 248 | mode.target = CONTRACT; 249 | } else { 250 | mode.target = ADDRESS; 251 | } 252 | std::cout << "Target: " << mode.transformName() << std:: endl; 253 | 254 | std::vector vFoundDevices = getAllDevices(); 255 | 
std::vector vDevices; 256 | std::map mDeviceIndex; 257 | 258 | std::vector vDeviceBinary; 259 | std::vector vDeviceBinarySize; 260 | cl_int errorCode; 261 | bool bUsedCache = false; 262 | 263 | std::cout << "Devices:" << std::endl; 264 | for (size_t i = 0; i < vFoundDevices.size(); ++i) { 265 | // Ignore devices in skip index 266 | if (std::find(vDeviceSkipIndex.begin(), vDeviceSkipIndex.end(), i) != vDeviceSkipIndex.end()) { 267 | continue; 268 | } 269 | 270 | cl_device_id & deviceId = vFoundDevices[i]; 271 | 272 | const auto strName = clGetWrapperString(clGetDeviceInfo, deviceId, CL_DEVICE_NAME); 273 | const auto computeUnits = clGetWrapper(clGetDeviceInfo, deviceId, CL_DEVICE_MAX_COMPUTE_UNITS); 274 | const auto globalMemSize = clGetWrapper(clGetDeviceInfo, deviceId, CL_DEVICE_GLOBAL_MEM_SIZE); 275 | bool precompiled = false; 276 | 277 | // Check if there's a prebuilt binary for this device and load it 278 | if(!bNoCache) { 279 | std::ifstream fileIn(getDeviceCacheFilename(deviceId, inverseSize), std::ios::binary); 280 | if (fileIn.is_open()) { 281 | vDeviceBinary.push_back(std::string((std::istreambuf_iterator(fileIn)), std::istreambuf_iterator())); 282 | vDeviceBinarySize.push_back(vDeviceBinary.back().size()); 283 | precompiled = true; 284 | } 285 | } 286 | 287 | std::cout << " GPU" << i << ": " << strName << ", " << globalMemSize << " bytes available, " << computeUnits << " compute units (precompiled = " << (precompiled ? "yes" : "no") << ")" << std::endl; 288 | vDevices.push_back(vFoundDevices[i]); 289 | mDeviceIndex[vFoundDevices[i]] = i; 290 | } 291 | 292 | if (vDevices.empty()) { 293 | return 1; 294 | } 295 | 296 | std::cout << std::endl; 297 | std::cout << "Initializing OpenCL..." << std::endl; 298 | std::cout << " Creating context..." 
<< std::flush; 299 | auto clContext = clCreateContext( NULL, vDevices.size(), vDevices.data(), NULL, NULL, &errorCode); 300 | if (printResult(clContext, errorCode)) { 301 | return 1; 302 | } 303 | 304 | cl_program clProgram; 305 | if (vDeviceBinary.size() == vDevices.size()) { 306 | // Create program from binaries 307 | bUsedCache = true; 308 | 309 | std::cout << " Loading kernel from binary..." << std::flush; 310 | const unsigned char * * pKernels = new const unsigned char *[vDevices.size()]; 311 | for (size_t i = 0; i < vDeviceBinary.size(); ++i) { 312 | pKernels[i] = reinterpret_cast(vDeviceBinary[i].data()); 313 | } 314 | 315 | cl_int * pStatus = new cl_int[vDevices.size()]; 316 | 317 | clProgram = clCreateProgramWithBinary(clContext, vDevices.size(), vDevices.data(), vDeviceBinarySize.data(), pKernels, pStatus, &errorCode); 318 | if(printResult(clProgram, errorCode)) { 319 | return 1; 320 | } 321 | } else { 322 | // Create a program from the kernel source 323 | std::cout << " Compiling kernel..." << std::flush; 324 | const std::string strKeccak = readFile("keccak.cl"); 325 | const std::string strVanity = readFile("profanity.cl"); 326 | const char * szKernels[] = { strKeccak.c_str(), strVanity.c_str() }; 327 | 328 | clProgram = clCreateProgramWithSource(clContext, sizeof(szKernels) / sizeof(char *), szKernels, NULL, &errorCode); 329 | if (printResult(clProgram, errorCode)) { 330 | return 1; 331 | } 332 | } 333 | 334 | // Build the program 335 | std::cout << " Building program..." 
<< std::flush; 336 | const std::string strBuildOptions = "-D PROFANITY_INVERSE_SIZE=" + toString(inverseSize) + " -D PROFANITY_MAX_SCORE=" + toString(PROFANITY_MAX_SCORE); 337 | if (printResult(clBuildProgram(clProgram, vDevices.size(), vDevices.data(), strBuildOptions.c_str(), NULL, NULL))) { 338 | #ifdef PROFANITY_DEBUG 339 | std::cout << std::endl; 340 | std::cout << "build log:" << std::endl; 341 | 342 | size_t sizeLog; 343 | clGetProgramBuildInfo(clProgram, vDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &sizeLog); 344 | char * const szLog = new char[sizeLog]; 345 | clGetProgramBuildInfo(clProgram, vDevices[0], CL_PROGRAM_BUILD_LOG, sizeLog, szLog, NULL); 346 | 347 | std::cout << szLog << std::endl; 348 | delete[] szLog; 349 | #endif 350 | return 1; 351 | } 352 | 353 | // Save binary to improve future start times 354 | if( !bUsedCache && !bNoCache ) { 355 | std::cout << " Saving program..." << std::flush; 356 | auto binaries = getBinaries(clProgram); 357 | for (size_t i = 0; i < binaries.size(); ++i) { 358 | std::ofstream fileOut(getDeviceCacheFilename(vDevices[i], inverseSize), std::ios::binary); 359 | fileOut.write(binaries[i].data(), binaries[i].size()); 360 | } 361 | std::cout << "OK" << std::endl; 362 | } 363 | 364 | std::cout << std::endl; 365 | 366 | Dispatcher d(clContext, clProgram, mode, worksizeMax == 0 ? inverseSize * inverseMultiple : worksizeMax, inverseSize, inverseMultiple, 0, strPublicKey); 367 | for (auto & i : vDevices) { 368 | d.addDevice(i, worksizeLocal, mDeviceIndex[i]); 369 | } 370 | 371 | d.run(); 372 | clReleaseContext(clContext); 373 | return 0; 374 | } catch (std::runtime_error & e) { 375 | std::cout << "std::runtime_error - " << e.what() << std::endl; 376 | } catch (...) 
{ 377 | std::cout << "unknown exception occured" << std::endl; 378 | } 379 | 380 | return 1; 381 | } 382 | 383 | -------------------------------------------------------------------------------- /Dispatcher.cpp: -------------------------------------------------------------------------------- 1 | #include "Dispatcher.hpp" 2 | 3 | // Includes 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #if defined(__APPLE__) || defined(__MACOSX) 14 | #include 15 | #else 16 | #include 17 | #endif 18 | 19 | #include "precomp.hpp" 20 | 21 | #ifndef htonll 22 | #define htonll(x) ((((uint64_t)htonl(x)) << 32) | htonl((x) >> 32)) 23 | #endif 24 | 25 | static std::string::size_type fromHex(char c) { 26 | if (c >= 'A' && c <= 'F') { 27 | c += 'a' - 'A'; 28 | } 29 | 30 | const std::string hex = "0123456789abcdef"; 31 | const std::string::size_type ret = hex.find(c); 32 | return ret; 33 | } 34 | 35 | static cl_ulong4 fromHex(const std::string & strHex) { 36 | uint8_t data[32]; 37 | std::fill(data, data + sizeof(data), cl_uchar(0)); 38 | 39 | auto index = 0; 40 | for(size_t i = 0; i < strHex.size(); i += 2) { 41 | const auto indexHi = fromHex(strHex[i]); 42 | const auto indexLo = i + 1 < strHex.size() ? fromHex(strHex[i+1]) : std::string::npos; 43 | 44 | const auto valHi = (indexHi == std::string::npos) ? 0 : indexHi << 4; 45 | const auto valLo = (indexLo == std::string::npos) ? 
0 : indexLo; 46 | 47 | data[index] = valHi | valLo; 48 | ++index; 49 | } 50 | 51 | cl_ulong4 res = { 52 | .s = { 53 | htonll(*(uint64_t *)(data + 24)), 54 | htonll(*(uint64_t *)(data + 16)), 55 | htonll(*(uint64_t *)(data + 8)), 56 | htonll(*(uint64_t *)(data + 0)), 57 | } 58 | }; 59 | return res; 60 | } 61 | 62 | static std::string toHex(const uint8_t * const s, const size_t len) { 63 | std::string b("0123456789abcdef"); 64 | std::string r; 65 | 66 | for (size_t i = 0; i < len; ++i) { 67 | const unsigned char h = s[i] / 16; 68 | const unsigned char l = s[i] % 16; 69 | 70 | r = r + b.substr(h, 1) + b.substr(l, 1); 71 | } 72 | 73 | return r; 74 | } 75 | 76 | static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score, const std::chrono::time_point & timeStart, const Mode & mode) { 77 | // Time delta 78 | const auto seconds = std::chrono::duration_cast(std::chrono::steady_clock::now() - timeStart).count(); 79 | 80 | // Format private key 81 | cl_ulong carry = 0; 82 | cl_ulong4 seedRes; 83 | 84 | seedRes.s[0] = seed.s[0] + round; carry = seedRes.s[0] < round; 85 | seedRes.s[1] = seed.s[1] + carry; carry = !seedRes.s[1]; 86 | seedRes.s[2] = seed.s[2] + carry; carry = !seedRes.s[2]; 87 | seedRes.s[3] = seed.s[3] + carry + r.foundId; 88 | 89 | std::ostringstream ss; 90 | ss << std::hex << std::setfill('0'); 91 | ss << std::setw(16) << seedRes.s[3] << std::setw(16) << seedRes.s[2] << std::setw(16) << seedRes.s[1] << std::setw(16) << seedRes.s[0]; 92 | const std::string strPrivate = ss.str(); 93 | 94 | // Format public key 95 | const std::string strPublic = toHex(r.foundHash, 20); 96 | 97 | // Print 98 | const std::string strVT100ClearLine = "\33[2K\r"; 99 | std::cout << strVT100ClearLine << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << ' '; 100 | 101 | std::cout << mode.transformName(); 102 | std::cout << ": 0x" << strPublic << std::endl; 103 | } 104 | 105 | unsigned int 
getKernelExecutionTimeMicros(cl_event & e) { 106 | cl_ulong timeStart = 0, timeEnd = 0; 107 | clWaitForEvents(1, &e); 108 | clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_START, sizeof(timeStart), &timeStart, NULL); 109 | clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_END, sizeof(timeEnd), &timeEnd, NULL); 110 | return (timeEnd - timeStart) / 1000; 111 | } 112 | 113 | Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) : 114 | std::runtime_error( s + " (res = " + toString(res) + ")"), 115 | m_res(res) 116 | { 117 | 118 | } 119 | 120 | void Dispatcher::OpenCLException::OpenCLException::throwIfError(const std::string s, const cl_int res) { 121 | if (res != CL_SUCCESS) { 122 | throw OpenCLException(s, res); 123 | } 124 | } 125 | 126 | cl_command_queue Dispatcher::Device::createQueue(cl_context & clContext, cl_device_id & clDeviceId) { 127 | // nVidia CUDA Toolkit 10.1 only supports OpenCL 1.2 so we revert back to older functions for compatability 128 | #ifdef PROFANITY_DEBUG 129 | cl_command_queue_properties p = CL_QUEUE_PROFILING_ENABLE; 130 | #else 131 | cl_command_queue_properties p = NULL; 132 | #endif 133 | 134 | #ifdef CL_VERSION_2_0 135 | const cl_command_queue ret = clCreateCommandQueueWithProperties(clContext, clDeviceId, &p, NULL); 136 | #else 137 | const cl_command_queue ret = clCreateCommandQueue(clContext, clDeviceId, p, NULL); 138 | #endif 139 | return ret == NULL ? throw std::runtime_error("failed to create command queue") : ret; 140 | } 141 | 142 | cl_kernel Dispatcher::Device::createKernel(cl_program & clProgram, const std::string s) { 143 | cl_kernel ret = clCreateKernel(clProgram, s.c_str(), NULL); 144 | return ret == NULL ? 
throw std::runtime_error("failed to create kernel \"" + s + "\"") : ret; 145 | } 146 | 147 | cl_ulong4 Dispatcher::Device::createSeed() { 148 | #ifdef PROFANITY_DEBUG 149 | cl_ulong4 r; 150 | r.s[0] = 1; 151 | r.s[1] = 1; 152 | r.s[2] = 1; 153 | r.s[3] = 1; 154 | return r; 155 | #else 156 | // We do not need really safe crypto random here, since we inherit safety 157 | // of the key from the user-provided seed public key. 158 | // We only need this random to not repeat same job among different devices 159 | std::random_device rd; 160 | 161 | cl_ulong4 diff; 162 | diff.s[0] = (((uint64_t)rd()) << 32) | rd(); 163 | diff.s[1] = (((uint64_t)rd()) << 32) | rd(); 164 | diff.s[2] = (((uint64_t)rd()) << 32) | rd(); 165 | diff.s[3] = (((uint64_t)rd() & 0x0000ffff) << 32) | rd(); // zeroing 2 highest bytes to prevent overflowing sum private key after adding to seed private key 166 | return diff; 167 | #endif 168 | } 169 | 170 | Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t size, const size_t index, const Mode & mode, cl_ulong4 clSeedX, cl_ulong4 clSeedY) : 171 | m_parent(parent), 172 | m_index(index), 173 | m_clDeviceId(clDeviceId), 174 | m_worksizeLocal(worksizeLocal), 175 | m_clScoreMax(0), 176 | m_clQueue(createQueue(clContext, clDeviceId) ), 177 | m_kernelInit( createKernel(clProgram, "profanity_init") ), 178 | m_kernelInverse(createKernel(clProgram, "profanity_inverse")), 179 | m_kernelIterate(createKernel(clProgram, "profanity_iterate")), 180 | m_kernelTransform( mode.transformKernel() == "" ? 
NULL : createKernel(clProgram, mode.transformKernel())), 181 | m_kernelScore(createKernel(clProgram, mode.kernel)), 182 | m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp), 183 | m_memPointsDeltaX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), 184 | m_memInversedNegativeDoubleGy(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), 185 | m_memPrevLambda(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), 186 | m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, PROFANITY_MAX_SCORE + 1), 187 | m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), 188 | m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), 189 | m_clSeed(createSeed()), 190 | m_clSeedX(clSeedX), 191 | m_clSeedY(clSeedY), 192 | m_round(0), 193 | m_speed(PROFANITY_SPEEDSAMPLES), 194 | m_sizeInitialized(0), 195 | m_eventFinished(NULL) 196 | { 197 | 198 | } 199 | 200 | Dispatcher::Device::~Device() { 201 | 202 | } 203 | 204 | Dispatcher::Dispatcher(cl_context & clContext, cl_program & clProgram, const Mode mode, const size_t worksizeMax, const size_t inverseSize, const size_t inverseMultiple, const cl_uchar clScoreQuit, const std::string & seedPublicKey) 205 | : m_clContext(clContext) 206 | , m_clProgram(clProgram) 207 | , m_mode(mode) 208 | , m_worksizeMax(worksizeMax) 209 | , m_inverseSize(inverseSize) 210 | , m_size(inverseSize*inverseMultiple) 211 | , m_clScoreMax(mode.score) 212 | , m_clScoreQuit(clScoreQuit) 213 | , m_eventFinished(NULL) 214 | , m_countPrint(0) 215 | , m_publicKeyX(fromHex(seedPublicKey.substr(0, 64))) 216 | , m_publicKeyY(fromHex(seedPublicKey.substr(64, 64))) 217 | { 218 | } 219 | 220 | Dispatcher::~Dispatcher() { 221 | 222 | } 223 | 224 | void Dispatcher::addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) { 225 | Device * pDevice = 
new Device(*this, m_clContext, m_clProgram, clDeviceId, worksizeLocal, m_size, index, m_mode, m_publicKeyX, m_publicKeyY); 226 | m_vDevices.push_back(pDevice); 227 | } 228 | 229 | void Dispatcher::run() { 230 | m_eventFinished = clCreateUserEvent(m_clContext, NULL); 231 | timeStart = std::chrono::steady_clock::now(); 232 | 233 | init(); 234 | 235 | const auto timeInitialization = std::chrono::duration_cast(std::chrono::steady_clock::now() - timeStart).count(); 236 | std::cout << "Initialization time: " << timeInitialization << " seconds" << std::endl; 237 | 238 | m_quit = false; 239 | m_countRunning = m_vDevices.size(); 240 | 241 | std::cout << "Running..." << std::endl; 242 | std::cout << " Always verify that a private key generated by this program corresponds to the" << std::endl; 243 | std::cout << " public key printed by importing it to a wallet of your choice. This program" << std::endl; 244 | std::cout << " like any software might contain bugs and it does by design cut corners to" << std::endl; 245 | std::cout << " improve overall performance." << std::endl; 246 | std::cout << std::endl; 247 | 248 | for (auto it = m_vDevices.begin(); it != m_vDevices.end(); ++it) { 249 | dispatch(*(*it)); 250 | } 251 | 252 | clWaitForEvents(1, &m_eventFinished); 253 | clReleaseEvent(m_eventFinished); 254 | m_eventFinished = NULL; 255 | } 256 | 257 | void Dispatcher::init() { 258 | std::cout << "Initializing devices..." << std::endl; 259 | std::cout << " This should take less than a minute. The number of objects initialized on each" << std::endl; 260 | std::cout << " device is equal to inverse-size * inverse-multiple. To lower" << std::endl; 261 | std::cout << " initialization time (and memory footprint) I suggest lowering the" << std::endl; 262 | std::cout << " inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl; 263 | std::cout << " this might negatively impact your performance." 
<< std::endl; 264 | std::cout << std::endl; 265 | 266 | const auto deviceCount = m_vDevices.size(); 267 | m_sizeInitTotal = m_size * deviceCount; 268 | m_sizeInitDone = 0; 269 | 270 | cl_event * const pInitEvents = new cl_event[deviceCount]; 271 | 272 | for (size_t i = 0; i < deviceCount; ++i) { 273 | pInitEvents[i] = clCreateUserEvent(m_clContext, NULL); 274 | m_vDevices[i]->m_eventFinished = pInitEvents[i]; 275 | initBegin(*m_vDevices[i]); 276 | } 277 | 278 | clWaitForEvents(deviceCount, pInitEvents); 279 | for (size_t i = 0; i < deviceCount; ++i) { 280 | m_vDevices[i]->m_eventFinished = NULL; 281 | clReleaseEvent(pInitEvents[i]); 282 | } 283 | 284 | delete[] pInitEvents; 285 | 286 | std::cout << std::endl; 287 | } 288 | 289 | void Dispatcher::initBegin(Device & d) { 290 | // Set mode data 291 | for (auto i = 0; i < 20; ++i) { 292 | d.m_memData1[i] = m_mode.data1[i]; 293 | d.m_memData2[i] = m_mode.data2[i]; 294 | } 295 | 296 | // Write precompute table and mode data 297 | d.m_memPrecomp.write(true); 298 | d.m_memData1.write(true); 299 | d.m_memData2.write(true); 300 | 301 | // Kernel arguments - profanity_begin 302 | d.m_memPrecomp.setKernelArg(d.m_kernelInit, 0); 303 | d.m_memPointsDeltaX.setKernelArg(d.m_kernelInit, 1); 304 | d.m_memPrevLambda.setKernelArg(d.m_kernelInit, 2); 305 | d.m_memResult.setKernelArg(d.m_kernelInit, 3); 306 | CLMemory::setKernelArg(d.m_kernelInit, 4, d.m_clSeed); 307 | CLMemory::setKernelArg(d.m_kernelInit, 5, d.m_clSeedX); 308 | CLMemory::setKernelArg(d.m_kernelInit, 6, d.m_clSeedY); 309 | 310 | // Kernel arguments - profanity_inverse 311 | d.m_memPointsDeltaX.setKernelArg(d.m_kernelInverse, 0); 312 | d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelInverse, 1); 313 | 314 | // Kernel arguments - profanity_iterate 315 | d.m_memPointsDeltaX.setKernelArg(d.m_kernelIterate, 0); 316 | d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelIterate, 1); 317 | d.m_memPrevLambda.setKernelArg(d.m_kernelIterate, 2); 318 | 319 | // Kernel 
arguments - profanity_transform_* 320 | if(d.m_kernelTransform) { 321 | d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelTransform, 0); 322 | } 323 | 324 | // Kernel arguments - profanity_score_* 325 | d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelScore, 0); 326 | d.m_memResult.setKernelArg(d.m_kernelScore, 1); 327 | d.m_memData1.setKernelArg(d.m_kernelScore, 2); 328 | d.m_memData2.setKernelArg(d.m_kernelScore, 3); 329 | 330 | CLMemory::setKernelArg(d.m_kernelScore, 4, d.m_clScoreMax); // Updated in handleResult() 331 | 332 | // Seed device 333 | initContinue(d); 334 | } 335 | 336 | void Dispatcher::initContinue(Device & d) { 337 | size_t sizeLeft = m_size - d.m_sizeInitialized; 338 | const size_t sizeInitLimit = m_size / 20; 339 | 340 | // Print progress 341 | const size_t percentDone = m_sizeInitDone * 100 / m_sizeInitTotal; 342 | std::cout << " " << percentDone << "%\r" << std::flush; 343 | 344 | if (sizeLeft) { 345 | cl_event event; 346 | const size_t sizeRun = std::min(sizeInitLimit, std::min(sizeLeft, m_worksizeMax)); 347 | const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelInit, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event); 348 | OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue); 349 | 350 | // See: https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetEventCallback.html 351 | // If an application needs to wait for completion of a routine from the above list in a callback, please use the non-blocking form of the function, and 352 | // assign a completion callback to it to do the remainder of your work. Note that when a callback (or other code) enqueues commands to a command-queue, 353 | // the commands are not required to begin execution until the queue is flushed. In standard usage, blocking enqueue calls serve this role by implicitly 354 | // flushing the queue. 
Since blocking calls are not permitted in callbacks, those callbacks that enqueue commands on a command queue should either call 355 | // clFlush on the queue before returning or arrange for clFlush to be called later on another thread. 356 | clFlush(d.m_clQueue); 357 | 358 | std::lock_guard lock(m_mutex); 359 | d.m_sizeInitialized += sizeRun; 360 | m_sizeInitDone += sizeRun; 361 | 362 | const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d); 363 | OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback); 364 | } else { 365 | // Printing one whole string at once helps in avoiding garbled output when executed in parallell 366 | const std::string strOutput = " GPU" + toString(d.m_index) + " initialized"; 367 | std::cout << strOutput << std::endl; 368 | clSetUserEventStatus(d.m_eventFinished, CL_COMPLETE); 369 | } 370 | } 371 | 372 | void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, cl_event * pEvent = NULL) { 373 | const size_t worksizeMax = m_worksizeMax; 374 | size_t worksizeOffset = 0; 375 | while (worksizeGlobal) { 376 | const size_t worksizeRun = std::min(worksizeGlobal, worksizeMax); 377 | const size_t * const pWorksizeLocal = (worksizeLocal == 0 ? 
NULL : &worksizeLocal); 378 | const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, pWorksizeLocal, 0, NULL, pEvent); 379 | OpenCLException::throwIfError("kernel queueing failed", res); 380 | 381 | worksizeGlobal -= worksizeRun; 382 | worksizeOffset += worksizeRun; 383 | } 384 | } 385 | 386 | void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, cl_event * pEvent = NULL) { 387 | try { 388 | enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, pEvent); 389 | } catch ( OpenCLException & e ) { 390 | // If local work size is invalid, abandon it and let implementation decide 391 | if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) { 392 | std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl; 393 | d.m_worksizeLocal = 0; 394 | enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, pEvent); 395 | } 396 | else { 397 | throw; 398 | } 399 | } 400 | } 401 | 402 | void Dispatcher::dispatch(Device & d) { 403 | cl_event event; 404 | d.m_memResult.read(false, &event); 405 | 406 | #ifdef PROFANITY_DEBUG 407 | cl_event eventInverse; 408 | cl_event eventIterate; 409 | 410 | enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize, &eventInverse); 411 | enqueueKernelDevice(d, d.m_kernelIterate, m_size, &eventIterate); 412 | #else 413 | enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize); 414 | enqueueKernelDevice(d, d.m_kernelIterate, m_size); 415 | #endif 416 | 417 | if (d.m_kernelTransform) { 418 | enqueueKernelDevice(d, d.m_kernelTransform, m_size); 419 | } 420 | 421 | enqueueKernelDevice(d, d.m_kernelScore, m_size); 422 | clFlush(d.m_clQueue); 423 | 424 | #ifdef PROFANITY_DEBUG 425 | // We're actually not allowed to call clFinish here because this function is ultimately asynchronously called by OpenCL. 
426 | // However, this happens to work on my computer and it's not really intended for release, just something to aid me in 427 | // optimizations. 428 | clFinish(d.m_clQueue); 429 | std::cout << "Timing: profanity_inverse = " << getKernelExecutionTimeMicros(eventInverse) << "us, profanity_iterate = " << getKernelExecutionTimeMicros(eventIterate) << "us" << std::endl; 430 | #endif 431 | 432 | const auto res = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d); 433 | OpenCLException::throwIfError("failed to set custom callback", res); 434 | } 435 | 436 | void Dispatcher::handleResult(Device & d) { 437 | for (auto i = PROFANITY_MAX_SCORE; i > m_clScoreMax; --i) { 438 | result & r = d.m_memResult[i]; 439 | 440 | if (r.found > 0 && i >= d.m_clScoreMax) { 441 | d.m_clScoreMax = i; 442 | CLMemory::setKernelArg(d.m_kernelScore, 4, d.m_clScoreMax); 443 | 444 | std::lock_guard lock(m_mutex); 445 | if (i >= m_clScoreMax) { 446 | m_clScoreMax = i; 447 | 448 | if (m_clScoreQuit && i >= m_clScoreQuit) { 449 | m_quit = true; 450 | } 451 | 452 | printResult(d.m_clSeed, d.m_round, r, i, timeStart, m_mode); 453 | } 454 | 455 | break; 456 | } 457 | } 458 | } 459 | 460 | void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) { 461 | if (status != CL_COMPLETE) { 462 | std::cout << "Dispatcher::onEvent - Got bad status: " << status << std::endl; 463 | } 464 | else if (d.m_eventFinished != NULL) { 465 | initContinue(d); 466 | } else { 467 | ++d.m_round; 468 | handleResult(d); 469 | 470 | bool bDispatch = true; 471 | { 472 | std::lock_guard lock(m_mutex); 473 | d.m_speed.sample(m_size); 474 | printSpeed(); 475 | 476 | if( m_quit ) { 477 | bDispatch = false; 478 | if(--m_countRunning == 0) { 479 | clSetUserEventStatus(m_eventFinished, CL_COMPLETE); 480 | } 481 | } 482 | } 483 | 484 | if (bDispatch) { 485 | dispatch(d); 486 | } 487 | } 488 | } 489 | 490 | // This is run when m_mutex is held. 
491 | void Dispatcher::printSpeed() { 492 | ++m_countPrint; 493 | if( m_countPrint > m_vDevices.size() ) { 494 | std::string strGPUs; 495 | double speedTotal = 0; 496 | unsigned int i = 0; 497 | for (auto & e : m_vDevices) { 498 | const auto curSpeed = e->m_speed.getSpeed(); 499 | speedTotal += curSpeed; 500 | strGPUs += " GPU" + toString(e->m_index) + ": " + formatSpeed(curSpeed); 501 | ++i; 502 | } 503 | 504 | const std::string strVT100ClearLine = "\33[2K\r"; 505 | std::cerr << strVT100ClearLine << "Total: " << formatSpeed(speedTotal) << " -" << strGPUs << '\r' << std::flush; 506 | m_countPrint = 0; 507 | } 508 | } 509 | 510 | void CL_CALLBACK Dispatcher::staticCallback(cl_event event, cl_int event_command_exec_status, void * user_data) { 511 | Device * const pDevice = static_cast(user_data); 512 | pDevice->m_parent.onEvent(event, event_command_exec_status, *pDevice); 513 | clReleaseEvent(event); 514 | } 515 | 516 | std::string Dispatcher::formatSpeed(double f) { 517 | const std::string S = " KMGT"; 518 | 519 | unsigned int index = 0; 520 | while (f > 1000.0f && index < S.size()) { 521 | f /= 1000.0f; 522 | ++index; 523 | } 524 | 525 | std::ostringstream ss; 526 | ss << std::fixed << std::setprecision(3) << (double)f << " " << S[index] << "H/s"; 527 | return ss.str(); 528 | } 529 | -------------------------------------------------------------------------------- /profanity.cl: -------------------------------------------------------------------------------- 1 | /* profanity.cl 2 | * ============ 3 | * Contains multi-precision arithmetic functions and iterative elliptical point 4 | * addition which is the heart of profanity. 5 | * 6 | * Terminology 7 | * =========== 8 | * 9 | * 10 | * Cutting corners 11 | * =============== 12 | * In some instances this code will produce the incorrect results. The elliptical 13 | * point addition does for example not properly handle the case of two points 14 | * sharing the same X-coordinate. 
The reason the code doesn't handle it properly 15 | * is because it is very unlikely to ever occur and the performance penalty for 16 | * doing it right is too severe. In the future I'll introduce a periodic check 17 | * after N amount of cycles that verifies the integrity of all the points to 18 | * make sure that even very unlikely events are at some point rectified. 19 | * 20 | * Currently, if any of the points in the kernels experiences the unlikely event 21 | * of an error then that point is forever garbage and your runtime-performance 22 | * will in practice be (i*I-N) / (i*I). i and I here refer to the values given 23 | * to the program via the -i and -I switches (default values of 255 and 16384 24 | * respectively) and N is the number of erroneous points. 25 | * 26 | * So if a single error occurs you'll lose 1/(i*I) of your performance. That's 27 | * around 0.00002%. The program will still report the same hashrate of course, 28 | * only that some of that work is entirely wasted on this erroneous point. 
29 | * 30 | * Initialization of main structure 31 | * ================================ 32 | * 33 | * Iteration 34 | * ========= 35 | * 36 | * 37 | * TODO 38 | * ==== 39 | * * Update comments to reflect new optimizations and structure 40 | * 41 | */ 42 | 43 | /* ------------------------------------------------------------------------ */ 44 | /* Multiprecision functions */ 45 | /* ------------------------------------------------------------------------ */ 46 | #define MP_WORDS 8 47 | #define MP_BITS 32 48 | #define bswap32(n) (rotate(n & 0x00FF00FF, 24U)|(rotate(n, 8U) & 0x00FF00FF)) 49 | 50 | typedef uint mp_word; 51 | typedef struct { 52 | mp_word d[MP_WORDS]; 53 | } mp_number; 54 | 55 | // mod = 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f 56 | __constant const mp_number mod = { {0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} }; 57 | 58 | // tripleNegativeGx = 0x92c4cc831269ccfaff1ed83e946adeeaf82c096e76958573f2287becbb17b196 59 | __constant const mp_number tripleNegativeGx = { {0xbb17b196, 0xf2287bec, 0x76958573, 0xf82c096e, 0x946adeea, 0xff1ed83e, 0x1269ccfa, 0x92c4cc83 } }; 60 | 61 | // doubleNegativeGy = 0x6f8a4b11b2b8773544b60807e3ddeeae05d0976eb2f557ccc7705edf09de52bf 62 | __constant const mp_number doubleNegativeGy = { {0x09de52bf, 0xc7705edf, 0xb2f557cc, 0x05d0976e, 0xe3ddeeae, 0x44b60807, 0xb2b87735, 0x6f8a4b11} }; 63 | 64 | // negativeGy = 0xb7c52588d95c3b9aa25b0403f1eef75702e84bb7597aabe663b82f6f04ef2777 65 | __constant const mp_number negativeGy = { {0x04ef2777, 0x63b82f6f, 0x597aabe6, 0x02e84bb7, 0xf1eef757, 0xa25b0403, 0xd95c3b9a, 0xb7c52588 } }; 66 | 67 | 68 | // Multiprecision subtraction. Underflow signalled via return value. 69 | mp_word mp_sub(mp_number * const r, const mp_number * const a, const mp_number * const b) { 70 | mp_word t, c = 0; 71 | 72 | for (mp_word i = 0; i < MP_WORDS; ++i) { 73 | t = a->d[i] - b->d[i] - c; 74 | c = t > a->d[i] ? 1 : (t == a->d[i] ? 
c : 0); 75 | 76 | r->d[i] = t; 77 | } 78 | 79 | return c; 80 | } 81 | 82 | // Multiprecision subtraction of the modulus saved in mod. Underflow signalled via return value. 83 | mp_word mp_sub_mod(mp_number * const r) { 84 | mp_number mod = { {0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} }; 85 | 86 | mp_word t, c = 0; 87 | 88 | for (mp_word i = 0; i < MP_WORDS; ++i) { 89 | t = r->d[i] - mod.d[i] - c; 90 | c = t > r->d[i] ? 1 : (t == r->d[i] ? c : 0); 91 | 92 | r->d[i] = t; 93 | } 94 | 95 | return c; 96 | } 97 | 98 | // Multiprecision subtraction modulo M, M = mod. 99 | // This function is often also used for additions by subtracting a negative number. I've chosen 100 | // to do this because: 101 | // 1. It's easier to re-use an already existing function 102 | // 2. A modular addition would have more overhead since it has to determine if the result of 103 | // the addition (r) is in the gap M <= r < 2^256. This overhead doesn't exist in a 104 | // subtraction. We immediately know at the end of a subtraction if we had underflow 105 | // or not by inspecting the carry value. M refers to the modulus saved in variable mod. 106 | void mp_mod_sub(mp_number * const r, const mp_number * const a, const mp_number * const b) { 107 | mp_word i, t, c = 0; 108 | 109 | for (i = 0; i < MP_WORDS; ++i) { 110 | t = a->d[i] - b->d[i] - c; 111 | c = t < a->d[i] ? 0 : (t == a->d[i] ? c : 1); 112 | 113 | r->d[i] = t; 114 | } 115 | 116 | if (c) { 117 | c = 0; 118 | for (i = 0; i < MP_WORDS; ++i) { 119 | r->d[i] += mod.d[i] + c; 120 | c = r->d[i] < mod.d[i] ? 1 : (r->d[i] == mod.d[i] ? c : 0); 121 | } 122 | } 123 | } 124 | 125 | // Multiprecision subtraction modulo M from a constant number. 126 | // I made this in the belief that using constant address space instead of private address space for any 127 | // constant numbers would lead to increase in performance. Judges are still out on this one. 
void mp_mod_sub_const(mp_number * const r, __constant const mp_number * const a, const mp_number * const b) {
    // Pass 1: r = a - b, least significant word first. r may alias b because
    // each word of b is consumed before the same word of r is stored.
    mp_word borrow = 0;
    for (mp_word w = 0; w < MP_WORDS; ++w) {
        const mp_word minuend = a->d[w];
        const mp_word diff = minuend - b->d[w] - borrow;

        if (diff < minuend) {
            borrow = 0;
        } else if (diff != minuend) {
            borrow = 1;
        } // equal: the incoming borrow is also the outgoing one

        r->d[w] = diff;
    }

    // Pass 2: on underflow add the modulus once, the top carry is dropped.
    if (borrow) {
        mp_word carry = 0;
        for (mp_word w = 0; w < MP_WORDS; ++w) {
            const mp_word m = mod.d[w];
            const mp_word sum = r->d[w] + m + carry;

            if (sum < m) {
                carry = 1;
            } else if (sum != m) {
                carry = 0;
            }

            r->d[w] = sum;
        }
    }
}

// Multiprecision subtraction modulo M of G_x from a number, r = a - G_x.
// Specialization of mp_mod_sub in hope of performance gain: the words of G_x
// are spelled out as literals, least significant word first.
void mp_mod_sub_gx(mp_number * const r, const mp_number * const a) {
    mp_word borrow = 0;
    mp_word diff;

    diff = a->d[0] - 0x16f81798;
    borrow = diff < a->d[0] ? 0 : (diff == a->d[0] ? borrow : 1);
    r->d[0] = diff;

    diff = a->d[1] - 0x59f2815b - borrow;
    borrow = diff < a->d[1] ? 0 : (diff == a->d[1] ? borrow : 1);
    r->d[1] = diff;

    diff = a->d[2] - 0x2dce28d9 - borrow;
    borrow = diff < a->d[2] ? 0 : (diff == a->d[2] ? borrow : 1);
    r->d[2] = diff;

    diff = a->d[3] - 0x029bfcdb - borrow;
    borrow = diff < a->d[3] ? 0 : (diff == a->d[3] ? borrow : 1);
    r->d[3] = diff;

    diff = a->d[4] - 0xce870b07 - borrow;
    borrow = diff < a->d[4] ? 0 : (diff == a->d[4] ? borrow : 1);
    r->d[4] = diff;

    diff = a->d[5] - 0x55a06295 - borrow;
    borrow = diff < a->d[5] ? 0 : (diff == a->d[5] ? borrow : 1);
    r->d[5] = diff;

    diff = a->d[6] - 0xf9dcbbac - borrow;
    borrow = diff < a->d[6] ? 0 : (diff == a->d[6] ? borrow : 1);
    r->d[6] = diff;

    diff = a->d[7] - 0x79be667e - borrow;
    borrow = diff < a->d[7] ? 0 : (diff == a->d[7] ? borrow : 1);
    r->d[7] = diff;

    // Underflow: wrap back into range by adding the modulus once.
    if (borrow) {
        mp_word carry = 0;
        for (mp_word w = 0; w < MP_WORDS; ++w) {
            const mp_word sum = r->d[w] + mod.d[w] + carry;
            carry = sum < mod.d[w] ? 1 : (sum == mod.d[w] ? carry : 0);
            r->d[w] = sum;
        }
    }
}

// Multiprecision subtraction modulo M of G_y from a number, r = a - G_y.
// Specialization of mp_mod_sub in hope of performance gain: the words of G_y
// are spelled out as literals, least significant word first.
void mp_mod_sub_gy(mp_number * const r, const mp_number * const a) {
    mp_word borrow = 0;
    mp_word diff;

    diff = a->d[0] - 0xfb10d4b8;
    borrow = diff < a->d[0] ? 0 : (diff == a->d[0] ? borrow : 1);
    r->d[0] = diff;

    diff = a->d[1] - 0x9c47d08f - borrow;
    borrow = diff < a->d[1] ? 0 : (diff == a->d[1] ? borrow : 1);
    r->d[1] = diff;

    diff = a->d[2] - 0xa6855419 - borrow;
    borrow = diff < a->d[2] ? 0 : (diff == a->d[2] ? borrow : 1);
    r->d[2] = diff;

    diff = a->d[3] - 0xfd17b448 - borrow;
    borrow = diff < a->d[3] ? 0 : (diff == a->d[3] ? borrow : 1);
    r->d[3] = diff;

    diff = a->d[4] - 0x0e1108a8 - borrow;
    borrow = diff < a->d[4] ? 0 : (diff == a->d[4] ? borrow : 1);
    r->d[4] = diff;

    diff = a->d[5] - 0x5da4fbfc - borrow;
    borrow = diff < a->d[5] ? 0 : (diff == a->d[5] ? borrow : 1);
    r->d[5] = diff;

    diff = a->d[6] - 0x26a3c465 - borrow;
    borrow = diff < a->d[6] ? 0 : (diff == a->d[6] ? borrow : 1);
    r->d[6] = diff;

    diff = a->d[7] - 0x483ada77 - borrow;
    borrow = diff < a->d[7] ? 0 : (diff == a->d[7] ? borrow : 1);
    r->d[7] = diff;

    // Underflow: wrap back into range by adding the modulus once.
    if (borrow) {
        mp_word carry = 0;
        for (mp_word w = 0; w < MP_WORDS; ++w) {
            const mp_word sum = r->d[w] + mod.d[w] + carry;
            carry = sum < mod.d[w] ? 1 : (sum == mod.d[w] ? carry : 0);
            r->d[w] = sum;
        }
    }
}

// Multiprecision addition, r += a. Overflow signalled via return value.
mp_word mp_add(mp_number * const r, const mp_number * const a) {
    mp_word carry = 0;

    for (mp_word w = 0; w < MP_WORDS; ++w) {
        const mp_word sum = r->d[w] + a->d[w] + carry;
        r->d[w] = sum;

        // The addend is compared after the store, exactly as before.
        if (sum < a->d[w]) {
            carry = 1;
        } else if (sum != a->d[w]) {
            carry = 0;
        }
    }

    return carry;
}

// Multiprecision addition of the modulus saved in mod, r += M. Overflow signalled via return value.
mp_word mp_add_mod(mp_number * const r) {
    mp_word carry = 0;

    for (mp_word w = 0; w < MP_WORDS; ++w) {
        const mp_word sum = r->d[w] + mod.d[w] + carry;
        r->d[w] = sum;

        if (sum < mod.d[w]) {
            carry = 1;
        } else if (sum != mod.d[w]) {
            carry = 0;
        }
    }

    return carry;
}

// Multiprecision addition of two numbers with one extra word each, (extraR:r) += (extraA:a).
// Overflow of the extra word is signalled via return value.
mp_word mp_add_more(mp_number * const r, mp_word * const extraR, const mp_number * const a, const mp_word * const extraA) {
    const mp_word lowCarry = mp_add(r, a);
    const mp_word top = *extraR + *extraA + lowCarry;
    *extraR = top;

    if (top < *extraA) {
        return 1;
    }
    return top == *extraA ? lowCarry : 0;
}

// Multiprecision greater than or equal (>=) operator.
// Bit i of each mask records how word i compares. The most significant
// differing word owns the highest set bit, so comparing the masks as integers
// yields the ordering of the full numbers.
mp_word mp_gte(const mp_number * const a, const mp_number * const b) {
    mp_word lessMask = 0;
    mp_word greaterMask = 0;

    for (mp_word w = 0; w < MP_WORDS; ++w) {
        if (a->d[w] < b->d[w]) {
            lessMask |= (1 << w);
        } else if (a->d[w] > b->d[w]) {
            greaterMask |= (1 << w);
        }
    }

    return greaterMask >= lessMask;
}

// Bit shifts a number with an extra word to the right one step.
// The lowest bit of *e moves into the top bit of r.
void mp_shr_extra(mp_number * const r, mp_word * const e) {
    for (mp_word w = 0; w + 1 < MP_WORDS; ++w) {
        r->d[w] = (r->d[w + 1] << (MP_BITS - 1)) | (r->d[w] >> 1);
    }

    r->d[MP_WORDS - 1] = (*e << (MP_BITS - 1)) | (r->d[MP_WORDS - 1] >> 1);
    *e >>= 1;
}

// Bit shifts a number to the right one step
void mp_shr(mp_number * const r) {
    for (mp_word w = 0; w + 1 < MP_WORDS; ++w) {
        r->d[w] = (r->d[w + 1] << (MP_BITS - 1)) | (r->d[w] >> 1);
    }

    r->d[MP_WORDS - 1] >>= 1;
}

// Multiplies a number with a word and adds it to an existing number with an extra word, overflow of the extra word is signalled in return value
// This is a special function only used for modular multiplication
mp_word mp_mul_word_add_extra(mp_number * const r, const mp_number * const a, const mp_word w, mp_word * const extra) {
    mp_word mulCarry = 0; // High half carried into the next partial product
    mp_word addCarry = 0; // Carry of the running accumulation into r

    for (mp_word k = 0; k < MP_WORDS; ++k) {
        const mp_word factor = a->d[k];

        // Low half of factor * w plus the carry from the previous word.
        const mp_word low = factor * w + mulCarry;
        mulCarry = mul_hi(factor, w) + (low < mulCarry);

        const mp_word sum = r->d[k] + low + addCarry;
        r->d[k] = sum;
        addCarry = sum < low ? 1 : (sum == low ? addCarry : 0);
    }

    const mp_word top = *extra + mulCarry + addCarry;
    *extra = top;
    return top < mulCarry ? 1 : (top == mulCarry ? addCarry : 0);
}

// Multiplies a number with a word, potentially adds modhigher to it, and then subtracts it from an existing number, no extra words, no overflow
// This is a special function only used for modular multiplication
void mp_mul_mod_word_sub(mp_number * const r, const mp_word w, const bool withModHigher) {
    // Having these numbers declared here instead of using the global values in __constant address space seems to lead
    // to better optimizations by the compiler on my GTX 1070.
    mp_number mod = { { 0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} };
    mp_number modhigher = { {0x00000000, 0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} };

    mp_word mulCarry = 0; // Carry for multiplication
    mp_word addCarry = 0; // Carry for addition of modhigher
    mp_word subBorrow = 0; // Borrow for subtraction

    for (mp_word k = 0; k < MP_WORDS; ++k) {
        // modhigher is the modulus shifted up one word; it only takes part
        // when the caller reports an overflow of the extra word.
        const mp_word higher = withModHigher ? modhigher.d[k] : 0;

        mp_word product = mod.d[k] * w + mulCarry;
        mulCarry = mul_hi(mod.d[k], w) + (product < mulCarry);

        product += higher + addCarry;
        addCarry = product < higher ? 1 : (product == higher ? addCarry : 0);

        const mp_word minuend = r->d[k];
        const mp_word diff = minuend - product - subBorrow;
        subBorrow = diff > minuend ? 1 : (diff == minuend ? subBorrow : 0);

        r->d[k] = diff;
    }
}

// Modular multiplication.
// Based on Algorithm 3 (and a series of hunches) from this article:
// https://www.esat.kuleuven.be/cosic/publications/article-1191.pdf
// When I first implemented it I never encountered a situation where the additional end steps
// of adding or subtracting the modulo was necessary. Maybe it's not for the particular modulo
// used in secp256k1, maybe the overflow bit can be skipped in to avoid 8 subtractions and
// trade it for the final steps? Maybe the final steps are necessary but seldom needed?
// I have no idea, for the time being I'll leave it like this, also see the comments at the
// beginning of this document under the title "Cutting corners".
//
// Computes r = X * Y, consuming Y one 32-bit word at a time from the most
// significant word down. r may alias X or Y because the product is built in a
// private accumulator and only copied out at the end.
void mp_mod_mul(mp_number * const r, const mp_number * const X, const mp_number * const Y) {
    mp_number acc = { {0} };
    mp_word spill;

    for (int word = MP_WORDS - 1; word >= 0; --word) {
        // acc = acc * 2^32: the top word spills into `spill`, the rest moves up
        spill = acc.d[MP_WORDS - 1];
        for (int k = MP_WORDS - 1; k > 0; --k) {
            acc.d[k] = acc.d[k - 1];
        }
        acc.d[0] = 0;

        // acc = acc + X * Y_word
        const bool overflow = mp_mul_word_add_extra(&acc, X, Y->d[word], &spill);

        // acc = acc - qM, with q taken from the spilled word
        mp_mul_mod_word_sub(&acc, spill, overflow);
    }

    *r = acc;
}

// Modular inversion of a number.
// Replaces r with its inverse modulo M (M = mod), in place.
// The loop repeatedly halves whichever of r / v is even and subtracts the
// smaller from the larger, mirroring each step on the coefficients A and C
// (each carries one extra word for overflow). This looks like the binary
// extended Euclidean scheme; NOTE(review): confirm against the reference the
// author used. A zero input skips the loop entirely and yields r = M.
void mp_mod_inverse(mp_number * const r) {
    mp_number A = { { 1 } };
    mp_number C = { { 0 } };
    mp_number v = mod;

    mp_word extraA = 0;
    mp_word extraC = 0;

    // Run until r has been reduced to zero.
    while (r->d[0] | r->d[1] | r->d[2] | r->d[3] | r->d[4] | r->d[5] | r->d[6] | r->d[7]) {
        // Halve r while it is even; A is made even first by adding M.
        while ((r->d[0] & 1) == 0) {
            mp_shr(r);
            if (A.d[0] & 1) {
                extraA += mp_add_mod(&A);
            }

            mp_shr_extra(&A, &extraA);
        }

        // Same treatment for v and its coefficient C.
        while ((v.d[0] & 1) == 0) {
            mp_shr(&v);
            if (C.d[0] & 1) {
                extraC += mp_add_mod(&C);
            }

            mp_shr_extra(&C, &extraC);
        }

        // Subtract the smaller value from the larger and accumulate the
        // other side's coefficient.
        if (mp_gte(r, &v)) {
            mp_sub(r, r, &v);
            mp_add_more(&A, &extraA, &C, &extraC);
        } else {
            mp_sub(&v, &v, r);
            mp_add_more(&C, &extraC, &A, &extraA);
        }
    }

    // Fold the extra word of C back in by subtracting M until it is used up.
    while (extraC) {
        extraC -= mp_sub_mod(&C);
    }

    // Result is M - C.
    v = mod;
    mp_sub(r, &v, &C);
}

/* ------------------------------------------------------------------------ */
/* Elliptic point and addition (with caveats).                              */
/* ------------------------------------------------------------------------ */
// A curve point given by its x and y coordinates.
typedef struct {
    mp_number x;
    mp_number y;
} point;

// Elliptical point addition
// Does not handle points sharing X coordinate, this is a deliberate design choice.
// For more information on this choice see the beginning of this file.
void point_add(point * const r, point * const p, point * const o) {
    mp_number lambda;
    mp_number x3;
    mp_number y3;

    // lambda = (o.y - p.y) / (o.x - p.x); x3 briefly holds the y difference
    mp_mod_sub(&lambda, &o->x, &p->x);
    mp_mod_inverse(&lambda);
    mp_mod_sub(&x3, &o->y, &p->y);
    mp_mod_mul(&lambda, &lambda, &x3);

    // x3 = lambda^2 - p.x - o.x
    mp_mod_mul(&x3, &lambda, &lambda);
    mp_mod_sub(&x3, &x3, &p->x);
    mp_mod_sub(&x3, &x3, &o->x);

    // y3 = lambda * (p.x - x3) - p.y
    mp_mod_sub(&y3, &p->x, &x3);
    mp_mod_mul(&y3, &y3, &lambda);
    mp_mod_sub(&y3, &y3, &p->y);

    // Written last so that r may alias p or o.
    r->x = x3;
    r->y = y3;
}

/* ------------------------------------------------------------------------ */
/* Profanity.                                                               */
/* ------------------------------------------------------------------------ */
// One result slot per score: how many hits were counted, the work-item id of
// the recorded hit and its 20-byte hash.
typedef struct {
    uint found;
    uint foundId;
    uchar foundHash[20];
} result;

// Accumulates into *p the precomputed points selected by the eight bytes of
// `seed`. Byte i with non-zero value b selects precomp[precompOffset + i * 255 + b - 1];
// zero bytes contribute nothing. The first selected point is copied into *p
// and clears *pIsFirst, later ones are added with point_add.
void profanity_init_seed(__global const point * const precomp, point * const p, bool * const pIsFirst, const size_t precompOffset, const ulong seed) {
    point entry;

    for (uchar byteIndex = 0; byteIndex < 8; ++byteIndex) {
        const uchar shift = byteIndex * 8;
        const uchar value = (seed >> shift) & 0xFF;

        if (!value) {
            continue;
        }

        entry = precomp[precompOffset + byteIndex * 255 + value - 1];
        if (*pIsFirst) {
            *p = entry;
            *pIsFirst = false;
        } else {
            point_add(p, p, &entry);
        }
    }
}

// Sets up the per-work-item state (delta x and lambda) and clears the result
// counters. seedX / seedY carry the words of a starting point, least
// significant 64 bits in .x.
__kernel void profanity_init(__global const point * const precomp, __global mp_number * const pDeltaX, __global mp_number * const pPrevLambda, __global result * const pResult, const ulong4 seed, const ulong4 seedX, const ulong4 seedY) {
    const size_t id = get_global_id(0);

    // Split each 64-bit lane into two 32-bit words, low word first.
    point p = {
        .x = {.d = {
            seedX.x & 0xFFFFFFFF, seedX.x >> 32,
            seedX.y & 0xFFFFFFFF, seedX.y >> 32,
            seedX.z & 0xFFFFFFFF, seedX.z >> 32,
            seedX.w & 0xFFFFFFFF, seedX.w >> 32,
        }},
        .y = {.d = {
            seedY.x & 0xFFFFFFFF, seedY.x >> 32,
            seedY.y & 0xFFFFFFFF, seedY.y >> 32,
            seedY.z & 0xFFFFFFFF, seedY.z >> 32,
            seedY.w & 0xFFFFFFFF, seedY.w >> 32,
        }},
    };
    point p_random;
    bool bIsFirst = true;

    mp_number lambda;
    mp_number deltaY;
    point generator;

    // Calculate k*G where k = seed.wzyx (in other words, find the point indicated by the private key represented in seed)
    // The work-item id is folded into the most significant lane.
    profanity_init_seed(precomp, &p_random, &bIsFirst, 8 * 255 * 0, seed.x);
    profanity_init_seed(precomp, &p_random, &bIsFirst, 8 * 255 * 1, seed.y);
    profanity_init_seed(precomp, &p_random, &bIsFirst, 8 * 255 * 2, seed.z);
    profanity_init_seed(precomp, &p_random, &bIsFirst, 8 * 255 * 3, seed.w + id);
    point_add(&p, &p, &p_random);

    // Calculate current lambda in this point: (y - G_y) / (x - G_x)
    mp_mod_sub_gx(&lambda, &p.x);
    mp_mod_inverse(&lambda);

    mp_mod_sub_gy(&deltaY, &p.y);
    mp_mod_mul(&lambda, &lambda, &deltaY);

    // Jump to next point (precomp[0] is the generator point G)
    generator = precomp[0];
    point_add(&p, &generator, &p);

    // pDeltaX should contain the delta (x - G_x)
    mp_mod_sub_gx(&p.x, &p.x);

    pDeltaX[id] = p.x;
    pPrevLambda[id] = lambda;

    // Every work-item resets all result counters.
    for (uchar score = 0; score < PROFANITY_MAX_SCORE + 1; ++score) {
        pResult[score].found = 0;
    }
}

// This kernel calculates several modular inversions at once with just one inverse.
// It's an implementation of Algorithm 2.11 from Modern Computer Arithmetic:
// https://members.loria.fr/PZimmermann/mca/pub226.html
//
// My RX 480 is very sensitive to changes in the second loop and sometimes I have
// to make seemingly non-functional changes to the code to make the compiler
// generate the most optimized version.
__kernel void profanity_inverse(__global const mp_number * const pDeltaX, __global mp_number * const pInverse) {
    // Each work-item handles PROFANITY_INVERSE_SIZE consecutive entries.
    const size_t id = get_global_id(0) * PROFANITY_INVERSE_SIZE;

    // negativeDoubleGy = 0x6f8a4b11b2b8773544b60807e3ddeeae05d0976eb2f557ccc7705edf09de52bf
    mp_number negativeDoubleGy = { {0x09de52bf, 0xc7705edf, 0xb2f557cc, 0x05d0976e, 0xe3ddeeae, 0x44b60807, 0xb2b87735, 0x6f8a4b11 } };

    mp_number copy1, copy2;
    mp_number buffer[PROFANITY_INVERSE_SIZE];
    mp_number buffer2[PROFANITY_INVERSE_SIZE];

    // We initialize buffer and buffer2 such that:
    //   buffer[i] = pDeltaX[id] * pDeltaX[id + 1] * pDeltaX[id + 2] * ... * pDeltaX[id + i]
    //   buffer2[i] = pDeltaX[id + i]
    // buffer2[0] is intentionally left unset, it is never read below.
    buffer[0] = pDeltaX[id];
    for (uint i = 1; i < PROFANITY_INVERSE_SIZE; ++i) {
        buffer2[i] = pDeltaX[id + i];
        mp_mod_mul(&buffer[i], &buffer2[i], &buffer[i - 1]);
    }

    // Take the inverse of all x-values combined
    copy1 = buffer[PROFANITY_INVERSE_SIZE - 1];
    mp_mod_inverse(&copy1);

    // We multiply in -2G_y together with the inverse so that we have:
    //            - 2 * G_y
    //  ----------------------------
    //  x_0 * x_1 * x_2 * x_3 * ...
    mp_mod_mul(&copy1, &copy1, &negativeDoubleGy);

    // Multiply out each individual inverse using the buffers.
    // copy2 becomes the value for entry i (running inverse times the product
    // of all earlier entries); copy1 then drops entry i from the running
    // inverse by multiplying it with that entry's own delta.
    for (uint i = PROFANITY_INVERSE_SIZE - 1; i > 0; --i) {
        mp_mod_mul(&copy2, &copy1, &buffer[i - 1]);
        mp_mod_mul(&copy1, &copy1, &buffer2[i]);
        pInverse[id + i] = copy2;
    }

    pInverse[id] = copy1;
}

// This kernel performs an elliptic curve point addition. See:
// https://en.wikipedia.org/wiki/Elliptic_curve_point_multiplication#Point_addition
// I've made one mathematical optimization by never calculating x_r,
// instead I directly calculate the delta (x_q - x_p). It's for this
// delta we calculate the inverse and that's already been done at this
// point.
By calculating and storing the next delta we don't have to 547 | // calculate the delta in profanity_inverse_multiple which saves us 548 | // one call to mp_mod_sub per point, but inversely we have to introduce 549 | // an addition (or addition by subtracting a negative number) in 550 | // profanity_end to retrieve the actual x-coordinate instead of the 551 | // delta as that's what is used for calculating the public hash. 552 | // 553 | // One optimization is when calculating the next y-coordinate. As 554 | // given in the wiki the next y-coordinate is given by: 555 | // y_r = λ(x_p - x_r) - y_p 556 | // In our case the other point P is the generator point so x_p = G_x, 557 | // a constant value. x_r is the x-coordinate of the new point which we never calculate, we 558 | // calculate the new delta (x_q - x_p) instead. Let's denote the delta 559 | // with d and new delta as d' and remove notation for points P and Q and 560 | // instead refer to x_p as G_x, y_p as G_y and x_q as x, y_q as y. 561 | // Furthermore let's denote new x by x' and new y with y'. 562 | // 563 | // Then we have: 564 | // d = x - G_x <=> x = d + G_x 565 | // x' = λ² - G_x - x <=> x' = λ² - G_x - d - G_x = λ² - 2G_x - d 566 | // 567 | // d' = x' - G_x = λ² - 2G_x - d - G_x = λ² - 3G_x - d 568 | // 569 | // So we see that the new delta d' can be calculated with the same 570 | // amount of steps as the new x'; 3G_x is still just a single constant. 571 | // 572 | // Now for the next y-coordinate in the new notation: 573 | // y' = λ(G_x - x') - G_y 574 | // 575 | // If we expand the expression (G_x - x') we can see that this 576 | // subtraction can be removed! Saving us one call to mp_mod_sub! 577 | // G_x - x' = -(x' - G_x) = -d' 578 | // It has the same value as the new delta but negated! We can avoid 579 | // having to perform the negation by: 580 | // y' = λ * -d' - G_y = -G_y - (λ * d') 581 | // 582 | // We can just precalculate the constant -G_y and we get rid of one 583 | // subtraction. Woo! 
584 | // 585 | // But we aren't done yet! Let's expand the expression for the next 586 | // lambda, λ'. We have: 587 | // λ' = (y' - G_y) / d' 588 | // = (-λ * d' - G_y - G_y) / d' 589 | // = (-λ * d' - 2*G_y) / d' 590 | // = -λ - 2*G_y / d' 591 | // 592 | // So the next lambda value can be calculated from the old one. This in 593 | // and of itself is not so interesting but the fact that the term -2 * G_y 594 | // is a constant is! Since it's constant it'll be the same value no matter 595 | // which point we're currently working with. This means that this factor 596 | // can be multiplied in during the inversion, and just with one call per 597 | // inversion instead of one call per point! This is small enough to be 598 | // negligible and thus we've reduced our point addition from three 599 | // multi-precision multiplications to just two! Wow. Just wow. 600 | // 601 | // There is additional overhead introduced by storing the previous lambda 602 | // but it's still a net gain. To additionally decrease memory access 603 | // overhead I never any longer store the Y coordinate. Instead I 604 | // calculate it at the end directly from the lambda and deltaX. 605 | // 606 | // In addition to this some algebraic re-ordering has been done to move 607 | // constants into the same argument to a new function mp_mod_sub_const 608 | // in hopes that using constant storage instead of private storage 609 | // will aid speeds. 610 | // 611 | // After the above point addition this kernel calculates the public address 612 | // corresponding to the point and stores it in pInverse which is used only 613 | // as interim storage as it won't otherwise be used again this cycle. 614 | // 615 | // One of the scoring kernels will run after this and fetch the address 616 | // from pInverse. 
__kernel void profanity_iterate(__global mp_number * const pDeltaX, __global mp_number * const pInverse, __global mp_number * const pPrevLambda) {
    const size_t id = get_global_id(0);

    // negativeGx = 0x8641998106234453aa5f9d6a3178f4f8fd640324d231d726a60d7ea3e907e497
    mp_number negativeGx = { {0xe907e497, 0xa60d7ea3, 0xd231d726, 0xfd640324, 0x3178f4f8, 0xaa5f9d6a, 0x06234453, 0x86419981 } };

    ethhash h = { { 0 } };

    mp_number dX = pDeltaX[id];
    mp_number tmp = pInverse[id];
    mp_number lambda = pPrevLambda[id];

    // λ' = - (2G_y) / d' - λ  <=> lambda := pInversedNegativeDoubleGy[id] - pPrevLambda[id]
    mp_mod_sub(&lambda, &tmp, &lambda);

    // λ² = λ * λ <=> tmp := lambda * lambda = λ²
    mp_mod_mul(&tmp, &lambda, &lambda);

    // d' = λ² - d - 3g = (-3g) - (d - λ²) <=> x := tripleNegativeGx - (x - tmp)
    mp_mod_sub(&dX, &dX, &tmp);
    mp_mod_sub_const(&dX, &tripleNegativeGx, &dX);

    pDeltaX[id] = dX;
    pPrevLambda[id] = lambda;

    // Calculate y from dX and lambda
    // y' = (-G_Y) - λ * d' <=> p.y := negativeGy - (p.y * p.x)
    mp_mod_mul(&tmp, &lambda, &dX);
    mp_mod_sub_const(&tmp, &negativeGy, &tmp);

    // Restore X coordinate from delta value (subtracting -G_x adds G_x)
    mp_mod_sub(&dX, &dX, &negativeGx);

    // Initialize Keccak structure with point coordinates in big endian:
    // x first, then y, each most significant word first.
    h.d[0] = bswap32(dX.d[MP_WORDS - 1]);
    h.d[1] = bswap32(dX.d[MP_WORDS - 2]);
    h.d[2] = bswap32(dX.d[MP_WORDS - 3]);
    h.d[3] = bswap32(dX.d[MP_WORDS - 4]);
    h.d[4] = bswap32(dX.d[MP_WORDS - 5]);
    h.d[5] = bswap32(dX.d[MP_WORDS - 6]);
    h.d[6] = bswap32(dX.d[MP_WORDS - 7]);
    h.d[7] = bswap32(dX.d[MP_WORDS - 8]);
    h.d[8] = bswap32(tmp.d[MP_WORDS - 1]);
    h.d[9] = bswap32(tmp.d[MP_WORDS - 2]);
    h.d[10] = bswap32(tmp.d[MP_WORDS - 3]);
    h.d[11] = bswap32(tmp.d[MP_WORDS - 4]);
    h.d[12] = bswap32(tmp.d[MP_WORDS - 5]);
    h.d[13] = bswap32(tmp.d[MP_WORDS - 6]);
    h.d[14] = bswap32(tmp.d[MP_WORDS - 7]);
    h.d[15] = bswap32(tmp.d[MP_WORDS - 8]);
    h.d[16] ^= 0x01; // length 64

    sha3_keccakf(&h);

    // Save public address hash in pInverse, only used as interim storage until next cycle
    // (words 3..7 of the digest, i.e. its last 20 bytes of the first 32)
    pInverse[id].d[0] = h.d[3];
    pInverse[id].d[1] = h.d[4];
    pInverse[id].d[2] = h.d[5];
    pInverse[id].d[3] = h.d[6];
    pInverse[id].d[4] = h.d[7];
}

// Records a hit for `score` if it beats scoreMax.
// The per-score counter is always incremented; only the very first hit for a
// score stores its id and 20-byte hash.
void profanity_result_update(const size_t id, __global const uchar * const hash, __global result * const pResult, const uchar score, const uchar scoreMax) {
    if (score && score > scoreMax) {
        // atomic_inc returns the previous counter value as a full uint. It
        // must not be narrowed: with an 8-bit copy the 257th hit would read
        // as 0 and overwrite the stored result.
        // NOTE: The counter itself still wraps to 0 after MAX(uint) hits.
        const uint previousHits = atomic_inc(&pResult[score].found);

        // Save only one result for each score, the first.
        if (previousHits == 0) {
            pResult[score].foundId = id;

            for (int i = 0; i < 20; ++i) {
                pResult[score].foundHash[i] = hash[i];
            }
        }
    }
}

// Replaces the 20-byte address stored in pInverse[id] with words 3..7 of
// keccak(0xd6, 0x94, address, 0x80).
__kernel void profanity_transform_contract(__global mp_number * const pInverse) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;

    ethhash h;
    for (int i = 0; i < 50; ++i) {
        h.d[i] = 0;
    }
    // set up keccak(0xd6, 0x94, address, 0x80)
    h.b[0] = 214;
    h.b[1] = 148;
    for (int i = 0; i < 20; i++) {
        h.b[i + 2] = hash[i];
    }
    h.b[22] = 128;

    h.b[23] ^= 0x01; // length 23
    sha3_keccakf(&h);

    pInverse[id].d[0] = h.d[3];
    pInverse[id].d[1] = h.d[4];
    pInverse[id].d[2] = h.d[5];
    pInverse[id].d[3] = h.d[6];
    pInverse[id].d[4] = h.d[7];
}

// Benchmark scorer: the score is always 0, so nothing is ever recorded.
__kernel void profanity_score_benchmark(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per byte position whose masked value matches:
// data1 is the per-byte mask (0 = position ignored), data2 the expected value.
__kernel void profanity_score_matching(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        if (data1[i] > 0 && (hash[i] & data1[i]) == data2[i]) {
            ++score;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per leading nibble equal to data1[0], stopping at the first mismatch.
__kernel void profanity_score_leading(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        if ((hash[i] & 0xF0) >> 4 == data1[0]) {
            ++score;
        }
        else {
            break;
        }

        if ((hash[i] & 0x0F) == data1[0]) {
            ++score;
        }
        else {
            break;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per nibble, anywhere in the hash, within [data1[0], data2[0]].
__kernel void profanity_score_range(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        const uchar first = (hash[i] & 0xF0) >> 4;
        const uchar second = (hash[i] & 0x0F);

        if (first >= data1[0] && first <= data2[0]) {
            ++score;
        }

        if (second >= data1[0] && second <= data2[0]) {
            ++score;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per zero byte anywhere in the hash.
__kernel void profanity_score_zerobytes(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        if (hash[i] == 0) {
            score++;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per leading nibble within [data1[0], data2[0]], stopping at the
// first nibble outside the range.
__kernel void profanity_score_leadingrange(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        const uchar first = (hash[i] & 0xF0) >> 4;
        const uchar second = (hash[i] & 0x0F);

        if (first >= data1[0] && first <= data2[0]) {
            ++score;
        }
        else {
            break;
        }

        if (second >= data1[0] && second <= data2[0]) {
            ++score;
        }
        else {
            break;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// Scores how far the hash mirrors around its middle: nibbles are compared
// pairwise walking outwards from the centre, one point per matching pair,
// stopping at the first mismatch.
__kernel void profanity_score_mirror(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 10; ++i) {
        const uchar leftLeft = (hash[9 - i] & 0xF0) >> 4;
        const uchar leftRight = (hash[9 - i] & 0x0F);

        const uchar rightLeft = (hash[10 + i] & 0xF0) >> 4;
        const uchar rightRight = (hash[10 + i] & 0x0F);

        if (leftRight != rightLeft) {
            break;
        }

        ++score;

        if (leftLeft != rightRight) {
            break;
        }

        ++score;
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

// One point per leading byte whose two nibbles are equal (0x00, 0x11, ..., 0xFF),
// stopping at the first byte that is not such a double.
__kernel void profanity_score_doubles(__global mp_number * const pInverse, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, const uchar scoreMax) {
    const size_t id = get_global_id(0);
    __global const uchar * const hash = (__global const uchar *) pInverse[id].d;
    int score = 0;

    for (int i = 0; i < 20; ++i) {
        if ((hash[i] >> 4) == (hash[i] & 0x0F)) {
            ++score;
        }
        else {
            break;
        }
    }

    profanity_result_update(id, hash, pResult, score, scoreMax);
}

--------------------------------------------------------------------------------