├── .gitignore ├── LICENSE ├── Library ├── __init__.py ├── buffer_structs.py ├── opencl.py ├── opencl_information.py ├── passwordutils.py └── worker │ └── generic │ ├── buffer_structs_template.cl │ ├── hash_iterations.cl │ ├── hmac_qualcomm.cl │ ├── md5.cl │ ├── pbkdf2.cl │ ├── pbkdf2_sha1_32.cl │ ├── pbkdf2_sha256_32.cl │ ├── sCrypt.cl │ ├── sCrypt_Bip38fork.cl │ ├── sha1.cl │ ├── sha256.cl │ └── sha512.cl ├── README.md ├── examples └── bruteforce.py ├── requirements.txt └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Bjoern Kerler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Library/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bkerler/opencl_brute/c294a158cd56e32c8a05f88f0bebf89466513015/Library/__init__.py -------------------------------------------------------------------------------- /Library/buffer_structs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | ''' 6 | Provides a class for filling in my buffer_structs_template.cl 7 | ''' 8 | 9 | import os 10 | import re 11 | 12 | # Read the template in 13 | template = "" 14 | with open(os.path.join(os.path.dirname(__file__), "worker","generic","buffer_structs_template.cl"), "r") as rf: 15 | template = rf.read() 16 | 17 | class buffer_structs: 18 | def __init__(self): 19 | self.code = "" 20 | self.wordSize = 4 21 | 22 | def setMaxBufferSizes(self, max_in_bytes, max_out_bytes, max_salt_bytes=32, max_ct_bytes=0, max_pwd_bytes=32): 23 | # Ensure each are a multiple of 4 24 | max_in_bytes += (-max_in_bytes % self.wordSize) 25 | max_out_bytes += (-max_out_bytes % self.wordSize) 26 | max_salt_bytes += (-max_salt_bytes % self.wordSize) 27 | max_pwd_bytes += (-max_pwd_bytes % self.wordSize) 28 | 29 | self.inBufferSize_bytes = max_in_bytes 30 | self.outBufferSize_bytes = max_out_bytes 31 | self.saltBufferSize_bytes = max_salt_bytes 32 | self.pwdBufferSize_bytes = max_pwd_bytes 33 | self.inBufferSize = (max_in_bytes + 3) // self.wordSize 34 | self.outBufferSize = (max_out_bytes + 3) // self.wordSize 35 | self.saltBufferSize = (max_salt_bytes + 3) // self.wordSize 36 | self.pwdBufferSize = (max_pwd_bytes + 3) // self.wordSize 37 | self.ctBufferSize_bytes = max_ct_bytes 38 | 39 | def specifyHashSizes(self, hashBlockSize_bits, hashDigestSize_bits): 40 | self.hashBlockSize_bits = hashBlockSize_bits 41 | self.hashDigestSize_bits = hashDigestSize_bits 42 | 43 | def setBufferSizesForHashing(self, hashMaxNumBlocks): 44 | self.setMaxBufferSizes( ((self.hashBlockSize_bits + 7) // 8) * hashMaxNumBlocks, 45 | (self.hashDigestSize_bits + 7) // 8, 46 | 0) 47 | 48 | def ceilToMult(self, n, k): 49 | return n + ((-n) % k) 50 | 51 | def fill_template(self): 52 | rep = { "": str(self.hashBlockSize_bits), 53 | "" : str(self.hashDigestSize_bits), 54 | "" : str(self.inBufferSize_bytes), 55 | "" : str(self.outBufferSize_bytes), 56 | "" : str(self.saltBufferSize_bytes), 57 | "": str(self.pwdBufferSize_bytes), 58 | "" : str(self.ctBufferSize_bytes), 59 | "" : str(self.wordSize) 60 | } 61 | 62 | rep = dict((re.escape(k), v) for k, v in rep.items()) 63 | pattern = re.compile("|".join(rep.keys())) 64 | self.code = pattern.sub(lambda m: rep[re.escape(m.group(0))], template) 65 | 66 | def specifyMD5(self, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 67 | self.specifyHashSizes(512, 128) 68 | maxNumBlocks = 3 69 | self.wordSize = 4 70 | self.setBufferSizesForHashing(maxNumBlocks) 71 | max_out_bytes = self.hashDigestSize_bits // 8 72 | if dklen!=0: 73 | # Adjust output size to be a multiple of the digest 74 | max_out_bytes = 
self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 75 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 76 | self.fill_template() 77 | return max_out_bytes 78 | 79 | def specifySHA1(self, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 80 | self.specifyHashSizes(512,160) 81 | maxNumBlocks = 3 82 | self.wordSize = 4 83 | self.setBufferSizesForHashing(maxNumBlocks) 84 | max_out_bytes = self.hashDigestSize_bits // 8 85 | if dklen!=0: 86 | # Adjust output size to be a multiple of the digest 87 | max_out_bytes = self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 88 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 89 | self.fill_template() 90 | return max_out_bytes 91 | 92 | def specifySHA2(self, hashDigestSize_bits=256, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 93 | assert hashDigestSize_bits in [224,256,384,512] 94 | hashBlockSize_bits = 512 95 | if hashDigestSize_bits >= 384: 96 | hashBlockSize_bits = 1024 97 | self.specifyHashSizes(hashBlockSize_bits, hashDigestSize_bits) 98 | if hashDigestSize_bits==512: 99 | maxNumBlocks = 2 100 | self.wordSize = 8 101 | else: 102 | maxNumBlocks = 3 103 | self.wordSize = 4 104 | self.setBufferSizesForHashing(maxNumBlocks) 105 | max_out_bytes = self.hashDigestSize_bits // 8 106 | if dklen!=0: 107 | # Adjust output size to be a multiple of the digest 108 | max_out_bytes = self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 109 | 110 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 111 | #bufStructs.setMaxBufferSizes(128, (bufStructs.hashDigestSize_bits * 2) // 8, 128) 112 | self.fill_template() 113 | return max_out_bytes 114 | 115 | ## sha3 not worth the fuss until I write the .cl's -------------------------------------------------------------------------------- /Library/opencl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | import os 6 | from hashlib import pbkdf2_hmac 7 | from binascii import unhexlify 8 | from collections import deque 9 | from itertools import chain, repeat, zip_longest 10 | import numpy as np 11 | import pyopencl as cl 12 | from Library.buffer_structs import buffer_structs 13 | import os, re, sys, inspect 14 | current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 15 | parent_dir = os.path.dirname(current_dir) 16 | 17 | """ 18 | Original copyright: 19 | Copyright by B.Kerler 2017, PBKDF1_SHA1 and SHA256 PyOpenCl implementation, max 32 chars for password + salt 20 | MIT License 21 | Implementation was confirmed to work with 22 | Intel OpenCL on Intel(R) HD Graphics 520 and Intel(R) Core(TM) i5-6200U CPU and GeForce RTX 3080 23 | """ 24 | """ 25 | Adapted for generalising to more hash functions 26 | Allows any length input (efficiently, by declaring the max in advance) 27 | - salt ditched atm, but hoping to restore it 28 | - pbkdf2 forgotten about for now 29 | """ 30 | 31 | 32 | # Corresponding to opencl (CAN'T BE CHANGED): 33 | r = 8 34 | BLOCK_LEN_BYTES = 128 * r 35 | 36 | 37 | # Little helper, (22,5) -> 5,5,5,5,2. 
itertools is bae 38 | def take_in_chunks(n, d): 39 | assert d > 0 and n >= 0 40 | return chain(repeat(d, n // d), filter(lambda x: x != 0, [n % d])) 41 | 42 | 43 | def printif(b, s): 44 | if b: 45 | print(s) 46 | 47 | 48 | class opencl_interface: 49 | debug = False 50 | inv_memory_density = 1 51 | 52 | # Initialiser for the key properties 53 | # pbkdf related initialisation removed, will reappear somewhere else 54 | def __init__(self, platformNum, debug=0, write_combined_file=False, maxWorkgroupSize=60000, inv_memory_density=1, 55 | N_value=15, openclDevice = 0): 56 | self.workgroupsize = 0 57 | self.computeunits = 0 58 | self.wordSize = None 59 | self.N = None 60 | self.wordType = None 61 | printif(debug, "Using Platform %d:" % platformNum) 62 | devices = cl.get_platforms()[platformNum].get_devices() 63 | self.platform_number = platformNum 64 | # Show devices for the platform, and adjust workgroup size 65 | # Create the context for GPU/CPU 66 | # Adjust workgroup size so that we don't run out of RAM: 67 | # As with bench_sCrypt.py, not really working! 68 | self.sworkgroupsize = self.determine_workgroupsize(N_value) 69 | self.inv_memory_density = inv_memory_density 70 | self.ctx = cl.Context(devices) 71 | self.queue = cl.CommandQueue(self.ctx, devices[openclDevice]) 72 | self.debug = debug 73 | 74 | for device in devices: 75 | printif(debug, '--------------------------------------------------------------------------') 76 | printif(debug, ' Device - Name: ' + device.name) 77 | printif(debug, ' Device - Type: ' + cl.device_type.to_string(device.type)) 78 | printif(debug, ' Device - Compute Units: {0}'.format(device.max_compute_units)) 79 | printif(debug, ' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 80 | printif(debug, ' Device - Global memory size: {}'.format(device.global_mem_size)) 81 | printif(debug, ' Device - Local memory size: {}'.format(device.local_mem_size)) 82 | printif(debug, ' Device - Max clock frequency: {} MHz'.format(device.max_clock_frequency)) 83 | 84 | assert device.endian_little == 1, "DEVICE is not little endian : pretty sure we rely on this!" 85 | if self.workgroupsize == 0: 86 | self.workgroupsize = maxWorkgroupSize 87 | self.workgroupsize = min(self.workgroupsize, device.max_work_group_size) 88 | else: 89 | self.workgroupsize = min(self.workgroupsize, device.max_work_group_size) 90 | 91 | if self.computeunits == 0: 92 | self.computeunits = device.max_compute_units 93 | else: 94 | self.computeunits = min(self.computeunits, device.max_compute_units) 95 | 96 | # if device.max_work_group_size= 2.0: 103 | os.environ['PYOPENCL_BUILD_OPTIONS'] = "-cl-std=CL1.2" 104 | 105 | printif(debug, "\nUsing work group size of %d\n" % self.workgroupsize) 106 | 107 | # Set the debug flags 108 | os.environ['PYOPENCL_COMPILER_OUTPUT'] = str(debug) 109 | self.write_combined_file = write_combined_file 110 | 111 | def compile(self, bufferStructsObj, library_file, footer_file=None, N=15, invMemoryDensity=2): 112 | assert type(N) == int 113 | assert N < 20, "N >= 20 won't fit in a single buffer, so is unsupported. " + \ 114 | "Nothing sane should use 20, is this wickr?" 
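        # Sizing sketch behind the assert above (informational, derived from the constants in this
        # file): for the sCrypt kernels, each work-item's V array is (2**N) * BLOCK_LEN_BYTES
        # = (2**N) * 128 * r bytes with r fixed at 8, i.e. (2**N) KiB. At N = 20 that is already
        # 1 GiB per core, so a single 2**31-byte device buffer could hold at most a couple of
        # cores' worth of V -- hence the cap.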
115 | self.N = N 116 | assert bufferStructsObj is not None, "need to supply a bufferStructsObj : set all to 0 if necessary" 117 | assert bufferStructsObj.code is not None, "bufferStructsObj should be initialised" 118 | bufStructs = bufferStructsObj 119 | self.wordSize = bufStructs.wordSize 120 | 121 | # set the np word type, for use in .run 122 | npType = { 123 | 4: np.uint32, 124 | 8: np.uint64, 125 | } 126 | self.wordType = npType[self.wordSize] 127 | 128 | if footer_file != None: 129 | src = bufStructs.code 130 | else: 131 | src = "" 132 | if library_file: 133 | with open(os.path.join(current_dir, "worker", "generic", library_file), "r") as rf: 134 | src += rf.read() 135 | 136 | if footer_file: 137 | with open(os.path.join(current_dir, "worker", "generic", footer_file), "r") as rf: 138 | src += "\n" + rf.read() 139 | 140 | # Standardise to using no \r's, move to bytes to stop trickery 141 | src = src.encode("ascii") 142 | src = src.replace(b"\r\n", b"\n") 143 | 144 | # Debugging 145 | if self.write_combined_file: 146 | with open("combined_" + library_file, "wb") as wf: 147 | wf.write(src) 148 | 149 | # Convert back to text! 150 | src = src.decode("ascii") 151 | 152 | # Check that it starts with 2 newlines, for adding our defines 153 | if src.startswith("\n\n"): 154 | src = "\n\n" + src 155 | src = src[len("\n\n"):] 156 | # Prepend define N and invMemoryDensity 157 | defines = "#define N {}\n#define invMemoryDensity {}\n".format(N, invMemoryDensity) 158 | src = defines + src 159 | 160 | # Kernel function instantiation. Build returns self. 161 | prg = cl.Program(self.ctx, src).build() 162 | return prg 163 | 164 | # Forms the input buffer of derived keys 165 | # Returns the buffer and number in the buffer, <= n (iter may be exhausted) 166 | def make_input_buffer(self, dkIter, n): 167 | inpArray = bytearray() 168 | numEaten = n 169 | 170 | for i in range(n): 171 | try: 172 | dk = dkIter.__next__() 173 | except StopIteration: 174 | # Correct the chunk size and break 175 | numEaten = i 176 | break 177 | 178 | assert len(dk) == BLOCK_LEN_BYTES 179 | # , "Derived key input is length {}, when we expected {}".format(len(dk), BLOCK_LEN_BYTES) 180 | 181 | inpArray.extend(dk) 182 | 183 | # pyopencl doesn't like empty buffers, so just cheer it up 184 | # (making the buffer larger isn't an issue) 185 | if len(inpArray) == 0: 186 | inpArray = b"\x00" 187 | 188 | inp_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=inpArray) 189 | 190 | return inp_g, numEaten 191 | 192 | def run(self, bufStructs, func, pwdIter, salt=b"", paddedLenFunc=None, rtnPwds=None): 193 | # PaddedLenFunc is just for checking: lower bound with original length if not supplied 194 | wordType=self.wordType 195 | wordSize=self.wordSize 196 | ctx=self.ctx 197 | queue=self.queue 198 | hashBlockSize_bits=bufStructs.hashBlockSize_bits 199 | if not paddedLenFunc: 200 | paddedLenFunc = lambda x, bs: x 201 | 202 | # Checks on password list : not possible now we have iters! 
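        # Worked example of the padding check further down (informational): with the module-level
        # mdpad_64_func supplied as paddedLenFunc and a 64-byte hash block,
        #   mdpad_64_func(55, 64) == 64    # 55 bytes + 1 pad byte + 8 length bytes fit one block
        #   mdpad_64_func(56, 64) == 128   # one byte more spills into a second block
        # so a password is only rejected if its *padded* length exceeds inBufferSize_bytes.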
203 | 204 | inBufSize_bytes = bufStructs.inBufferSize_bytes 205 | outBufSize_bytes = bufStructs.outBufferSize_bytes 206 | outBufferSize = bufStructs.outBufferSize 207 | saltBufferSize_bytes = bufStructs.saltBufferSize_bytes 208 | 209 | # Main loop is taking chunks of at most the workgroup size 210 | while True: 211 | # Moved to bytearray initially, avoiding copying and above all 212 | # 'np.append' which is horrific 213 | pwArray = bytearray() 214 | 215 | # For each password in our chunk, process it into pwArray, with length first 216 | # Notice that this lines up with the struct declared in the .cl file! 217 | chunkSize = self.workgroupsize 218 | for i in range(self.workgroupsize): 219 | try: 220 | pw = pwdIter.__next__() 221 | # Since we take a iterator, we feed the passwords back if requested 222 | if rtnPwds is not None: 223 | rtnPwds.append(pw) 224 | except StopIteration: 225 | # Correct the chunk size and break 226 | chunkSize = i 227 | break 228 | 229 | pwLen = len(pw) 230 | # Now passing hash block size as a parameter.. could be None? 231 | assert paddedLenFunc(pwLen, hashBlockSize_bits // 8) <= inBufSize_bytes, \ 232 | "password #" + str(i) + ", '" + pw.decode() + "' (length " + str( 233 | pwLen) + ") exceeds the input buffer (length " + str(inBufSize_bytes) + ") when padded" 234 | 235 | # Add the length to our pwArray, then pad with 0s to struct size 236 | # prev code was np.array([pwLen], dtype=np.uint32), this ultimately is equivalent 237 | pwArray.extend(pwLen.to_bytes(wordSize, 'little')+pw+(b"\x00"* (inBufSize_bytes - pwLen))) 238 | 239 | if chunkSize == 0: 240 | break 241 | # print("Chunksize = {}".format(chunkSize)) 242 | 243 | # Convert the pwArray into a numpy array, just the once. 244 | # Declare the numpy array for the digest output 245 | pwArray = np.frombuffer(pwArray, dtype=wordType) 246 | result = np.zeros(outBufferSize * chunkSize, dtype=wordType) 247 | 248 | # Make the salty array, with length at the front 249 | saltLen = len(salt) 250 | saltArray = bytearray(saltLen.to_bytes(wordSize, 'little')+salt+(b"\x00" * (saltBufferSize_bytes - saltLen))) 251 | saltArray = np.frombuffer(saltArray, dtype=wordType) 252 | assert saltArray.nbytes - wordSize == saltBufferSize_bytes, "Salt doesn't fit in the " \ 253 | "buffer! " 254 | 255 | # Allocate memory for variables on the device 256 | pass_g = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=pwArray) 257 | salt_g = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=saltArray) 258 | result_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, result.nbytes) 259 | 260 | # print("=========== Initial buffers ==============") 261 | # print(" pass_g.nbytes = {}".format(pwArray.nbytes)) 262 | # print(" salt_g.nbytes = {}".format(saltArray.nbytes)) 263 | # print(" result_g.nbytes = {}".format(result.nbytes)) 264 | 265 | # Call Kernel. Automatically takes care of block/grid distribution 266 | pwdim = (chunkSize,) 267 | 268 | # Main function callback : could adapt to pass further data 269 | func(self, pwdim, pass_g, salt_g, result_g) 270 | # self.prg.hmac_main(self.queue, pwdim, None, pass_g, salt_g, result_g) 271 | 272 | # Read the results back into our array of int32s, then hexlify 273 | # Some inefficiency here, unavoidable using hexlify 274 | cl.enqueue_copy(queue, result, result_g) 275 | 276 | # Chop up into the individual hash digests, then trim to necessary hash length. 
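        # result is a flat numpy array of wordType words; each digest occupies
        # outBufSize_bytes // wordSize of those words, and bytes() on a contiguous slice
        # returns the raw bytes in host order (the constructor asserts the device is
        # little endian, so no byte swapping is needed here).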
277 | 278 | # Yield this block of results 279 | yield [bytes(result[i:i + outBufSize_bytes // wordSize]) 280 | for i in range(0, len(result), outBufSize_bytes // wordSize)] 281 | 282 | # No main return 283 | return None 284 | 285 | def run_saltlist(self, bufStructs, func, saltIter, password = b"", paddedLenFunc=None, rtnSalts=None): 286 | # PaddedLenFunc is just for checking: lower bound with original length if not supplied 287 | wordType=self.wordType 288 | wordSize=self.wordSize 289 | ctx=self.ctx 290 | queue=self.queue 291 | hashBlockSize_bits=bufStructs.hashBlockSize_bits 292 | if not paddedLenFunc: 293 | paddedLenFunc = lambda x, bs: x 294 | 295 | # Checks on password list : not possible now we have iters! 296 | 297 | inBufSize_bytes = bufStructs.inBufferSize_bytes 298 | outBufSize_bytes = bufStructs.outBufferSize_bytes 299 | outBufferSize = bufStructs.outBufferSize 300 | 301 | # Main loop is taking chunks of at most the workgroup size 302 | while True: 303 | # Moved to bytearray initially, avoiding copying and above all 304 | # 'np.append' which is horrific 305 | saltArray = bytearray() 306 | 307 | # For each password in our chunk, process it into pwArray, with length first 308 | # Notice that this lines up with the struct declared in the .cl file! 309 | chunkSize = self.workgroupsize 310 | for i in range(self.workgroupsize): 311 | try: 312 | salt = saltIter.__next__() 313 | # Since we take a iterator, we feed the passwords back if requested 314 | if rtnSalts != None: 315 | rtnSalts.append(salt) 316 | except StopIteration: 317 | # Correct the chunk size and break 318 | chunkSize = i 319 | break 320 | 321 | saltLen = len(salt) 322 | # Now passing hash block size as a parameter.. could be None? 323 | assert paddedLenFunc(saltLen, hashBlockSize_bits // 8) <= inBufSize_bytes, \ 324 | "salt #" + str(i) + ", '" + salt.decode() + "' (length " + str( 325 | saltLen) + ") exceeds the input buffer (length " + str(inBufSize_bytes) + ") when padded" 326 | 327 | # Add the length to our saltLen, then pad with 0s to struct size 328 | # prev code was np.array([saltLen], dtype=np.uint32), this ultimately is equivalent 329 | saltArray.extend((saltLen).to_bytes(self.wordSize, 'little')) 330 | saltArray.extend(salt) 331 | saltArray.extend([0] * (inBufSize_bytes - saltLen)) 332 | 333 | if chunkSize == 0: 334 | break 335 | # print("Chunksize = {}".format(chunkSize)) 336 | 337 | # Convert the pwArray into a numpy array, just the once. 338 | # Declare the numpy array for the digest output 339 | saltArray = np.frombuffer(saltArray, dtype=self.wordType) 340 | result = np.zeros(bufStructs.outBufferSize * chunkSize, dtype=self.wordType) 341 | 342 | # Make the salty array, with length at the front 343 | pwLen = len(password) 344 | pwArray = bytearray((pwLen).to_bytes(self.wordSize, 'little')) 345 | pwArray.extend(password) 346 | ##saltArray.extend(b"\x00" * ((-saltLen) % 4)) 347 | pwArray.extend(b"\x00" * (bufStructs.pwdBufferSize_bytes - pwLen)) 348 | pwArray = np.frombuffer(pwArray, dtype=self.wordType) 349 | assert pwArray.nbytes - self.wordSize == bufStructs.pwdBufferSize_bytes, "Salt doesn't fit in the buffer!" 
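        # Layout note: pwArray mirrors the pwdbuf struct in buffer_structs_template.cl --
        # one word holding the length in bytes, then the password itself, zero-padded out to
        # pwdBufferSize_bytes; the assert above checks exactly that total size.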
350 | 351 | # Allocate memory for variables on the device 352 | pass_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=pwArray) 353 | salt_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=saltArray) 354 | result_g = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, result.nbytes) 355 | 356 | # print("=========== Initial buffers ==============") 357 | # print(" pass_g.nbytes = {}".format(pwArray.nbytes)) 358 | # print(" salt_g.nbytes = {}".format(saltArray.nbytes)) 359 | # print(" result_g.nbytes = {}".format(result.nbytes)) 360 | 361 | # Call Kernel. Automatically takes care of block/grid distribution 362 | pwdim = (chunkSize,) 363 | 364 | # Main function callback : could adapt to pass further data 365 | func(self, pwdim, pass_g, salt_g, result_g) 366 | ##self.prg.hmac_main(self.queue, pwdim, None, pass_g, salt_g, result_g) 367 | 368 | # Read the results back into our array of int32s, then hexlify 369 | # Some inefficiency here, unavoidable using hexlify 370 | cl.enqueue_copy(self.queue, result, result_g) 371 | 372 | # hexvalue = hexlify(result) 373 | 374 | # Chop up into the individual hash digests, then trim to necessary hash length. 375 | results = [] 376 | # for i in range(0, len(hexvalue), outBufSize_hs): 377 | # hexRes = hexvalue[i:i + outBufSize_hs].decode() 378 | # results.append(hexRes) 379 | 380 | for i in range(0, len(result), outBufSize_bytes // bufStructs.wordSize): 381 | v = bytes(result[i:i + outBufSize_bytes // bufStructs.wordSize]) 382 | results.append(v) 383 | 384 | # Yield this block of results 385 | yield results 386 | 387 | # No main return 388 | return None 389 | 390 | def determine_workgroupsize(self, N_value=15): 391 | devices = cl.get_platforms()[self.platform_number].get_devices() 392 | wgSize = 0 393 | for device in devices: 394 | # Actually adjust based on invMemoryDensity! 395 | N_blocks_bytes = (1 << N_value) * BLOCK_LEN_BYTES // self.inv_memory_density 396 | memoryForOneCore = BLOCK_LEN_BYTES * 2 + N_blocks_bytes # input, output & V 397 | 398 | ## ! Restrict to 98% of avaiable memory 399 | coresOnDevice = (int(0.98 * device.global_mem_size) // memoryForOneCore) 400 | percentUsage = 100 * memoryForOneCore * coresOnDevice / device.global_mem_size 401 | percentUsage = str(percentUsage)[:4] 402 | if self.debug == 1: 403 | print("Using {} cores on device with global memory {}, = {}%".format( 404 | coresOnDevice, device.global_mem_size, percentUsage 405 | )) 406 | wgSize += coresOnDevice 407 | 408 | if self.debug == 1: 409 | print("Workgroup size determined as {}".format(wgSize)) 410 | 411 | return wgSize 412 | 413 | def run_scrypt(self, sprg, kernelCall, dkIter): 414 | N_blocks_bytes = (1 << self.N) * BLOCK_LEN_BYTES 415 | 416 | # no. of cores' memory that we can fit into a single buffer 417 | # (seemingly anyway, why isn't it 2^31?) 418 | # note: this is NOT the workgroupsize, nor does it bound it 419 | maxGangSize = (1 << 31) // N_blocks_bytes 420 | assert maxGangSize > 0, "Uh-oh we couldn't fit a single core's V in a buffer." 421 | 422 | # A. Before the loop we produce our huge buffers, once only. 423 | # B. Also make our output buffers & numpys now, just once, to save work 424 | # Note these will be atleast big enough throughout the loop: sometimes they'll have extra room. 
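        # Example of the gang split below (values per the take_in_chunks helper at the top of
        # this file): with sworkgroupsize = 22 and maxGangSize = 5 the chunks are 5, 5, 5, 5, 2,
        # so five V buffers and five output buffers are created, each sized for its own gang.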
425 | largeBuffers = [] 426 | outBuffers = [] 427 | outNumpys = [] 428 | outSizes = [] 429 | for gangSize in take_in_chunks(self.sworkgroupsize, maxGangSize): 430 | # Produce the large buffer for storing this gang's V arrays 431 | # No longer producing a big bytes object in Python 432 | 433 | # arr = np.frombuffer(bytes(gangSize * N_blocks_bytes), dtype=np.uint32) 434 | # Why is this read only? 435 | arr_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=gangSize * N_blocks_bytes) 436 | largeBuffers.append(arr_g) 437 | 438 | # Produce the gang's output buffer and (small) numpy array to copy out to 439 | nBytes = BLOCK_LEN_BYTES * gangSize 440 | result = np.zeros(nBytes // 4, dtype=np.uint32) 441 | assert nBytes == result.nbytes 442 | result_g = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, nBytes) 443 | outBuffers.append(result_g) 444 | outNumpys.append(result) 445 | 446 | # No output from round 0! 447 | outSizes.append(0) 448 | 449 | # ! For minimal latency, we only block just before our next calls to the kernels: 450 | # there is basically no work between the two. 451 | 452 | # Main loop is taking chunks of workgroup size, 453 | # or less if we exhaust the input derived keys iter 454 | iterActive = True 455 | while iterActive: 456 | # 1. Make New SMALL input buffers (derived key buffer, was password & salt) 457 | # if we exhaust dkIter, continue producing 'empty' input buffers, but mark to leave the main loop 458 | newInputs = [] 459 | inCounts = [] 460 | for gangSize in take_in_chunks(self.sworkgroupsize, maxGangSize): 461 | input_g, numEaten = self.make_input_buffer(dkIter, gangSize) 462 | iterActive = (numEaten == gangSize) # note gangSize > 0, so once False this will persist 463 | newInputs.append(input_g) 464 | inCounts.append(numEaten) 465 | 466 | # 2. (BLOCKING) wait for all our workers to finish (should be at similar times), 467 | # and copy output buffers out to numpy (minimal time loss here, could use 468 | # 2 sets of output buffers instead) 469 | # Note we may well have copied too much: this is dealt with in 4. below 470 | for outSize, outNumpy, outBuf in zip_longest(outSizes, outNumpys, outBuffers): 471 | if outSize > 0: 472 | cl.enqueue_copy(self.queue, outNumpy, outBuf) # is_blocking defaults to true :) 473 | 474 | # print("Calling kernels..") 475 | # 3. (NON-BLOCKING) queue the kernel calls 476 | for input_g, arr_g, result_g, inCount in zip_longest(newInputs, largeBuffers, outBuffers, inCounts): 477 | if inCount > 0: 478 | dim = (inCount,) 479 | # print("inCount = {}".format(inCount)) 480 | # print("All sizes in bytes (hopefully):") 481 | # print("input_g.size = {}".format(input_g.size)) 482 | # print("arr_g.size = {}".format(arr_g.size)) 483 | # print("result_g.size = {}".format(result_g.size)) 484 | # print("\nOpenCL code now:\n") 485 | kernelCall(sprg, (self.queue, dim, None, input_g, arr_g, result_g)) 486 | # print("Kernels running..") 487 | 488 | # 4. 
Process the outputs from the last round, yielding now (while the GPUs are busy) 489 | # Also copy the input counts across to output sizes, for the next loop / final processing below 490 | for i, (outNumpy, inCount) in enumerate(zip_longest(outNumpys, inCounts)): 491 | outSize = outSizes[i] 492 | 493 | assert outSize % BLOCK_LEN_BYTES == 0 494 | outBytes = outNumpy.tobytes() 495 | for j in range(0, outSize, BLOCK_LEN_BYTES): 496 | yield outBytes[j:j + BLOCK_LEN_BYTES] 497 | 498 | outSizes[i] = inCount * BLOCK_LEN_BYTES 499 | 500 | # Note that if exiting here then we've updated the outSizes & called the functions 501 | # Just remains to capture & process the output.. 502 | 503 | # print("Dropped out of loop") 504 | # Do a final loop of processing output (3 & 2) 505 | for outBuf, outNumpy, outSize in zip_longest(outBuffers, outNumpys, outSizes): 506 | # (BLOCKING) Copy! 507 | cl.enqueue_copy(self.queue, outNumpy, outBuf) 508 | 509 | assert outSize % BLOCK_LEN_BYTES == 0 510 | outBytes = outNumpy.tobytes() 511 | for i in range(0, outSize, BLOCK_LEN_BYTES): 512 | yield outBytes[i:i + BLOCK_LEN_BYTES] 513 | 514 | 515 | def mdpad_128_func(pwdLen, blockSize): 516 | llen = (pwdLen + 1 + 16) 517 | llen += (-llen) % blockSize 518 | return llen 519 | 520 | 521 | def mdpad_64_func(pwdLen, blockSize): 522 | # both parameters in bytes 523 | # length appended as a 64-bit integer 524 | llen = (pwdLen + 1 + 8) 525 | llen += (-llen) % blockSize 526 | return llen 527 | 528 | 529 | def concat(ll): 530 | return [obj for lval in ll for obj in lval] 531 | 532 | 533 | class opencl_algos: 534 | def __init__(self, platform, debug, write_combined_file, inv_memory_density=1, openclDevice = 0): 535 | if not debug: 536 | debug = 0 537 | self.opencl_ctx = opencl_interface(platform, debug, write_combined_file, openclDevice = openclDevice) 538 | self.platform_number = platform 539 | self.inv_memory_density = inv_memory_density 540 | self.max_out_bytes=0 541 | 542 | def cl_scrypt_init(self, N_value=15, forceAltKernel = None): 543 | # Initialise the openCL context & compile, with both debugging settings off 544 | debug = 0 545 | bufStructs = buffer_structs() 546 | if forceAltKernel: 547 | print("Loading Alternative sCrypt Kernel:", forceAltKernel) 548 | sprg = self.opencl_ctx.compile(bufStructs, forceAltKernel, None, N=N_value, invMemoryDensity=self.inv_memory_density) 549 | else: 550 | sprg=self.opencl_ctx.compile(bufStructs, "sCrypt.cl", None, N=N_value, invMemoryDensity=self.inv_memory_density) 551 | return [sprg,bufStructs] 552 | 553 | def cl_scrypt(self, ctx, passwords, N_value=15, r_value=3, p_value=1, desired_key_length=32, 554 | hex_salt=unhexlify("DEADBEEFDEADBEEFDEADBEEFDEADBEEF")): 555 | 556 | def get_dk_iter(p, salt, pwdIter, rtnPwds=None): 557 | # r fixed as 8 in the OpenCL 558 | r_val = 8 559 | blockSize = 128 * r_val 560 | for pwd in pwdIter: 561 | if rtnPwds is not None: 562 | rtnPwds.append(pwd) 563 | # Get derived key, then split into p chunks and yield 564 | dk = pbkdf2_hmac("sha256", pwd, salt, 1, dklen=blockSize * p) 565 | 566 | # Yield 567 | for i in range(p): 568 | yield dk[i * blockSize: (i + 1) * blockSize] 569 | 570 | sprg = ctx[0] 571 | 572 | # Our callback with the kernel name 573 | # Debugging: calls Salsa20 574 | def kernel_call(snprg, params): 575 | return snprg.ROMix(*params) # prg.ROMix(*params) 576 | 577 | # Derived key iter: yields p keys for each password 578 | passwordList = deque() 579 | dkIter = get_dk_iter(1 << p_value, hex_salt, passwords, passwordList) 580 | 581 | result = [] 582 | # 
Main call. 583 | group = [] 584 | result_append = result.append 585 | passwordList_popleft = passwordList.popleft 586 | for singleOutput in self.opencl_ctx.run_scrypt(sprg, kernel_call, dkIter): 587 | group.append(singleOutput) 588 | if len(group) == 1 << p_value: 589 | expensiveSalt = b"".join(group) 590 | commonPwd = passwordList_popleft() 591 | sCryptResult = pbkdf2_hmac("sha256", commonPwd, expensiveSalt, 1, desired_key_length) 592 | 593 | # For now print out for debugging 594 | # print("Password={}".format(commonPwd)) 595 | # print("sCrypt={}".format(hexlify(sCryptResult).decode().upper())) 596 | # result.append("{}".format(hexlify(sCryptResult).decode().lower())) 597 | result_append(sCryptResult) 598 | group = [] 599 | return result 600 | 601 | # def mdPadLenFunc(self, pwdLen): 602 | # l = (pwdLen + 1 + 8) 603 | # l += (64 - (l % 64)) % 64 604 | # return l 605 | 606 | def cl_sha512_init(self, option="", max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0): 607 | bufStructs = buffer_structs() 608 | bufStructs.specifySHA2(512, max_in_bytes, max_salt_bytes, dklen, max_ct_bytes) 609 | assert bufStructs.wordSize == 8 # set when you specify sha512 610 | prg = self.opencl_ctx.compile(bufStructs, 'sha512.cl', option) 611 | return [prg, bufStructs] 612 | 613 | def cl_sha512(self, ctx, passwordlist): 614 | # self.cl_sha512_init() 615 | prg = ctx[0] 616 | bufStructs = ctx[1] 617 | 618 | def func(s, pwdim, pass_g, salt_g, result_g): 619 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 620 | 621 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_128_func)) 622 | 623 | def cl_sha256_init(self, option="", max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0): 624 | bufStructs = buffer_structs() 625 | bufStructs.specifySHA2(256, max_in_bytes, max_salt_bytes, dklen, max_ct_bytes) 626 | assert bufStructs.wordSize == 4 # set when you specify sha256 627 | prg = self.opencl_ctx.compile(bufStructs, 'sha256.cl', option) 628 | return [prg, bufStructs] 629 | 630 | def cl_sha256(self, ctx, passwordlist): 631 | # self.cl_sha256_init() 632 | prg = ctx[0] 633 | bufStructs = ctx[1] 634 | 635 | def func(s, pwdim, pass_g, salt_g, result_g): 636 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 637 | 638 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 639 | 640 | def cl_md5_init(self, option=""): 641 | bufStructs = buffer_structs() 642 | bufStructs.specifyMD5() 643 | assert bufStructs.wordSize == 4 # set when you specify md5 644 | prg = self.opencl_ctx.compile(bufStructs, 'md5.cl', option) 645 | return [prg, bufStructs] 646 | 647 | def cl_md5(self, ctx, passwordlist): 648 | # self.cl_md5_init() 649 | prg = ctx[0] 650 | bufStructs = ctx[1] 651 | 652 | def func(s, pwdim, pass_g, salt_g, result_g): 653 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 654 | 655 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 656 | 657 | def cl_sha1_init(self, option=""): 658 | bufStructs = buffer_structs() 659 | bufStructs.specifySHA1() 660 | assert bufStructs.wordSize == 4 # set when you specify sha1 661 | prg = self.opencl_ctx.compile(bufStructs, 'sha1.cl', option) 662 | return [prg, bufStructs] 663 | 664 | def cl_sha1(self, ctx, passwordlist): 665 | # self.cl_sha1_init() 666 | prg = ctx[0] 667 | bufStructs = ctx[1] 668 | 669 | def func(s, pwdim, pass_g, salt_g, result_g): 670 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 671 | 672 | return 
concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 673 | 674 | # =========================================================================================== 675 | 676 | def cl_hmac(self, ctx, passwordlist, salt): 677 | prg = ctx[0] 678 | bufStructs = ctx[1] 679 | 680 | def func(s, pwdim, pass_g, salt_g, result_g): 681 | prg.hmac_main(s.queue, pwdim, None, pass_g, salt_g, result_g) 682 | 683 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), salt)) 684 | 685 | def cl_md5_hmac(self, ctx, passwordlist, salt): 686 | # self.cl_md5_init("pbkdf2.cl") 687 | return self.cl_hmac(ctx, passwordlist, salt) 688 | 689 | def cl_sha1_hmac(self, ctx, passwordlist, salt): 690 | # self.cl_sha1_init("pbkdf2.cl") 691 | return self.cl_hmac(ctx, passwordlist, salt) 692 | 693 | def cl_sha256_hmac(self, ctx, passwordlist, salt): 694 | # self.cl_sha256_init("pbkdf2.cl") 695 | return self.cl_hmac(ctx, passwordlist, salt) 696 | 697 | def cl_sha512_hmac(self, ctx, passwordlist, salt): 698 | # self.cl_sha512_init("pbkdf2.cl") 699 | return self.cl_hmac(ctx, passwordlist, salt) 700 | 701 | # =========================================================================================== 702 | 703 | def cl_pbkdf2(self, ctx, passwordlist, salt, iters, dklen): 704 | prg = ctx[0] 705 | bufStructs = ctx[1] 706 | 707 | def func(s, pwdim, pass_g, salt_g, result_g): 708 | prg.pbkdf2(s.queue, pwdim, None, pass_g, salt_g, result_g, 709 | iters.to_bytes(4, 'little'), dklen.to_bytes(4, 'little')) # ! iters, dklen are always ints 710 | 711 | result = concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), salt)) 712 | if dklen != self.max_out_bytes: 713 | # We may have made more space for a multiple of the digest size 714 | result = [hexRes[:dklen] for hexRes in result] 715 | return result 716 | 717 | def cl_pbkdf2_init(self, rtype, saltlen, dklen): 718 | bufStructs = buffer_structs() 719 | if rtype == "md5": 720 | self.max_out_bytes = bufStructs.specifyMD5(128, saltlen, dklen) 721 | # hmac is defined in with pbkdf2, as a kernel function 722 | prg = self.opencl_ctx.compile(bufStructs, "md5.cl", "pbkdf2.cl") 723 | elif rtype == "sha1": 724 | if saltlen < 32 and dklen < 32: 725 | dklen=32 726 | self.max_out_bytes = bufStructs.specifySHA1(32, saltlen, dklen) 727 | prg = self.opencl_ctx.compile(bufStructs, "pbkdf2_sha1_32.cl", None) 728 | else: 729 | self.max_out_bytes = bufStructs.specifySHA1(128, saltlen, dklen) 730 | prg = self.opencl_ctx.compile(bufStructs, "sha1.cl", "pbkdf2.cl") 731 | elif rtype == "sha256": 732 | if saltlen <= 64 and dklen <= 64: 733 | dklen=64 734 | self.max_out_bytes = bufStructs.specifySHA2(256, 128, saltlen, dklen) 735 | if saltlen <= 64 and dklen <= 64: 736 | prg = self.opencl_ctx.compile(bufStructs, "pbkdf2_sha256_32.cl", None) 737 | else: 738 | prg = self.opencl_ctx.compile(bufStructs, "sha256.cl", "pbkdf2.cl") 739 | elif rtype == "sha512": 740 | self.max_out_bytes = bufStructs.specifySHA2(512, 256, saltlen, dklen) 741 | prg = self.opencl_ctx.compile(bufStructs, "sha512.cl", "pbkdf2.cl") 742 | else: 743 | assert ("Error on hash type, unknown !!!") 744 | return [prg, bufStructs] 745 | 746 | # =========================================================================================== 747 | 748 | def cl_pbkdf2_saltlist(self, ctx, password, saltlist, iters, dklen): 749 | prg = ctx[0] 750 | bufStructs = ctx[1] 751 | def func(s, pwdim, pass_g, salt_g, result_g): 752 | prg.pbkdf2_saltlist(s.queue, pwdim, None, pass_g, salt_g, result_g, 753 | 
(iters).to_bytes(4, 'little'), (dklen).to_bytes(4, 'little')) # ! iters, dklen are always ints 754 | 755 | result = concat(self.opencl_ctx.run_saltlist(bufStructs, func, iter(saltlist), password)) 756 | if dklen != self.max_out_bytes: 757 | # We may have made more space for a multiple of the digest size 758 | result = [hexRes[:dklen] for hexRes in result] 759 | return result 760 | 761 | def cl_pbkdf2_saltlist_init(self, type, pwdlen, dklen): 762 | bufStructs = buffer_structs() 763 | if type == "md5": 764 | self.max_out_bytes = bufStructs.specifyMD5(max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 765 | ## hmac is defined in with pbkdf2, as a kernel function 766 | prg=self.opencl_ctx.compile(bufStructs, "md5.cl", "pbkdf2.cl") 767 | elif type == "sha1": 768 | self.max_out_bytes = bufStructs.specifySHA1(max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 769 | ## hmac is defined in with pbkdf2, as a kernel function 770 | prg=self.opencl_ctx.compile(bufStructs, "sha1.cl", "pbkdf2.cl") 771 | elif type == "sha256": 772 | self.max_out_bytes = bufStructs.specifySHA2(hashDigestSize_bits=256, max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 773 | prg=self.opencl_ctx.compile(bufStructs, "sha256.cl", "pbkdf2.cl") 774 | elif type == "sha512": 775 | self.max_out_bytes = bufStructs.specifySHA2(hashDigestSize_bits=512, max_in_bytes=256, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 776 | prg=self.opencl_ctx.compile(bufStructs, "sha512.cl", "pbkdf2.cl") 777 | else: 778 | assert ("Error on hash type, unknown !!!") 779 | return [prg, bufStructs] 780 | 781 | # =========================================================================================== 782 | 783 | def cl_hash_iterations(self, ctx, passwordlist, iters, hash_size): 784 | prg = ctx[0] 785 | bufStructs = ctx[1] 786 | def func(s, pwdim, pass_g, salt_g, result_g): 787 | prg.hash_iterations(s.queue, pwdim, None, pass_g, result_g, iters.to_bytes(4, 'little'), hash_size.to_bytes(4, 'little')) # ! iters are always ints 788 | 789 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 790 | 791 | def cl_hash_iterations_init(self, type): 792 | bufStructs = buffer_structs() 793 | if type == "md5": 794 | self.max_out_bytes = bufStructs.specifyMD5() 795 | ## hmac is defined in with pbkdf2, as a kernel function 796 | prg=self.opencl_ctx.compile(bufStructs, "md5.cl", "hash_iterations.cl") 797 | elif type == "sha1": 798 | self.max_out_bytes = bufStructs.specifySHA1() 799 | ## hmac is defined in with pbkdf2, as a kernel function 800 | prg=self.opencl_ctx.compile(bufStructs, "sha1.cl", "hash_iterations.cl") 801 | elif type == "sha256": 802 | self.max_out_bytes = bufStructs.specifySHA2() 803 | prg=self.opencl_ctx.compile(bufStructs, "sha256.cl", "hash_iterations.cl") 804 | elif type == "sha512": 805 | self.max_out_bytes = bufStructs.specifySHA2(512, 256, 0, 64) 806 | prg=self.opencl_ctx.compile(bufStructs, "sha512.cl", "hash_iterations.cl") 807 | else: 808 | assert ("Error on hash type, unknown !!!") 809 | return [prg, bufStructs] 810 | -------------------------------------------------------------------------------- /Library/opencl_information.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. 
Kerler 2017-2021 4 | # MIT License 5 | ''' 6 | Original copyright: 7 | Copyright by B.Kerler 2017, PBKDF1_SHA1 and SHA256 PyOpenCl implementation, max 32 chars for password + salt 8 | MIT License 9 | Implementation was confirmed to work with Intel OpenCL on Intel(R) HD Graphics 520 and Intel(R) Core(TM) i5-6200U CPU 10 | ''' 11 | ''' 12 | Refactored out of 'opencl.py' 13 | ''' 14 | 15 | import pyopencl as cl 16 | 17 | class opencl_information: 18 | def __init__(self): 19 | pass 20 | 21 | def printplatforms(self): 22 | for i,platformNum in enumerate(cl.get_platforms()): 23 | print('Platform %d - Name %s, Vendor %s' %(i,platformNum.name,platformNum.vendor)) 24 | 25 | def printfullinfo(self): 26 | print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices') 27 | for i,platformNum in enumerate(cl.get_platforms()): 28 | print('=' * 60) 29 | print('Platform %d - Name: ' %i + platformNum.name) 30 | print('Platform %d - Vendor: ' %i + platformNum.vendor) 31 | print('Platform %d - Version: ' %i + platformNum.version) 32 | print('Platform %d - Profile: ' %i + platformNum.profile) 33 | 34 | for device in platformNum.get_devices(): 35 | print(' ' + '-' * 56) 36 | print(' Device - Name: ' + device.name) 37 | print(' Device - Type: ' + cl.device_type.to_string(device.type)) 38 | print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency)) 39 | print(' Device - Compute Units: {0}'.format(device.max_compute_units)) 40 | print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size / 1024.0)) 41 | print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size / 1024.0)) 42 | print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size / 1073741824.0)) 43 | print(' Device - Max Buffer/Image Size: {0:.0f} MB'.format(device.max_mem_alloc_size / 1048576.0)) 44 | print(' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 45 | print('\n') -------------------------------------------------------------------------------- /Library/passwordutils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | from time import sleep 4 | from queue import Queue 5 | 6 | class passwordutils(threading.Thread): 7 | def __init__(self, stop, threadLock, passwords:Queue, totalthreads:int, minlen=4, maxlen=16): 8 | threading.Thread.__init__(self) 9 | self.minlen = minlen 10 | self.stop = stop 11 | self.maxlen = maxlen 12 | self.passwords = passwords 13 | self.threadLock = threadLock 14 | self.totalthreads = totalthreads 15 | # We start the password generator here as a thread 16 | 17 | def run(self): 18 | global threadLock 19 | try: 20 | while True: 21 | self.threadLock.acquire() 22 | buff = sys.stdin.readline() 23 | self.threadLock.release() 24 | if buff == b"\n": 25 | continue 26 | elif buff == b"": 27 | self.threadLock.acquire() 28 | self.stop() 29 | self.threadLock.release() 30 | while not self.passwords.empty(): 31 | sleep(1) 32 | return 33 | h = buff.rstrip() 34 | if self.maxlen < len(h) < self.minlen: 35 | continue 36 | self.threadLock.acquire() 37 | self.passwords.put(h) 38 | self.threadLock.release() 39 | while self.passwords.qsize() > self.totalthreads: 40 | if self.passwords.empty(): 41 | sleep(1) 42 | break 43 | sleep(0.02) 44 | 45 | except KeyboardInterrupt: 46 | sys.stdout.flush() 47 | pass 48 | return None 49 | -------------------------------------------------------------------------------- /Library/worker/generic/buffer_structs_template.cl: 
-------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | 18 | // All important now, defining whether we're working with unsigned ints or longs 19 | #define wordSize 20 | 21 | // Practical sizes of buffers, in words. 22 | #define inBufferSize ceilDiv(, wordSize) 23 | #define outBufferSize ceilDiv(, wordSize) 24 | #define pwdBufferSize ceilDiv(, wordSize) 25 | #define saltBufferSize ceilDiv(, wordSize) 26 | #define ctBufferSize ceilDiv(, wordSize) 27 | 28 | // 29 | #define hashBlockSize_bytes ceilDiv(, 8) /* Needs to be a multiple of 4, or 8 when we work with unsigned longs */ 30 | #define hashDigestSize_bytes ceilDiv(, 8) 31 | 32 | // just Size always implies _word 33 | #define hashBlockSize ceilDiv(hashBlockSize_bytes, wordSize) 34 | #define hashDigestSize ceilDiv(hashDigestSize_bytes, wordSize) 35 | 36 | 37 | // Ultimately hoping to faze out the Size_int32/long64, 38 | // in favour of just size (_word implied) 39 | #if wordSize == 4 40 | #define hashBlockSize_int32 hashBlockSize 41 | #define hashDigestSize_int32 hashDigestSize 42 | #define word unsigned int 43 | 44 | unsigned int SWAP (unsigned int val) 45 | { 46 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 47 | } 48 | 49 | #elif wordSize == 8 50 | // Initially for use in SHA-512 51 | #define hashBlockSize_long64 hashBlockSize 52 | #define hashDigestSize_long64 hashDigestSize 53 | #define word unsigned long 54 | #define rotl64(a,n) (rotate ((a), (n))) 55 | #define rotr64(a,n) (rotate ((a), (64ul-n))) 56 | 57 | unsigned long SWAP (const unsigned long val) 58 | { 59 | // ab cd ef gh -> gh ef cd ab using the 32 bit trick 60 | unsigned long tmp = (rotr64(val & 0x0000FFFF0000FFFFUL, 16UL) | rotl64(val & 0xFFFF0000FFFF0000UL, 16UL)); 61 | 62 | // Then see this as g- e- c- a- and -h -f -d -b to swap within the pairs, 63 | // gh ef cd ab -> hg fe dc ba 64 | return (rotr64(tmp & 0xFF00FF00FF00FF00UL, 8UL) | rotl64(tmp & 0x00FF00FF00FF00FFUL, 8UL)); 65 | } 66 | #endif 67 | 68 | 69 | 70 | // ==== Define the structs with the right word size ===== 71 | // Helpful & more cohesive to have the lengths of structures as words too, 72 | // (rather than unsigned int for both) 73 | typedef struct { 74 | word length; // in bytes 75 | word buffer[inBufferSize]; 76 | } inbuf; 77 | 78 | typedef struct { 79 | word buffer[outBufferSize]; 80 | } outbuf; 81 | 82 | // Salt buffer, used by pbkdf2 & pbe 83 | typedef struct { 84 | word length; // in bytes 85 | word buffer[saltBufferSize]; 86 | } saltbuf; 87 | 88 | // Password buffer, used by pbkdf2 & pbe 89 | typedef struct { 90 | word length; // in bytes 91 | word buffer[pwdBufferSize]; 92 | } pwdbuf; 93 | 94 | // ciphertext buffer, used in pbe. 
95 | // no code relating to this in the opencl.py core, dealt with in signal_pbe_mac.cl as it's a special case 96 | typedef struct { 97 | word length; // in bytes 98 | word buffer[ctBufferSize]; 99 | } ctbuf; 100 | 101 | 102 | 103 | 104 | // ========== Debugging function ============ 105 | 106 | #ifdef DEBUG 107 | #if DEBUG 108 | #define mod(x,y) ((x)-((x)/(y)*(y))) 109 | #define def_printFromWord(tag, funcName, end) \ 110 | /* For printing the string of bytes stored in an array of words. 111 | Option to print hex. */ \ 112 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 113 | { \ 114 | for (int j = 0; j < len_bytes; j++){ \ 115 | word v = arr[j / wordSize]; \ 116 | word r = mod(j,wordSize) * 8; \ 117 | /* Prints little endian, since that's what we use */ \ 118 | v = (v >> r) & 0xFF; \ 119 | if (hex) { \ 120 | printf("%02x", v); \ 121 | } else { \ 122 | printf("%c", (char)v); \ 123 | } \ 124 | } \ 125 | printf(end); \ 126 | } 127 | 128 | def_printFromWord(__private, printFromWord, "") 129 | def_printFromWord(__global, printFromWord_glbl, "") 130 | def_printFromWord(__private, printFromWord_n, "\n") 131 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 132 | 133 | #endif 134 | #endif -------------------------------------------------------------------------------- /Library/worker/generic/hash_iterations.cl: -------------------------------------------------------------------------------- 1 | // Extremely basic (but useful) script to perform a certain number of hashing iterations when used with a pre-existing 2 | // hasing library which is called via hash_main. (Useful for some cryptocurrency wallets which use custom key stretching) 3 | // 4 | // Generally speaking, this this function will take a hash (and maybe salted) password as the input, with this initial 5 | // hash happening in the calling application. This means that the input and output will always be the same size and that 6 | // we don't need to worry about padding, etc... 7 | // 8 | // Originally created for BTCRecover by Stephen Rothery, available at https://github.com/3rdIteration/btcrecover 9 | __kernel void hash_iterations(__global inbuf *inbuffer, __global outbuf *outbuffer, __private unsigned int iters, __private unsigned int hash_size) 10 | { 11 | unsigned int idx = get_global_id(0); 12 | 13 | // Iterate through and has the input as many times as required 14 | for (unsigned int j = 0; j < iters; j++){ 15 | hash_main(inbuffer, outbuffer); 16 | 17 | // Copy the output from the hash back in to the input... 18 | for (unsigned int i = 0; i < hash_size; i++){ 19 | inbuffer[idx].buffer[i] = outbuffer[idx].buffer[i]; 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /Library/worker/generic/hmac_qualcomm.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Qualcomm HMAC OpenCL Optimized kernel 3 | (c) B. 
Kerler 2018-2021 4 | MIT License 5 | */ 6 | 7 | /* 8 | pbkdf2 and HMAC implementation 9 | requires implementation of PRF (pseudo-random function), 10 | probably using HMAC and an implementation of hash_main 11 | */ 12 | /* 13 | REQ: outBuf.buffer must have space for ceil(dkLen / PRF_output_bytes) * PRF_output_bytes 14 | REQ: PRF implementation MUST allow that output may be the salt (m in hmac) 15 | inBuffer / pwdBuffer / the like are not const to allow for padding 16 | */ 17 | 18 | // Determine (statically) the actual required buffer size 19 | // Just allowing for MD padding: 64 bits for int, 1 for the 1-pad = 3 int32s. 20 | #define sizeForHash(reqSize) (ceilDiv((reqSize) + 2 + 1, hashBlockSize_int32) * hashBlockSize_int32) 21 | 22 | /* Swaps between little and big-endian*/ 23 | #define swapEndian(x) (rotate((x) & 0x00FF00FF, 24U) | rotate((x) & 0xFF00FF00, 8U)) 24 | 25 | __constant const unsigned int opad = 0x5c5c5c5c; 26 | __constant const unsigned int ipad = 0x36363636; 27 | __constant const unsigned int xoredPad = opad ^ ipad; 28 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 29 | #define m_buffer_size (saltBufferSize + 1) 30 | 31 | static void hmac(__global unsigned int *K, const unsigned int K_len_bytes, 32 | const unsigned int *m, const unsigned int m_len_bytes, unsigned int *output) 33 | { 34 | // REQ: If K_len_bytes isn't divisible by 4, final int should be clean (0s to the end) 35 | // REQ: s digestSize is a multiple of 4 bytes 36 | 37 | /* Declare the space for input to the last hash function: 38 | Compute and write K_ ^ opad to the first block of this. This will be the only place that we store K_ */ 39 | 40 | #define size_2 sizeForHash(hashBlockSize_int32 + hashDigestSize_int32) 41 | unsigned int input_2[size_2] = {0}; 42 | #undef size_2 43 | 44 | int end; 45 | if (K_len_bytes <= hashBlockSize_bytes) 46 | { 47 | end = (K_len_bytes + 3) / 4; 48 | // XOR with opad and slightly pad with zeros.. 49 | for (int j = 0; j < end; j++){ 50 | input_2[j] = K[j] ^ opad; 51 | } 52 | } else { 53 | end = hashDigestSize_int32; 54 | // Hash K to get K'. XOR with opad.. 55 | hash_glbl_to_priv(K, K_len_bytes, input_2); 56 | for (int j = 0; j < hashDigestSize_int32; j++){ 57 | input_2[j] ^= opad; 58 | } 59 | } 60 | // And if short, pad with 0s to the BLOCKsize, completing xor with opad 61 | for (int j = end; j < hashBlockSize_int32; j++){ 62 | input_2[j] = opad; 63 | } 64 | 65 | // Copy K' ^ ipad into the first block. 66 | // Be careful: hash needs a whole block after the end. ceilDiv from buffer_structs 67 | #define size_1 sizeForHash(hashBlockSize_int32 + m_buffer_size) 68 | 69 | // K' ^ ipad into the first block 70 | unsigned int input_1[size_1] = {0}; 71 | #undef size_1 72 | for (int j = 0; j < hashBlockSize_int32; j++){ 73 | input_1[j] = input_2[j]^xoredPad; 74 | } 75 | 76 | // Slightly inefficient copying m in.. 77 | int m_len_int32 = (m_len_bytes + 3) / 4; 78 | for (int j = 0; j < m_len_int32; j++){ 79 | input_1[hashBlockSize_int32 + j] = m[j]; 80 | } 81 | 82 | // Hash input1 into the second half of input2 83 | int leng = hashBlockSize_bytes + m_len_bytes; 84 | hash_private(input_1, leng, input_2 + hashBlockSize_int32); 85 | 86 | // Hash input2 into output! 87 | hash_private(input_2, hashBlockSize_bytes + hashDigestSize_bytes, output); 88 | } 89 | 90 | #undef sizeForHash 91 | 92 | // Might as well be very clean 93 | #undef swapEndian 94 | 95 | // Exposing HMAC in the same way. Useful for testing atleast. 
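/*
  Reference summary of the hmac() routine above (the standard RFC 2104 construction; this
  comment adds no functionality):
      HMAC(K, m) = H( (K' ^ opad) || H( (K' ^ ipad) || m ) )
  where K' is K zero-padded to the hash block size (or H(K), zero-padded, if K is longer than
  a block), opad is 0x5c repeated and ipad is 0x36 repeated. In the code above, input_1 holds
  (K' ^ ipad) || m and input_2 holds (K' ^ opad) || H(input_1), which is hashed into the output.
*/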
96 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 97 | { 98 | int counter=0; 99 | int i=0; 100 | int j=0; 101 | unsigned int idx = get_global_id(0); 102 | unsigned int pwdLen_bytes = inbuffer[idx].length; 103 | __global unsigned int *pwdBuffer = inbuffer[idx].buffer; 104 | 105 | // Copy salt just to cheer the compiler up 106 | int saltLen_bytes = saltbuffer[0].length; 107 | int saltLen_int32 = ceilDiv(saltLen_bytes, 4); 108 | unsigned int personal_salt[saltBufferSize] = {0}; 109 | 110 | for (j = 0; j < saltLen_int32; j++){ 111 | personal_salt[j] = saltbuffer[0].buffer[j]; 112 | } 113 | 114 | // Call hmac, with local 115 | unsigned int out[hashDigestSize_int32]; 116 | 117 | unsigned int V[hashDigestSize_int32]={0}; 118 | for (counter=0;counter<10000;counter++) 119 | { 120 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 121 | for (j=0;j> (8*(wordSize-4)); 117 | if (overhang>0) 118 | { 119 | salt[saltLastI] |= be_callI << overhang; 120 | salt[saltLastI+1] = be_callI >> ((8*wordSize)-overhang); 121 | } 122 | else 123 | { 124 | salt[saltLastI]=be_callI; 125 | } 126 | 127 | // Make initial call, copy into output 128 | // This copy is avoidable, but only with __global / __private macro stuff 129 | word u[PRF_output_size] = {0}; 130 | // +4 is correct even for 64 bit 131 | PRF(pwd, pwdLen_bytes, salt, saltLen_bytes + 4, u); 132 | for (unsigned int j = 0; j < PRF_output_size; j++){ 133 | output[j] = u[j]; 134 | } 135 | 136 | #define xor(x,acc) \ 137 | /* xors PRF output x onto acc*/ \ 138 | { \ 139 | for (int k = 0; k < PRF_output_size; k++){ \ 140 | acc[k] ^= x[k]; \ 141 | } \ 142 | } 143 | 144 | // Perform all the iterations, reading salt from- AND writing to- u. 145 | for (unsigned int j = 1; j < iters; j++){ 146 | PRF(pwd, pwdLen_bytes, u, PRF_output_bytes, u); 147 | xor(u,output); 148 | } 149 | } 150 | 151 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 152 | __private unsigned int iters, __private unsigned int dkLen_bytes) 153 | { 154 | 155 | unsigned int idx = get_global_id(0); 156 | word pwdLen_bytes = inbuffer[idx].length; 157 | __global word *pwdBuffer = inbuffer[idx].buffer; 158 | __global word *currOutBuffer = outbuffer[idx].buffer; 159 | 160 | // Copy salt so that we can write our integer into the last 4 bytes 161 | word saltLen_bytes = saltbuffer[0].length; 162 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 163 | word personal_salt[saltBufferSize+2] = {0}; 164 | 165 | for (int j = 0; j < saltLen; j++){ 166 | personal_salt[j] = saltbuffer[0].buffer[j]; 167 | } 168 | 169 | // Determine the number of calls to F that we need to make 170 | unsigned int nBlocks = ceilDiv(dkLen_bytes, PRF_output_bytes); 171 | for (unsigned int j = 1; j <= nBlocks; j++) 172 | { 173 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, j, currOutBuffer); 174 | currOutBuffer += PRF_output_size; 175 | } 176 | } 177 | 178 | 179 | // Exposing HMAC in the same way. Useful for testing atleast. 
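/*
  Summary of the PBKDF2 scheme implemented by F() and the pbkdf2 kernel above (RFC 2898;
  informational comment only):
      U_1 = PRF(P, S || INT_BE32(i)),   U_j = PRF(P, U_{j-1})
      T_i = U_1 ^ U_2 ^ ... ^ U_iters
      DK  = T_1 || T_2 || ... || T_n,   n = ceil(dkLen / PRF_output_bytes)
  with PRF = HMAC over the underlying hash; the host side (Library/opencl.py) trims the final
  output back down to the requested dklen when it is not a multiple of the digest size.
*/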
180 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 181 | { 182 | unsigned int idx = get_global_id(0); 183 | word pwdLen_bytes = inbuffer[idx].length; 184 | __global word *pwdBuffer = inbuffer[idx].buffer; 185 | 186 | // Copy salt just to cheer the compiler up 187 | int saltLen_bytes = (int)saltbuffer[0].length; 188 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 189 | word personal_salt[saltBufferSize] = {0}; 190 | 191 | for (int j = 0; j < saltLen; j++){ 192 | personal_salt[j] = saltbuffer[0].buffer[j]; 193 | } 194 | 195 | // Call hmac, with local 196 | word out[hashDigestSize]; 197 | 198 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 199 | 200 | for (int j = 0; j < hashDigestSize; j++){ 201 | outbuffer[idx].buffer[j] = out[j]; 202 | } 203 | } 204 | 205 | // A modified version of the pbkdf2 kernel that allows you to use these kernels in a situation where you have a password 206 | // and are attempting to brute-force the salt. (So this kernel takes a single password and an array of salts 207 | // 208 | // Originally created for BTCRecover by Stephen Rothery, available at https://github.com/3rdIteration/btcrecover 209 | // MIT License 210 | 211 | __kernel void pbkdf2_saltlist(__global const pwdbuf *pwdbuffer_arg, __global inbuf *inbuffer, __global outbuf *outbuffer, 212 | __private unsigned int iters, __private unsigned int dkLen_bytes) 213 | { 214 | 215 | unsigned int idx = get_global_id(0); 216 | word pwdLen_bytes = pwdbuffer_arg[0].length; 217 | __global word *pwdBuffer = pwdbuffer_arg[0].buffer; 218 | __global word *currOutBuffer = outbuffer[idx].buffer; 219 | 220 | // Copy salt so that we can write our integer into the last 4 bytes 221 | word saltLen_bytes = inbuffer[idx].length; 222 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 223 | word personal_salt[saltBufferSize+2] = {0}; 224 | 225 | 226 | for (int j = 0; j < saltLen; j++){ 227 | personal_salt[j] = inbuffer[idx].buffer[j]; 228 | } 229 | 230 | // Determine the number of calls to F that we need to make 231 | unsigned int nBlocks = ceilDiv(dkLen_bytes, PRF_output_bytes); 232 | for (unsigned int j = 1; j <= nBlocks; j++) 233 | { 234 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, iters, j, currOutBuffer); 235 | currOutBuffer += PRF_output_size; 236 | } 237 | } 238 | 239 | -------------------------------------------------------------------------------- /Library/worker/generic/pbkdf2_sha1_32.cl: -------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | 18 | // All important now, defining whether we're working with unsigned ints or longs 19 | #define wordSize 4 20 | 21 | // Practical sizes of buffers, in words. 
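// (Concretely, in this fixed-size SHA1 variant: a 128-byte password buffer and a
//  40-byte output buffer, i.e. room for two 20-byte SHA1 blocks of derived key.)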
22 | #define inBufferSize ceilDiv(128, wordSize) 23 | #define outBufferSize ceilDiv(40, wordSize) 24 | #define saltBufferSize ceilDiv(8, wordSize) 25 | #define ctBufferSize ceilDiv(0, wordSize) 26 | 27 | // 28 | #define hashBlockSize_bytes ceilDiv(512, 8) /* Needs to be a multiple of 4, or 8 when we work with unsigned longs */ 29 | #define hashDigestSize_bytes ceilDiv(160, 8) 30 | 31 | // just Size always implies _word 32 | #define hashBlockSize ceilDiv(hashBlockSize_bytes, wordSize) 33 | #define hashDigestSize ceilDiv(hashDigestSize_bytes, wordSize) 34 | 35 | 36 | // Ultimately hoping to faze out the Size_int32/long64, 37 | // in favour of just size (_word implied) 38 | #if wordSize == 4 39 | #define hashBlockSize_int32 hashBlockSize 40 | #define hashDigestSize_int32 hashDigestSize 41 | #define word unsigned int 42 | 43 | unsigned int SWAP (unsigned int val) 44 | { 45 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 46 | } 47 | 48 | #elif wordSize == 8 49 | // Initially for use in SHA-512 50 | #define hashBlockSize_long64 hashBlockSize 51 | #define hashDigestSize_long64 hashDigestSize 52 | #define word unsigned long 53 | #define rotl64(a,n) (rotate ((a), (n))) 54 | #define rotr64(a,n) (rotate ((a), (64ul-n))) 55 | 56 | unsigned long SWAP (const unsigned long val) 57 | { 58 | // ab cd ef gh -> gh ef cd ab using the 32 bit trick 59 | unsigned long tmp = (rotr64(val & 0x0000FFFF0000FFFFUL, 16UL) | rotl64(val & 0xFFFF0000FFFF0000UL, 16UL)); 60 | 61 | // Then see this as g- e- c- a- and -h -f -d -b to swap within the pairs, 62 | // gh ef cd ab -> hg fe dc ba 63 | return (rotr64(tmp & 0xFF00FF00FF00FF00UL, 8UL) | rotl64(tmp & 0x00FF00FF00FF00FFUL, 8UL)); 64 | } 65 | #endif 66 | 67 | 68 | 69 | // ==== Define the structs with the right word size ===== 70 | // Helpful & more cohesive to have the lengths of structures as words too, 71 | // (rather than unsigned int for both) 72 | typedef struct { 73 | word length; // in bytes 74 | word buffer[inBufferSize]; 75 | } inbuf; 76 | 77 | typedef struct { 78 | word buffer[outBufferSize]; 79 | } outbuf; 80 | 81 | // Salt buffer, used by pbkdf2 & pbe 82 | typedef struct { 83 | word length; // in bytes 84 | word buffer[saltBufferSize]; 85 | } saltbuf; 86 | 87 | // ciphertext buffer, used in pbe. 88 | // no code relating to this in the opencl.py core, dealt with in signal_pbe_mac.cl as it's a special case 89 | typedef struct { 90 | word length; // in bytes 91 | word buffer[ctBufferSize]; 92 | } ctbuf; 93 | 94 | 95 | 96 | 97 | // ========== Debugging function ============ 98 | 99 | #ifdef DEBUG 100 | #if DEBUG 101 | 102 | #define def_printFromWord(tag, funcName, end) \ 103 | /* For printing the string of bytes stored in an array of words. 104 | Option to print hex. 
*/ \ 105 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 106 | { \ 107 | for (int j = 0; j < len_bytes; j++){ \ 108 | word v = arr[j / wordSize]; \ 109 | word r = mod(j,wordSize) * 8; \ 110 | /* Prints little endian, since that's what we use */ \ 111 | v = (v >> r) & 0xFF; \ 112 | if (hex) { \ 113 | printf("%02x", v); \ 114 | } else { \ 115 | printf("%c", (char)v); \ 116 | } \ 117 | } \ 118 | printf(end); \ 119 | } 120 | 121 | def_printFromWord(__private, printFromWord, "") 122 | def_printFromWord(__global, printFromWord_glbl, "") 123 | def_printFromWord(__private, printFromWord_n, "\n") 124 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 125 | 126 | #endif 127 | #endif/* 128 | PBKDF2 SHA1 OpenCL Optimized kernel, limited to max. 32 chars for salt and password 129 | (c) B. Kerler 2017 130 | MIT License 131 | */ 132 | 133 | #define rotl32(a,n) rotate ((a), (n)) 134 | 135 | #define mod(x,y) x-(x/y*y) 136 | 137 | #define F2(x,y,z) ((x) ^ (y) ^ (z)) 138 | #define F1(x,y,z) (bitselect(z,y,x)) 139 | #define F0(x,y,z) (bitselect (x, y, (x ^ z))) 140 | 141 | #define SHA1M_A 0x67452301u 142 | #define SHA1M_B 0xefcdab89u 143 | #define SHA1M_C 0x98badcfeu 144 | #define SHA1M_D 0x10325476u 145 | #define SHA1M_E 0xc3d2e1f0u 146 | 147 | #define SHA1C00 0x5a827999u 148 | #define SHA1C01 0x6ed9eba1u 149 | #define SHA1C02 0x8f1bbcdcu 150 | #define SHA1C03 0xca62c1d6u 151 | 152 | #define SHA1_STEP(f,a,b,c,d,e,x) \ 153 | { \ 154 | e += K; \ 155 | e += x; \ 156 | e += f (b, c, d); \ 157 | e += rotl32 (a, 5u); \ 158 | b = rotl32 (b, 30u); \ 159 | } 160 | 161 | static void sha1_process2 (const unsigned int *W, unsigned int *digest) 162 | { 163 | unsigned int A = digest[0]; 164 | unsigned int B = digest[1]; 165 | unsigned int C = digest[2]; 166 | unsigned int D = digest[3]; 167 | unsigned int E = digest[4]; 168 | 169 | unsigned int w0_t = W[0]; 170 | unsigned int w1_t = W[1]; 171 | unsigned int w2_t = W[2]; 172 | unsigned int w3_t = W[3]; 173 | unsigned int w4_t = W[4]; 174 | unsigned int w5_t = W[5]; 175 | unsigned int w6_t = W[6]; 176 | unsigned int w7_t = W[7]; 177 | unsigned int w8_t = W[8]; 178 | unsigned int w9_t = W[9]; 179 | unsigned int wa_t = W[10]; 180 | unsigned int wb_t = W[11]; 181 | unsigned int wc_t = W[12]; 182 | unsigned int wd_t = W[13]; 183 | unsigned int we_t = W[14]; 184 | unsigned int wf_t = W[15]; 185 | 186 | #undef K 187 | #define K SHA1C00 188 | 189 | SHA1_STEP (F1, A, B, C, D, E, w0_t); 190 | SHA1_STEP (F1, E, A, B, C, D, w1_t); 191 | SHA1_STEP (F1, D, E, A, B, C, w2_t); 192 | SHA1_STEP (F1, C, D, E, A, B, w3_t); 193 | SHA1_STEP (F1, B, C, D, E, A, w4_t); 194 | SHA1_STEP (F1, A, B, C, D, E, w5_t); 195 | SHA1_STEP (F1, E, A, B, C, D, w6_t); 196 | SHA1_STEP (F1, D, E, A, B, C, w7_t); 197 | SHA1_STEP (F1, C, D, E, A, B, w8_t); 198 | SHA1_STEP (F1, B, C, D, E, A, w9_t); 199 | SHA1_STEP (F1, A, B, C, D, E, wa_t); 200 | SHA1_STEP (F1, E, A, B, C, D, wb_t); 201 | SHA1_STEP (F1, D, E, A, B, C, wc_t); 202 | SHA1_STEP (F1, C, D, E, A, B, wd_t); 203 | SHA1_STEP (F1, B, C, D, E, A, we_t); 204 | SHA1_STEP (F1, A, B, C, D, E, wf_t); 205 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F1, E, A, B, C, D, w0_t); 206 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F1, D, E, A, B, C, w1_t); 207 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F1, C, D, E, A, B, w2_t); 208 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F1, B, C, D, E, A, w3_t); 209 | 210 | #undef K 211 | #define K SHA1C01 212 | 213 
| w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w4_t); 214 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w5_t); 215 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w6_t); 216 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w7_t); 217 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w8_t); 218 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w9_t); 219 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wa_t); 220 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wb_t); 221 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wc_t); 222 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wd_t); 223 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, A, B, C, D, E, we_t); 224 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wf_t); 225 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w0_t); 226 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w1_t); 227 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w2_t); 228 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w3_t); 229 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w4_t); 230 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w5_t); 231 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w6_t); 232 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w7_t); 233 | 234 | #undef K 235 | #define K SHA1C02 236 | 237 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w8_t); 238 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w9_t); 239 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wa_t); 240 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wb_t); 241 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wc_t); 242 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F0, A, B, C, D, E, wd_t); 243 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F0, E, A, B, C, D, we_t); 244 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wf_t); 245 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w0_t); 246 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w1_t); 247 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w2_t); 248 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w3_t); 249 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w4_t); 250 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w5_t); 251 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w6_t); 252 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w7_t); 253 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w8_t); 254 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w9_t); 255 | wa_t 
= rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wa_t); 256 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wb_t); 257 | 258 | #undef K 259 | #define K SHA1C03 260 | 261 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wc_t); 262 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wd_t); 263 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, D, E, A, B, C, we_t); 264 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wf_t); 265 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w0_t); 266 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w1_t); 267 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w2_t); 268 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w3_t); 269 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w4_t); 270 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w5_t); 271 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w6_t); 272 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w7_t); 273 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w8_t); 274 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w9_t); 275 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wa_t); 276 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wb_t); 277 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wc_t); 278 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wd_t); 279 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, C, D, E, A, B, we_t); 280 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wf_t); 281 | 282 | digest[0] += A; 283 | digest[1] += B; 284 | digest[2] += C; 285 | digest[3] += D; 286 | digest[4] += E; 287 | } 288 | 289 | static void F(__global const unsigned int *pass, const unsigned int pass_len, unsigned int *salt, const unsigned int salt_len, const unsigned int iter, __global unsigned int* hash, unsigned int hash_len) 290 | { 291 | int plen=pass_len/4; 292 | if (mod(pass_len,4)) plen++; 293 | 294 | int slen=salt_len/4; 295 | if (mod(salt_len,4)) slen++; 296 | 297 | __global unsigned int* p = hash; 298 | 299 | unsigned int ipad[16]; 300 | ipad[0x0]=0x36363636; 301 | ipad[0x1]=0x36363636; 302 | ipad[0x2]=0x36363636; 303 | ipad[0x3]=0x36363636; 304 | ipad[0x4]=0x36363636; 305 | ipad[0x5]=0x36363636; 306 | ipad[0x6]=0x36363636; 307 | ipad[0x7]=0x36363636; 308 | ipad[0x8]=0x36363636; 309 | ipad[0x9]=0x36363636; 310 | ipad[0xA]=0x36363636; 311 | ipad[0xB]=0x36363636; 312 | ipad[0xC]=0x36363636; 313 | ipad[0xD]=0x36363636; 314 | ipad[0xE]=0x36363636; 315 | ipad[0xF]=0x36363636; 316 | 317 | unsigned int opad[16]; 318 | opad[0x0]=0x5C5C5C5C; 319 | opad[0x1]=0x5C5C5C5C; 320 | opad[0x2]=0x5C5C5C5C; 321 | opad[0x3]=0x5C5C5C5C; 322 | opad[0x4]=0x5C5C5C5C; 323 | opad[0x5]=0x5C5C5C5C; 324 | opad[0x6]=0x5C5C5C5C; 325 | opad[0x7]=0x5C5C5C5C; 326 | opad[0x8]=0x5C5C5C5C; 327 | opad[0x9]=0x5C5C5C5C; 328 | opad[0xA]=0x5C5C5C5C; 329 | opad[0xB]=0x5C5C5C5C; 330 | opad[0xC]=0x5C5C5C5C; 331 | opad[0xD]=0x5C5C5C5C; 332 | opad[0xE]=0x5C5C5C5C; 333 | opad[0xF]=0x5C5C5C5C; 
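    // The password is xored into both pads next, and the two half-finished SHA1
    // states (stateipad / stateopad below) are computed once up front, so each
    // iteration of the main loop only needs two sha1_process2() calls instead of
    // re-hashing the padded key blocks every time.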
334 | 335 | for (int m=0;msha256_update(state,W,ilenor,wposr,ipad,0x40); 350 | unsigned int W[0x10]={0}; 351 | W[0]=ipad[0]; 352 | W[1]=ipad[1]; 353 | W[2]=ipad[2]; 354 | W[3]=ipad[3]; 355 | W[4]=ipad[4]; 356 | W[5]=ipad[5]; 357 | W[6]=ipad[6]; 358 | W[7]=ipad[7]; 359 | W[8]=ipad[8]; 360 | W[9]=ipad[9]; 361 | W[10]=ipad[10]; 362 | W[11]=ipad[11]; 363 | W[12]=ipad[12]; 364 | W[13]=ipad[13]; 365 | W[14]=ipad[14]; 366 | W[15]=ipad[15]; 367 | sha1_process2(W,stateipad); 368 | 369 | // precompute ipad 370 | unsigned int stateopad[5]={0}; 371 | stateopad[0] = 0x67452301; 372 | stateopad[1] = 0xefcdab89; 373 | stateopad[2] = 0x98badcfe; 374 | stateopad[3] = 0x10325476; 375 | stateopad[4] = 0xc3d2e1f0; 376 | 377 | //->sha1_update(state,W,ilenor,wposr,ipad,0x40); 378 | W[0]=opad[0]; 379 | W[1]=opad[1]; 380 | W[2]=opad[2]; 381 | W[3]=opad[3]; 382 | W[4]=opad[4]; 383 | W[5]=opad[5]; 384 | W[6]=opad[6]; 385 | W[7]=opad[7]; 386 | W[8]=opad[8]; 387 | W[9]=opad[9]; 388 | W[10]=opad[10]; 389 | W[11]=opad[11]; 390 | W[12]=opad[12]; 391 | W[13]=opad[13]; 392 | W[14]=opad[14]; 393 | W[15]=opad[15]; 394 | sha1_process2(W,stateopad); 395 | 396 | unsigned int counter = 1; 397 | unsigned int state[5]={0}; 398 | 399 | unsigned int tkeylen=hash_len; 400 | unsigned int cplen=0; 401 | while(tkeylen>0) 402 | { 403 | if(tkeylen > 20) cplen = 20; 404 | else cplen=tkeylen; 405 | 406 | //hmac_sha1_init(state,W,ileno,wpos,ipad,opad,pwd); 407 | //->sha1_init(state,W,ileno,wpos); 408 | //->sha1_update(state,W,ileno,wpos,ipad,0x40); 409 | state[0] = stateipad[0]; 410 | state[1] = stateipad[1]; 411 | state[2] = stateipad[2]; 412 | state[3] = stateipad[3]; 413 | state[4] = stateipad[4]; 414 | //hmac_sha1_update(state,W,ileno,wpos,ipad,opad,salt,salt_len); 415 | //->sha1_update(state,W,ileno,wpos,salt,salt_len); 416 | //hmac_sha1_update(state,W,ileno,wpos,ipad,opad,itmp,4); 417 | //->sha1_update(state,W,ileno,wpos,itmp,4); 418 | W[0]=0; 419 | W[1]=0; 420 | W[2]=0; 421 | W[3]=0; 422 | W[4]=0; 423 | W[5]=0; 424 | W[6]=0; 425 | W[7]=0; 426 | W[8]=0; 427 | W[9]=0; 428 | W[10]=0; 429 | W[11]=0; 430 | W[12]=0; 431 | W[13]=0; 432 | W[14]=0; 433 | for (int m=0;msha1_finish(state,W,ileno,&opad[0x10]); 448 | sha1_process2(W,state); 449 | 450 | //sha1(opad,0x54,digtmp); 451 | //->sha1_init(state,W,ileno,wpos); 452 | //->sha1_update(state,W,ileno,wpos,opad,0x54); 453 | //->sha1_finish(state,W,ileno,digtmp); 454 | 455 | W[0]=state[0]; 456 | W[1]=state[1]; 457 | W[2]=state[2]; 458 | W[3]=state[3]; 459 | W[4]=state[4]; 460 | W[5]=0x80000000; 461 | W[6]=0x0; 462 | W[7]=0x0; 463 | W[8]=0x0; 464 | W[9]=0; 465 | W[10]=0; 466 | W[11]=0; 467 | W[12]=0; 468 | W[13]=0; 469 | W[14]=0; 470 | W[15]=0x54*8; 471 | 472 | state[0]=stateopad[0]; 473 | state[1]=stateopad[1]; 474 | state[2]=stateopad[2]; 475 | state[3]=stateopad[3]; 476 | state[4]=stateopad[4]; 477 | 478 | //sha256_finish(state,W,ileno,digtmp); 479 | sha1_process2(W,state); 480 | 481 | p[0]=W[0]=state[0]; 482 | p[1]=W[1]=state[1]; 483 | p[2]=W[2]=state[2]; 484 | p[3]=W[3]=state[3]; 485 | p[4]=W[4]=state[4]; 486 | 487 | for(int j = 1; j < iter; j++) 488 | { 489 | //hmac_sha1(pwd,digtmp,32,digtmp); 490 | //->sha1_init(state,W,ilenor,wposr); 491 | //->sha1_update(state,W,ilenor,wposr,digtmp,32); 492 | //->sha1_finish(state,W,ileno,&opad[0x10]); 493 | 494 | W[5]=0x80000000; //Padding 495 | W[6]=0; 496 | W[7]=0; 497 | W[8]=0; 498 | W[9]=0; 499 | W[10]=0; 500 | W[11]=0; 501 | W[12]=0; 502 | W[13]=0; 503 | W[14]=0; 504 | W[15]=0x54*8; 505 | state[0] = stateipad[0]; 506 | state[1] = stateipad[1]; 507 
| state[2] = stateipad[2]; 508 | state[3] = stateipad[3]; 509 | state[4] = stateipad[4]; 510 | sha1_process2(W,state); 511 | 512 | unsigned int M[0x10]={0}; 513 | M[0]=state[0]; 514 | M[1]=state[1]; 515 | M[2]=state[2]; 516 | M[3]=state[3]; 517 | M[4]=state[4]; 518 | M[5]=0x80000000; //Padding 519 | M[6]=0; 520 | M[7]=0; 521 | M[8]=0; 522 | M[9]=0; 523 | M[10]=0; 524 | M[11]=0; 525 | M[12]=0; 526 | M[13]=0; 527 | M[14]=0; 528 | M[15]=0x54*8; 529 | 530 | //->sha1_init(state,W,ilenor,wposr); 531 | //->sha1_update(state,W,ilenor,wposr,opad,0x60); 532 | state[0] = stateopad[0]; 533 | state[1] = stateopad[1]; 534 | state[2] = stateopad[2]; 535 | state[3] = stateopad[3]; 536 | state[4] = stateopad[4]; 537 | 538 | //->sha1_finish(state,W,ilenor,digtmp); 539 | sha1_process2(M,state); 540 | 541 | W[0]=state[0]; 542 | W[1]=state[1]; 543 | W[2]=state[2]; 544 | W[3]=state[3]; 545 | W[4]=state[4]; 546 | 547 | p[0] ^= state[0]; 548 | p[1] ^= state[1]; 549 | p[2] ^= state[2]; 550 | p[3] ^= state[3]; 551 | p[4] ^= state[4]; 552 | } 553 | 554 | p[0]=SWAP(p[0]); 555 | p[1]=SWAP(p[1]); 556 | p[2]=SWAP(p[2]); 557 | p[3]=SWAP(p[3]); 558 | p[4]=SWAP(p[4]); 559 | 560 | tkeylen-= cplen; 561 | counter++; 562 | p+= cplen/4; 563 | } 564 | return; 565 | } 566 | 567 | 568 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 569 | __private unsigned int iters, __private unsigned int dkLen_bytes) 570 | { 571 | 572 | unsigned int idx = get_global_id(0); 573 | word pwdLen_bytes = inbuffer[idx].length; 574 | __global word *pwdBuffer = inbuffer[idx].buffer; 575 | __global word *currOutBuffer = outbuffer[idx].buffer; 576 | 577 | // Copy salt so that we can write our integer into the last 4 bytes 578 | word personal_salt[32/4] = {0}; 579 | personal_salt[0] = saltbuffer[0].buffer[0]; 580 | personal_salt[1] = saltbuffer[0].buffer[1]; 581 | personal_salt[2] = saltbuffer[0].buffer[2]; 582 | personal_salt[3] = saltbuffer[0].buffer[3]; 583 | personal_salt[4] = saltbuffer[0].buffer[4]; 584 | personal_salt[5] = saltbuffer[0].buffer[5]; 585 | personal_salt[6] = saltbuffer[0].buffer[6]; 586 | personal_salt[7] = saltbuffer[0].buffer[7]; 587 | 588 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, currOutBuffer,32); 589 | } 590 | -------------------------------------------------------------------------------- /Library/worker/generic/pbkdf2_sha256_32.cl: -------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | // All important now, defining whether we're working with unsigned ints or longs 18 | #define wordSize 4 19 | 20 | // Practical sizes of buffers, in words. 
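// (Concretely, in this fixed-size SHA256 variant: a 128-byte password buffer, a
//  16-word / 64-byte output buffer - up to two SHA256 blocks of derived key - and
//  an 8-word / 32-byte salt buffer.)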
21 | #define inBufferSize ceilDiv(128, wordSize) 22 | 23 | 24 | // Ultimately hoping to faze out the Size_int32/long64, 25 | // in favour of just size (_word implied) 26 | #define word unsigned int 27 | 28 | unsigned int SWAP (unsigned int val) 29 | { 30 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 31 | } 32 | 33 | // ==== Define the structs with the right word size ===== 34 | // Helpful & more cohesive to have the lengths of structures as words too, 35 | // (rather than unsigned int for both) 36 | typedef struct { 37 | word length; // in bytes 38 | word buffer[inBufferSize]; 39 | } inbuf; 40 | 41 | typedef struct { 42 | word buffer[16]; 43 | } outbuf; 44 | 45 | // Salt buffer, used by pbkdf2 & pbe 46 | typedef struct { 47 | word length; // in bytes 48 | word buffer[8]; 49 | } saltbuf; 50 | 51 | 52 | // ========== Debugging function ============ 53 | 54 | #ifdef DEBUG 55 | #if DEBUG 56 | 57 | #define def_printFromWord(tag, funcName, end) \ 58 | /* For printing the string of bytes stored in an array of words. 59 | Option to print hex. */ \ 60 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 61 | { \ 62 | for (int j = 0; j < len_bytes; j++){ \ 63 | word v = arr[j / wordSize]; \ 64 | word r = (j % wordSize) * 8; \ 65 | /* Prints little endian, since that's what we use */ \ 66 | v = (v >> r) & 0xFF; \ 67 | if (hex) { \ 68 | printf("%02x", v); \ 69 | } else { \ 70 | printf("%c", (char)v); \ 71 | } \ 72 | } \ 73 | printf(end); \ 74 | } 75 | 76 | def_printFromWord(__private, printFromWord, "") 77 | def_printFromWord(__global, printFromWord_glbl, "") 78 | def_printFromWord(__private, printFromWord_n, "\n") 79 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 80 | 81 | #endif 82 | #endif/* 83 | Original: 84 | SHA1 OpenCL Optimized kernel 85 | (c) B. 
Kerler 2018 86 | MIT License 87 | */ 88 | 89 | /* 90 | (small) Changes: 91 | outbuf and inbuf structs defined using the buffer_structs_template 92 | func_sha256 renamed to hash_main 93 | */ 94 | 95 | #define F1(x,y,z) (bitselect(z,y,x)) 96 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 97 | #define mod(x,y) ((x)-((x)/(y)*(y))) 98 | #define shr32(x,n) ((x) >> (n)) 99 | #define rotl32(a,n) rotate ((a), (n)) 100 | 101 | #define S0(x) (rotl32 ((x), 25u) ^ rotl32 ((x), 14u) ^ shr32 ((x), 3u)) 102 | #define S1(x) (rotl32 ((x), 15u) ^ rotl32 ((x), 13u) ^ shr32 ((x), 10u)) 103 | #define S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) 104 | #define S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u)) 105 | 106 | #define SHA256C00 0x428a2f98u 107 | #define SHA256C01 0x71374491u 108 | #define SHA256C02 0xb5c0fbcfu 109 | #define SHA256C03 0xe9b5dba5u 110 | #define SHA256C04 0x3956c25bu 111 | #define SHA256C05 0x59f111f1u 112 | #define SHA256C06 0x923f82a4u 113 | #define SHA256C07 0xab1c5ed5u 114 | #define SHA256C08 0xd807aa98u 115 | #define SHA256C09 0x12835b01u 116 | #define SHA256C0a 0x243185beu 117 | #define SHA256C0b 0x550c7dc3u 118 | #define SHA256C0c 0x72be5d74u 119 | #define SHA256C0d 0x80deb1feu 120 | #define SHA256C0e 0x9bdc06a7u 121 | #define SHA256C0f 0xc19bf174u 122 | #define SHA256C10 0xe49b69c1u 123 | #define SHA256C11 0xefbe4786u 124 | #define SHA256C12 0x0fc19dc6u 125 | #define SHA256C13 0x240ca1ccu 126 | #define SHA256C14 0x2de92c6fu 127 | #define SHA256C15 0x4a7484aau 128 | #define SHA256C16 0x5cb0a9dcu 129 | #define SHA256C17 0x76f988dau 130 | #define SHA256C18 0x983e5152u 131 | #define SHA256C19 0xa831c66du 132 | #define SHA256C1a 0xb00327c8u 133 | #define SHA256C1b 0xbf597fc7u 134 | #define SHA256C1c 0xc6e00bf3u 135 | #define SHA256C1d 0xd5a79147u 136 | #define SHA256C1e 0x06ca6351u 137 | #define SHA256C1f 0x14292967u 138 | #define SHA256C20 0x27b70a85u 139 | #define SHA256C21 0x2e1b2138u 140 | #define SHA256C22 0x4d2c6dfcu 141 | #define SHA256C23 0x53380d13u 142 | #define SHA256C24 0x650a7354u 143 | #define SHA256C25 0x766a0abbu 144 | #define SHA256C26 0x81c2c92eu 145 | #define SHA256C27 0x92722c85u 146 | #define SHA256C28 0xa2bfe8a1u 147 | #define SHA256C29 0xa81a664bu 148 | #define SHA256C2a 0xc24b8b70u 149 | #define SHA256C2b 0xc76c51a3u 150 | #define SHA256C2c 0xd192e819u 151 | #define SHA256C2d 0xd6990624u 152 | #define SHA256C2e 0xf40e3585u 153 | #define SHA256C2f 0x106aa070u 154 | #define SHA256C30 0x19a4c116u 155 | #define SHA256C31 0x1e376c08u 156 | #define SHA256C32 0x2748774cu 157 | #define SHA256C33 0x34b0bcb5u 158 | #define SHA256C34 0x391c0cb3u 159 | #define SHA256C35 0x4ed8aa4au 160 | #define SHA256C36 0x5b9cca4fu 161 | #define SHA256C37 0x682e6ff3u 162 | #define SHA256C38 0x748f82eeu 163 | #define SHA256C39 0x78a5636fu 164 | #define SHA256C3a 0x84c87814u 165 | #define SHA256C3b 0x8cc70208u 166 | #define SHA256C3c 0x90befffau 167 | #define SHA256C3d 0xa4506cebu 168 | #define SHA256C3e 0xbef9a3f7u 169 | #define SHA256C3f 0xc67178f2u 170 | 171 | __constant uint k_sha256[64] = 172 | { 173 | SHA256C00, SHA256C01, SHA256C02, SHA256C03, 174 | SHA256C04, SHA256C05, SHA256C06, SHA256C07, 175 | SHA256C08, SHA256C09, SHA256C0a, SHA256C0b, 176 | SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f, 177 | SHA256C10, SHA256C11, SHA256C12, SHA256C13, 178 | SHA256C14, SHA256C15, SHA256C16, SHA256C17, 179 | SHA256C18, SHA256C19, SHA256C1a, SHA256C1b, 180 | SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f, 181 | SHA256C20, SHA256C21, SHA256C22, SHA256C23, 182 | 
SHA256C24, SHA256C25, SHA256C26, SHA256C27, 183 | SHA256C28, SHA256C29, SHA256C2a, SHA256C2b, 184 | SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f, 185 | SHA256C30, SHA256C31, SHA256C32, SHA256C33, 186 | SHA256C34, SHA256C35, SHA256C36, SHA256C37, 187 | SHA256C38, SHA256C39, SHA256C3a, SHA256C3b, 188 | SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, 189 | }; 190 | 191 | #define SHA256_STEP(F0a,F1a,a,b,c,d,e,f,g,h,x,K) \ 192 | { \ 193 | h += K; \ 194 | h += x; \ 195 | h += S3 (e); \ 196 | h += F1a (e,f,g); \ 197 | d += h; \ 198 | h += S2 (a); \ 199 | h += F0a (a,b,c); \ 200 | } 201 | 202 | #define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) 203 | 204 | static void sha256_process2 (const unsigned int *W, unsigned int *digest) 205 | { 206 | unsigned int a = digest[0]; 207 | unsigned int b = digest[1]; 208 | unsigned int c = digest[2]; 209 | unsigned int d = digest[3]; 210 | unsigned int e = digest[4]; 211 | unsigned int f = digest[5]; 212 | unsigned int g = digest[6]; 213 | unsigned int h = digest[7]; 214 | 215 | unsigned int w0_t = W[0]; 216 | unsigned int w1_t = W[1]; 217 | unsigned int w2_t = W[2]; 218 | unsigned int w3_t = W[3]; 219 | unsigned int w4_t = W[4]; 220 | unsigned int w5_t = W[5]; 221 | unsigned int w6_t = W[6]; 222 | unsigned int w7_t = W[7]; 223 | unsigned int w8_t = W[8]; 224 | unsigned int w9_t = W[9]; 225 | unsigned int wa_t = W[10]; 226 | unsigned int wb_t = W[11]; 227 | unsigned int wc_t = W[12]; 228 | unsigned int wd_t = W[13]; 229 | unsigned int we_t = W[14]; 230 | unsigned int wf_t = W[15]; 231 | 232 | #define ROUND_EXPAND(i) \ 233 | { \ 234 | w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ 235 | w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ 236 | w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ 237 | w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ 238 | w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ 239 | w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ 240 | w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ 241 | w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ 242 | w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ 243 | w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ 244 | wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ 245 | wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ 246 | wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ 247 | wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ 248 | we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ 249 | wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ 250 | } 251 | 252 | #define ROUND_STEP(i) \ 253 | { \ 254 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ 255 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ 256 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ 257 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ 258 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ 259 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ 260 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ 261 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ 262 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ 263 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ 264 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ 265 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ 266 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ 267 | SHA256_STEP (F0, F1, d, e, 
f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ 268 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ 269 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ 270 | } 271 | 272 | ROUND_STEP (0); 273 | 274 | ROUND_EXPAND(); 275 | ROUND_STEP(16); 276 | 277 | ROUND_EXPAND(); 278 | ROUND_STEP(32); 279 | 280 | ROUND_EXPAND(); 281 | ROUND_STEP(48); 282 | 283 | digest[0] += a; 284 | digest[1] += b; 285 | digest[2] += c; 286 | digest[3] += d; 287 | digest[4] += e; 288 | digest[5] += f; 289 | digest[6] += g; 290 | digest[7] += h; 291 | } 292 | 293 | #define def_hash(funcName, passTag, hashTag) \ 294 | /* The main hashing function */ \ 295 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 296 | { \ 297 | int plen=pass_len/4; \ 298 | if (mod(pass_len,4)) plen++; \ 299 | \ 300 | hashTag unsigned int* p = hash; \ 301 | \ 302 | unsigned int W[0x10]={0}; \ 303 | int loops=plen; \ 304 | int curloop=0; \ 305 | unsigned int State[8]={0}; \ 306 | State[0] = 0x6a09e667; \ 307 | State[1] = 0xbb67ae85; \ 308 | State[2] = 0x3c6ef372; \ 309 | State[3] = 0xa54ff53a; \ 310 | State[4] = 0x510e527f; \ 311 | State[5] = 0x9b05688c; \ 312 | State[6] = 0x1f83d9ab; \ 313 | State[7] = 0x5be0cd19; \ 314 | \ 315 | while (loops>0) \ 316 | { \ 317 | W[0x0]=0x0; \ 318 | W[0x1]=0x0; \ 319 | W[0x2]=0x0; \ 320 | W[0x3]=0x0; \ 321 | W[0x4]=0x0; \ 322 | W[0x5]=0x0; \ 323 | W[0x6]=0x0; \ 324 | W[0x7]=0x0; \ 325 | W[0x8]=0x0; \ 326 | W[0x9]=0x0; \ 327 | W[0xA]=0x0; \ 328 | W[0xB]=0x0; \ 329 | W[0xC]=0x0; \ 330 | W[0xD]=0x0; \ 331 | W[0xE]=0x0; \ 332 | W[0xF]=0x0; \ 333 | \ 334 | for (int m=0;loops!=0 && m<16;m++) \ 335 | { \ 336 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 337 | loops--; \ 338 | } \ 339 | \ 340 | if (loops==0 && mod(pass_len,64)!=0) \ 341 | { \ 342 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 343 | int v=mod(pass_len,64); \ 344 | W[v/4]|=SWAP(padding); \ 345 | if ((pass_len&0x3B)!=0x3B) \ 346 | { \ 347 | /* Let's add length */ \ 348 | W[0x0F]=pass_len*8; \ 349 | } \ 350 | } \ 351 | \ 352 | sha256_process2(W,State); \ 353 | curloop++; \ 354 | } \ 355 | \ 356 | if (mod(plen,16)==0) \ 357 | { \ 358 | W[0x0]=0x0; \ 359 | W[0x1]=0x0; \ 360 | W[0x2]=0x0; \ 361 | W[0x3]=0x0; \ 362 | W[0x4]=0x0; \ 363 | W[0x5]=0x0; \ 364 | W[0x6]=0x0; \ 365 | W[0x7]=0x0; \ 366 | W[0x8]=0x0; \ 367 | W[0x9]=0x0; \ 368 | W[0xA]=0x0; \ 369 | W[0xB]=0x0; \ 370 | W[0xC]=0x0; \ 371 | W[0xD]=0x0; \ 372 | W[0xE]=0x0; \ 373 | W[0xF]=0x0; \ 374 | if ((pass_len&0x3B)!=0x3B) \ 375 | { \ 376 | word padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 377 | W[0]|=SWAP(padding); \ 378 | } \ 379 | /* Let's add length */ \ 380 | W[0x0F]=pass_len*8; \ 381 | \ 382 | sha256_process2(W,State); \ 383 | } \ 384 | \ 385 | p[0]=SWAP(State[0]); \ 386 | p[1]=SWAP(State[1]); \ 387 | p[2]=SWAP(State[2]); \ 388 | p[3]=SWAP(State[3]); \ 389 | p[4]=SWAP(State[4]); \ 390 | p[5]=SWAP(State[5]); \ 391 | p[6]=SWAP(State[6]); \ 392 | p[7]=SWAP(State[7]); \ 393 | return; \ 394 | } 395 | 396 | def_hash(hash_global, __global, __global) 397 | def_hash(hash_private, __private, __private) 398 | def_hash(hash_glbl_to_priv, __global, __private) 399 | def_hash(hash_priv_to_glbl, __private, __global) 400 | 401 | #undef F0 402 | #undef F1 403 | #undef S0 404 | #undef S1 405 | #undef S2 406 | #undef S3 407 | 408 | #undef mod 409 | #undef shr32 410 | #undef rotl32 411 | 412 | __kernel void hash_main(__global const inbuf * inbuffer, __global outbuf * outbuffer) 413 | { 414 | 
unsigned int idx = get_global_id(0); 415 | // unsigned int hash[32/4]={0}; 416 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 417 | } 418 | /* 419 | pbkdf2 and HMAC implementation 420 | requires implementation of PRF (pseudo-random function), 421 | probably using HMAC and an implementation of hash_main 422 | */ 423 | /* 424 | REQ: outBuf.buffer must have space for ceil(dkLen / PRF_output_bytes) * PRF_output_bytes 425 | REQ: PRF implementation MUST allow that output may be the salt (m in hmac) 426 | inBuffer / pwdBuffer / the like are not const to allow for padding 427 | */ 428 | 429 | // Determine (statically) the actual required buffer size 430 | // Correct for both 64 & 32 bit 431 | // Just allowing for MD padding: 2 words for length, 1 for the 1-pad = 3 words 432 | #define sizeForHash(reqSize) (ceilDiv((reqSize) + 2 + 1, 16) * 16) 433 | 434 | __constant const unsigned int opad = 0x5c5c5c5c; 435 | __constant const unsigned int ipad = 0x36363636; 436 | 437 | __constant const word xoredPad = opad ^ ipad; 438 | 439 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 440 | // #define m_buffer_size (8 + 1) 441 | 442 | static void hmac(__global word *K, const word K_len_bytes, 443 | const word *m, const word m_len_bytes, word *output) 444 | { 445 | // REQ: If K_len_bytes isn't divisible by 4/8, final word should be clean (0s to the end) 446 | // REQ: s digestSize is a multiple of 4/8 bytes 447 | 448 | /* Declare the space for input to the last hash function: 449 | Compute and write K_ ^ opad to the first block of this. This will be the only place that we store K_ */ 450 | 451 | word input_2[16 + 8] = {0}; 452 | word end; 453 | if (K_len_bytes <= 64) 454 | { 455 | end = ceilDiv(K_len_bytes, wordSize); 456 | // XOR with opad and slightly pad with zeros.. 457 | input_2[0] = K[0] ^ opad; 458 | input_2[1] = K[1] ^ opad; 459 | input_2[2] = K[2] ^ opad; 460 | input_2[3] = K[3] ^ opad; 461 | input_2[4] = K[4] ^ opad; 462 | input_2[5] = K[5] ^ opad; 463 | input_2[6] = K[6] ^ opad; 464 | input_2[7] = K[7] ^ opad; 465 | input_2[8] = K[8] ^ opad; 466 | input_2[9] = K[9] ^ opad; 467 | input_2[0xA] = K[0xA] ^ opad; 468 | input_2[0xB] = K[0xB] ^ opad; 469 | input_2[0xC] = K[0xC] ^ opad; 470 | input_2[0xD] = K[0xD] ^ opad; 471 | input_2[0xE] = K[0xE] ^ opad; 472 | input_2[0xF] = K[0xF] ^ opad; 473 | } else { 474 | end = 8; 475 | // Hash K to get K'. XOR with opad.. 476 | hash_glbl_to_priv(K, K_len_bytes, input_2); 477 | input_2[0] ^= opad; 478 | input_2[1] ^= opad; 479 | input_2[2] ^= opad; 480 | input_2[3] ^= opad; 481 | input_2[4] ^= opad; 482 | input_2[5] ^= opad; 483 | input_2[6] ^= opad; 484 | input_2[7] ^= opad; 485 | input_2[8] = opad; 486 | input_2[9] = opad; 487 | input_2[0xA] = opad; 488 | input_2[0xB] = opad; 489 | input_2[0xC] = opad; 490 | input_2[0xD] = opad; 491 | input_2[0xE] = opad; 492 | input_2[0xF] = opad; 493 | } 494 | // Copy K' ^ ipad into the first block. 495 | // Be careful: hash needs a whole block after the end. 
ceilDiv from buffer_structs 496 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 497 | // #define m_buffer_size (8 + 1) 498 | // K' ^ ipad into the first block 499 | word input_1[16 + 9] = {0}; 500 | 501 | input_1[0] = input_2[0]^xoredPad; 502 | input_1[1] = input_2[1]^xoredPad; 503 | input_1[2] = input_2[2]^xoredPad; 504 | input_1[3] = input_2[3]^xoredPad; 505 | input_1[4] = input_2[4]^xoredPad; 506 | input_1[5] = input_2[5]^xoredPad; 507 | input_1[6] = input_2[6]^xoredPad; 508 | input_1[7] = input_2[7]^xoredPad; 509 | input_1[8] = input_2[8]^xoredPad; 510 | input_1[9] = input_2[9]^xoredPad; 511 | input_1[0xA] = input_2[0xA]^xoredPad; 512 | input_1[0xB] = input_2[0xB]^xoredPad; 513 | input_1[0xC] = input_2[0xC]^xoredPad; 514 | input_1[0xD] = input_2[0xD]^xoredPad; 515 | input_1[0xE] = input_2[0xE]^xoredPad; 516 | input_1[0xF] = input_2[0xF]^xoredPad; 517 | 518 | 519 | // Slightly inefficient copying m in.. 520 | word m_len_word = ceilDiv(m_len_bytes, wordSize); 521 | for (int j = 0; j < m_len_word; j++){ 522 | input_1[16 + j] = m[j]; 523 | } 524 | 525 | // Hash input1 into the second half of input2 526 | word leng = 64 + m_len_bytes; 527 | hash_private(input_1, leng, input_2 + 16); 528 | 529 | // Hash input2 into output! 530 | hash_private(input_2, 64 + 32, output); 531 | } 532 | 533 | #undef sizeForHash 534 | 535 | 536 | // PRF 537 | // Our PRF is the hmac using the hash. Commas remove need for bracketing 538 | #define PRF(pwd, pwdLen_bytes, salt, saltLen_bytes, output) \ 539 | hmac(pwd, pwdLen_bytes, salt, saltLen_bytes, output) 540 | 541 | 542 | static void F(__global word *pwd, const word pwdLen_bytes, 543 | word *salt, const word saltLen_bytes, 544 | const unsigned int iters, unsigned int callI, 545 | __global word *output) 546 | { 547 | // ASSUMPTION: salt array has wordSize bytes more room 548 | // Note salt is not const, so we can efficiently tweak the end of it 549 | 550 | // Add the integer to the end of the salt 551 | // NOTE! Always adding callI as just a u32 552 | //word overhang = saltLen_bytes % wordSize; 553 | word overhang=((saltLen_bytes)-((saltLen_bytes)/(wordSize)*(wordSize))); 554 | overhang *= 8; // convert to bits 555 | word saltLastI = saltLen_bytes / wordSize; 556 | 557 | // ! Crucial line: BE, moved as if it's a u32 but still within the word 558 | word be_callI = SWAP((word)callI) >> (8*(wordSize-4)); 559 | if (overhang>0) 560 | { 561 | salt[saltLastI] |= be_callI << overhang; 562 | salt[saltLastI+1] = be_callI >> ((8*wordSize)-overhang); 563 | } 564 | else 565 | { 566 | salt[saltLastI]=be_callI; 567 | } 568 | 569 | // Make initial call, copy into output 570 | // This copy is avoidable, but only with __global / __private macro stuff 571 | word u[8] = {0}; 572 | // +4 is correct even for 64 bit 573 | PRF(pwd, pwdLen_bytes, salt, saltLen_bytes + 4, u); 574 | output[0] = u[0]; 575 | output[1] = u[1]; 576 | output[2] = u[2]; 577 | output[3] = u[3]; 578 | output[4] = u[4]; 579 | output[5] = u[5]; 580 | output[6] = u[6]; 581 | output[7] = u[7]; 582 | 583 | // Perform all the iterations, reading salt from- AND writing to- u. 
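    // Each PRF call below is a full HMAC-SHA256 over the 64-byte padded key block plus
    // the 32-byte previous output, i.e. four sha256_process2 compressions per iteration;
    // unlike pbkdf2_sha1_32.cl, no ipad/opad midstate is precomputed here, which keeps
    // the code generic at some cost in speed.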
584 | for (unsigned int j = 1; j < iters; j++){ 585 | PRF(pwd, pwdLen_bytes, u, 32, u); 586 | output[0]^=u[0]; 587 | output[1]^=u[1]; 588 | output[2]^=u[2]; 589 | output[3]^=u[3]; 590 | output[4]^=u[4]; 591 | output[5]^=u[5]; 592 | output[6]^=u[6]; 593 | output[7]^=u[7]; 594 | } 595 | } 596 | 597 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 598 | __private unsigned int iters, __private unsigned int dkLen_bytes) 599 | { 600 | 601 | unsigned int idx = get_global_id(0); 602 | word pwdLen_bytes = inbuffer[idx].length; 603 | __global word *pwdBuffer = inbuffer[idx].buffer; 604 | __global word *currOutBuffer = outbuffer[idx].buffer; 605 | 606 | // Copy salt so that we can write our integer into the last 4 bytes 607 | word saltLen_bytes = saltbuffer[0].length; 608 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 609 | word personal_salt[8+2] = {0}; 610 | 611 | personal_salt[0] = saltbuffer[0].buffer[0]; 612 | personal_salt[1] = saltbuffer[0].buffer[1]; 613 | personal_salt[2] = saltbuffer[0].buffer[2]; 614 | personal_salt[3] = saltbuffer[0].buffer[3]; 615 | personal_salt[4] = saltbuffer[0].buffer[4]; 616 | personal_salt[5] = saltbuffer[0].buffer[5]; 617 | personal_salt[6] = saltbuffer[0].buffer[6]; 618 | personal_salt[7] = saltbuffer[0].buffer[7]; 619 | 620 | // Determine the number of calls to F that we need to make 621 | unsigned int nBlocks = ceilDiv(dkLen_bytes, 32); 622 | for (unsigned int j = 1; j <= nBlocks; j++) 623 | { 624 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, j, currOutBuffer); 625 | currOutBuffer += 8; 626 | } 627 | } 628 | 629 | 630 | // Exposing HMAC in the same way. Useful for testing atleast. 631 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 632 | { 633 | unsigned int idx = get_global_id(0); 634 | word pwdLen_bytes = inbuffer[idx].length; 635 | __global word *pwdBuffer = inbuffer[idx].buffer; 636 | 637 | // Copy salt just to cheer the compiler up 638 | int saltLen_bytes = (int)saltbuffer[0].length; 639 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 640 | word personal_salt[8] = {0}; 641 | 642 | personal_salt[0] = saltbuffer[0].buffer[0]; 643 | personal_salt[1] = saltbuffer[0].buffer[1]; 644 | personal_salt[2] = saltbuffer[0].buffer[2]; 645 | personal_salt[3] = saltbuffer[0].buffer[3]; 646 | personal_salt[4] = saltbuffer[0].buffer[4]; 647 | personal_salt[5] = saltbuffer[0].buffer[5]; 648 | personal_salt[6] = saltbuffer[0].buffer[6]; 649 | personal_salt[7] = saltbuffer[0].buffer[7]; 650 | 651 | // Call hmac, with local 652 | word out[8]; 653 | 654 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 655 | 656 | outbuffer[idx].buffer[0] = out[0]; 657 | outbuffer[idx].buffer[1] = out[1]; 658 | outbuffer[idx].buffer[2] = out[2]; 659 | outbuffer[idx].buffer[3] = out[3]; 660 | outbuffer[idx].buffer[4] = out[4]; 661 | outbuffer[idx].buffer[5] = out[5]; 662 | outbuffer[idx].buffer[6] = out[6]; 663 | outbuffer[idx].buffer[7] = out[7]; 664 | } -------------------------------------------------------------------------------- /Library/worker/generic/sCrypt.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Scrypt OpenCL Optimized kernel 3 | (c) C.B. and B. Kerler 2018-2019 4 | MIT License 5 | */ 6 | 7 | // [Lines 1 and 2 are for defining N and invMemoryDensity, and must be blank] 8 | 9 | /* 10 | sCrypt kernel.. 
or just ROMix really, for use with my sBrute PyOpenCL core 11 | Originally adapted from Bjorn Kerler's opencl_brute 12 | 13 | Follows the variable names of Wikipedia's pseudocode: 14 | https://en.wikipedia.org/wiki/Scrypt#Algorithm 15 | Function/macro convention is F(output, input_1, input_2, ..), i.e. output first. 16 | Generally work with pointers. 17 | 18 | === Design choices & reasoning ================================================= 19 | 20 | > initial and final pbkdf2s are left to python for a few reasons: 21 | - vastly simpler cl code, hopefully giving us better optimisation 22 | - reduced bugs 23 | - simpler parallelisation across the parameter 'p' 24 | - not a burden on python: work is tiny.. 25 | & the special sBrute python core is careful that any such work happens while the GPUs are busy 26 | 27 | > salsa20 is sort of in-place 28 | - fundamentally needs to copy the input internally 29 | - does (hopefully) make savings by having input = output, making the algo: 30 | orig_input <- input 31 | Process(input) // in-place 32 | input ^= orig_input 33 | where the last line should be faster than output = input ^ orig_input 34 | 35 | > JUMBLES! 36 | - jumble(Y0|Y1|..|Y_2r-1) = Y0|Y2|..|Y_2r-2 | Y1|Y3|..|Y_2r-1, 37 | which is effectively performed at the end of BlockMix in the original definition 38 | - jumble is of order 4, i.e. jumble^4 = id 39 | - we want to avoid doing this copying.. 40 | - naturally we unroll the loop in BlockMix, so reordering the input is free 41 | => all this leads to us working in 4 different states of "jumbled-ness" throughout the program 42 | - indeed our V[j]s are jumbled j % 4 times. 43 | - xoring the V[j]'s back onto a (somewhat jumbled) X in the 2nd loop effectively requires a function call 44 | 45 | > Salsa function is long, so can't be macro-ed and called lots of times. 46 | - We could have kept the BlockMix loop, 47 | but this would require reading the jumble index from an array each iteration 48 | - Instead we make Salsa a void Function 49 | - Also an xor loop is moved into Salsa, so that we can unroll it, 50 | at the small cost of an extra parameter 51 | 52 | > All values except our huge V array are kept locally. 53 | - V[j] is accessed and xored onto a local array. 54 | 55 | > After a long battle, the Salsa20/8's 4-pairs-of-rounds loop is unrolled. 56 | - Program size should still be fine. 57 | 58 | > using "= {0}" to initialise local arrays is the classic fix copied from Bjorn Kerler's code: 59 | seems to be necessary to actually make the program work, even though it should have no effect. 60 | 61 | 62 | === FIN ======================================================================== 63 | */ 64 | 65 | 66 | 67 | 68 | // =========================================================================== 69 | // 1 / memory density 70 | #ifndef invMemoryDensity 71 | #define invMemoryDensity 1 72 | #endif 73 | #define iMD_is_pow_2 (!(invMemoryDensity & (invMemoryDensity - 1)) && invMemoryDensity) 74 | 75 | 76 | // sCrypt constants : 77 | // - p irrelevant to us 78 | // - r below cannot be changed (without altering the program) 79 | // > makes the 'jumble' operation order 4 80 | // - N can be changed if necessary, up until we run out of buffer space (so maybe <= 20?) 81 | #ifndef N 82 | #define N 15 // <= 20?
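// (With N = 15 each work item's V array is 2^15 blocks of 1024 bytes = 32 MiB,
//  divided by invMemoryDensity; N = 20 would already need 1 GiB per work item at
//  density 1, hence the "<= 20?" above.)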
83 | #endif 84 | 85 | #define mod(x,y) ((x)-((x)/(y)*(y))) 86 | 87 | #define r 8 // CAN'T BE CHANGED 88 | 89 | // derivatives of constants :s 90 | #define blockSize_bytes (128 * r) // 1024 91 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 92 | #define blockSize_int32 ceilDiv(blockSize_bytes, 4) // 256 93 | #define iterations (1 << N) 94 | 95 | // Useful struct for internal processing: a lump of 64 bytes (sort of an atomic unit) 96 | typedef struct { 97 | unsigned int buffer[16]; // 64 bytes 98 | } T_Lump64; 99 | 100 | // Comfy Block struct 101 | typedef struct { 102 | T_Lump64 lump[2*r]; // 1024 bytes 103 | } T_Block; 104 | 105 | // Struct for the large V array which needs to be pseduo-randomly accessed. 106 | // Now restricted in length by invMemoryDensity 107 | typedef struct { 108 | T_Block blk[ceilDiv(iterations, invMemoryDensity)]; 109 | } T_HugeArray; 110 | 111 | 112 | 113 | 114 | 115 | 116 | // =========================================================================== 117 | // Simple macros 118 | // Lump & Block macros take pointers 119 | 120 | #define copy16_unrolled(dest,src) \ 121 | /* dest[i] = src[i] for i in [0..16) */ \ 122 | { \ 123 | dest[0] = src[0]; \ 124 | dest[1] = src[1]; \ 125 | dest[2] = src[2]; \ 126 | dest[3] = src[3]; \ 127 | dest[4] = src[4]; \ 128 | dest[5] = src[5]; \ 129 | dest[6] = src[6]; \ 130 | dest[7] = src[7]; \ 131 | dest[8] = src[8]; \ 132 | dest[9] = src[9]; \ 133 | dest[10] = src[10]; \ 134 | dest[11] = src[11]; \ 135 | dest[12] = src[12]; \ 136 | dest[13] = src[13]; \ 137 | dest[14] = src[14]; \ 138 | dest[15] = src[15]; \ 139 | } 140 | 141 | #define xor16_unrolled(dest,src) \ 142 | /* dest[i] ^= src[i] for i in [0..16) */ \ 143 | { \ 144 | dest[0] ^= src[0]; \ 145 | dest[1] ^= src[1]; \ 146 | dest[2] ^= src[2]; \ 147 | dest[3] ^= src[3]; \ 148 | dest[4] ^= src[4]; \ 149 | dest[5] ^= src[5]; \ 150 | dest[6] ^= src[6]; \ 151 | dest[7] ^= src[7]; \ 152 | dest[8] ^= src[8]; \ 153 | dest[9] ^= src[9]; \ 154 | dest[10] ^= src[10]; \ 155 | dest[11] ^= src[11]; \ 156 | dest[12] ^= src[12]; \ 157 | dest[13] ^= src[13]; \ 158 | dest[14] ^= src[14]; \ 159 | dest[15] ^= src[15]; \ 160 | } 161 | 162 | #define add16_unrolled(dest, src) \ 163 | /* dest[i] += src[i] for i in [0..16) */ \ 164 | { \ 165 | dest[0] += src[0]; \ 166 | dest[1] += src[1]; \ 167 | dest[2] += src[2]; \ 168 | dest[3] += src[3]; \ 169 | dest[4] += src[4]; \ 170 | dest[5] += src[5]; \ 171 | dest[6] += src[6]; \ 172 | dest[7] += src[7]; \ 173 | dest[8] += src[8]; \ 174 | dest[9] += src[9]; \ 175 | dest[10] += src[10]; \ 176 | dest[11] += src[11]; \ 177 | dest[12] += src[12]; \ 178 | dest[13] += src[13]; \ 179 | dest[14] += src[14]; \ 180 | dest[15] += src[15]; \ 181 | } 182 | 183 | #define copyLump64_unrolled(dest, src) \ 184 | /* &dest = &src */ \ 185 | { \ 186 | copy16_unrolled(dest->buffer, src->buffer) \ 187 | } 188 | 189 | #define xorLump64_unrolled(dest, src) \ 190 | /* &dest ^= &src */ \ 191 | { \ 192 | xor16_unrolled(dest->buffer, src->buffer) \ 193 | } 194 | 195 | #define copyBlock_halfrolled(destTag, dest, srcTag, src) \ 196 | /* [destTag] &dest = [srcTag] &src, copying lumps of 64 in a loop */ \ 197 | { \ 198 | destTag T_Lump64* _CB_d; \ 199 | srcTag T_Lump64* _CB_s; \ 200 | for (int i = 2*r - 1; i >= 0; i--) \ 201 | { \ 202 | _CB_d = &(dest)->lump[i]; \ 203 | _CB_s = &(src)->lump[i]; \ 204 | copyLump64_unrolled(_CB_d, _CB_s) \ 205 | } \ 206 | } 207 | 208 | #define xorBlock_halfrolled(destTag, dest, srcTag, src) \ 209 | /* [destTag] &dest ^= [srcTag] &src, xoring lumps of 64 in a 
loop */ \ 210 | { \ 211 | destTag T_Lump64* _XB_d; \ 212 | srcTag T_Lump64* _XB_s; \ 213 | for (int i = 2*r - 1; i >= 0; i--) \ 214 | { \ 215 | _XB_d = &(dest)->lump[i]; \ 216 | _XB_s = &(src)->lump[i]; \ 217 | xorLump64_unrolled(_XB_d, _XB_s) \ 218 | } \ 219 | } 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | // ========================================================================== 228 | // Debug printing macros 229 | 230 | #define printLump(lump) \ 231 | /* Takes the object not a pointer */ \ 232 | { \ 233 | for (int j = 0; j < 16; j++){ \ 234 | printf("%08X", lump.buffer[j]); \ 235 | } \ 236 | } 237 | 238 | #define printBlock(blk) \ 239 | /* Takes a pointer */ \ 240 | { \ 241 | for (int i = 0; i < 2*r; i++) \ 242 | { \ 243 | printLump(blk->lump[i]) \ 244 | } \ 245 | } 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | // =========================================================================== 254 | // Salsa 20/8 255 | // Adapted from https://en.wikipedia.org/wiki/Salsa20#Structure 256 | 257 | 258 | // Rotation synonym and quarter round for Salsa20 259 | #define rotl32(a,n) rotate((a), (n)) 260 | #define quarterRound(a, b, c, d) \ 261 | /**/ \ 262 | { \ 263 | b ^= rotl32(a + d, 7u); \ 264 | c ^= rotl32(b + a, 9u); \ 265 | d ^= rotl32(c + b, 13u); \ 266 | a ^= rotl32(d + c, 18u); \ 267 | } 268 | 269 | #define pairOfRounds(x) \ 270 | /* Pinched from wikipedia */ \ 271 | { \ 272 | /* Odd round */ \ 273 | quarterRound(x[ 0], x[ 4], x[ 8], x[12]); \ 274 | quarterRound(x[ 5], x[ 9], x[13], x[ 1]); \ 275 | quarterRound(x[10], x[14], x[ 2], x[ 6]); \ 276 | quarterRound(x[15], x[ 3], x[ 7], x[11]); \ 277 | /* Even round */ \ 278 | quarterRound(x[ 0], x[ 1], x[ 2], x[ 3]); \ 279 | quarterRound(x[ 5], x[ 6], x[ 7], x[ 4]); \ 280 | quarterRound(x[10], x[11], x[ 8], x[ 9]); \ 281 | quarterRound(x[15], x[12], x[13], x[14]); \ 282 | } 283 | 284 | // Function not a macro (see 'design choices' at the top) 285 | // Xors X onto lump then computes lump <- Salsa20/8(lump) 286 | void Xor_then_Salsa_20_8_InPlace(__private T_Lump64* lump, __private T_Lump64* X) 287 | { 288 | // Includes xoring here, to allow for unrolling (at expense of an extra param) 289 | xorLump64_unrolled(lump, X) 290 | 291 | // Copy input into x (lowercase) for processing 292 | unsigned int x[16] = {0}; 293 | copy16_unrolled(x, lump->buffer) 294 | 295 | // Do the 8 rounds 296 | // After much internal conflict I have unrolled this loop of 4 297 | pairOfRounds(x) 298 | pairOfRounds(x) 299 | pairOfRounds(x) 300 | pairOfRounds(x) 301 | 302 | // Add x to original input, and store into output.. which is the input :) 303 | add16_unrolled(lump->buffer, x) 304 | } 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | // ==================================================================================== 313 | // BlockMix variants 314 | // Nomenclature of the variants is composition: f_g_h(x) = f(g(h(x))) 315 | 316 | 317 | #define BlockMixLoopBody(_B_i, _BMLB_X) \ 318 | /* My heavily adapted BlockMix loop body */ \ 319 | { \ 320 | /* _B_i = _B_i ^ _BMLB_X 321 | _B_i = Salsa20(_B_i) 322 | _BMLB_X = _B_i (as pointers) 323 | [ Doesn't increment i ] 324 | */ \ 325 | Xor_then_Salsa_20_8_InPlace(_B_i, _BMLB_X);\ 326 | _BMLB_X = _B_i; \ 327 | } 328 | 329 | #define _BlockMix_Generic(B, \ 330 | i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 331 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15) \ 332 | /* Takes {i_0, .. , i_15} a permutation of {0, .. , 15}, the order of indices 333 | i_0 = 0 implied. 
*/ \ 334 | { \ 335 | /* Don't even need to copy to _BM_X, can just point! */ \ 336 | /* Start with _BM_X = B[2r-1] (indexing across blocks of 64 bytes) */ \ 337 | __private T_Lump64* _BM_X = &B->lump[i_15]; \ 338 | __private T_Lump64* _BM_B_i; \ 339 | \ 340 | /* i_0 = 0 */ \ 341 | BlockMixLoopBody(&B->lump[0], _BM_X)\ 342 | _BM_B_i = &B->lump[i_1]; \ 343 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 344 | _BM_B_i = &B->lump[i_2]; \ 345 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 346 | _BM_B_i = &B->lump[i_3]; \ 347 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 348 | \ 349 | _BM_B_i = &B->lump[i_4]; \ 350 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 351 | _BM_B_i = &B->lump[i_5]; \ 352 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 353 | _BM_B_i = &B->lump[i_6]; \ 354 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 355 | _BM_B_i = &B->lump[i_7]; \ 356 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 357 | \ 358 | _BM_B_i = &B->lump[i_8]; \ 359 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 360 | _BM_B_i = &B->lump[i_9]; \ 361 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 362 | _BM_B_i = &B->lump[i_10]; \ 363 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 364 | _BM_B_i = &B->lump[i_11]; \ 365 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 366 | \ 367 | _BM_B_i = &B->lump[i_12]; \ 368 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 369 | _BM_B_i = &B->lump[i_13]; \ 370 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 371 | _BM_B_i = &B->lump[i_14]; \ 372 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 373 | _BM_B_i = &B->lump[i_15]; \ 374 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 375 | } 376 | 377 | 378 | #define BlockMix_J3(B) \ 379 | /* 3 jumbles then a BlockMix */ \ 380 | { \ 381 | _BlockMix_Generic(B, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15) \ 382 | } 383 | 384 | #define J1_BlockMix_J2(B) \ 385 | /* Jumble twice, BlockMixes, then jumbles. */ \ 386 | { \ 387 | _BlockMix_Generic(B, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) \ 388 | } 389 | 390 | #define J2_BlockMix_J1(B) \ 391 | /* Jumbles, BlockMixes, then 2 jumbles. */ \ 392 | { \ 393 | _BlockMix_Generic(B, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) \ 394 | } 395 | 396 | #define J3_BlockMix(B) \ 397 | /* BlockMix followed by 3 jumbles (i.e. a jumble-inverse) */ \ 398 | { \ 399 | _BlockMix_Generic(B, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) \ 400 | } 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | // =============================================================================== 410 | // Integerify: gets it's own section 411 | 412 | #define Integerify(j, block) \ 413 | /* Observe that the last 64 bytes is the last lump */ \ 414 | /* Correct regardless of the jumbled-ness of the block! */ \ 415 | /* Requires N <= 32 */ \ 416 | { \ 417 | j = mod(block->lump[15].buffer[0],iterations); \ 418 | } 419 | 420 | 421 | 422 | 423 | 424 | 425 | // =============================================================================== 426 | // Xoring methods for the 4 states of jumbled-ness 427 | // Culminates in the 'recover_and_xor_appropriately' function, which selects the correct one. 428 | 429 | #define _xor_generic(dest, srcTag, src, \ 430 | i_0, i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 431 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15) \ 432 | /* dest ^= perm(src), xor permuted source on, k -> i_k the permutation. 
433 | requires src disjoint from dest : guaranteed by address spaces */ \ 434 | { \ 435 | __private T_Lump64* _XB_d; \ 436 | srcTag T_Lump64* _XB_s; \ 437 | const int perm[16] = {i_0, i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 438 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15}; \ 439 | for (int i = 2*r - 1; i >= 0; i--) \ 440 | { \ 441 | _XB_d = &(dest)->lump[i]; \ 442 | /* Select perm index instead of index */ \ 443 | _XB_s = &(src)->lump[perm[i]]; \ 444 | xorLump64_unrolled(_XB_d, _XB_s) \ 445 | } \ 446 | } 447 | 448 | #define xor_J1(dest, srcTag, src) \ 449 | { \ 450 | _xor_generic(dest, srcTag, src, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) \ 451 | } 452 | 453 | #define xor_J2(dest, srcTag, src) \ 454 | { \ 455 | _xor_generic(dest, srcTag, src, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) \ 456 | } 457 | 458 | #define xor_J3(dest, srcTag, src) \ 459 | { \ 460 | _xor_generic(dest, srcTag, src, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15) \ 461 | } 462 | 463 | // Chooses the appropriate xoring based on the supplied value diff, which is modded by 4 464 | // diff is such that jumble^diff(inp) is 'equally jumbled' as out 465 | // diff will be pseudorandom, so case statement should maximise efficiency. 466 | // Now also recomputes V'[j] from V[j // density] 467 | void recover_and_xor_appropriately(__private T_Block* dest, __global T_Block* V, 468 | unsigned int j, unsigned int diff){ 469 | 470 | // Number of computations to make. 471 | int nComps = mod(j,invMemoryDensity); 472 | int V_index = j / invMemoryDensity; 473 | 474 | if (nComps == 0){ 475 | label_nComps_is_zero: 476 | // Do the xoring directly from the global block V[V_index] 477 | // Basically the old "xor_appropriately" 478 | switch(mod(diff,4)){ 479 | case 0: 480 | xorBlock_halfrolled(__private, dest, __global, &V[V_index]) 481 | break; 482 | case 1: 483 | xor_J1(dest, __global, &V[V_index]) 484 | break; 485 | case 2: 486 | xor_J2(dest, __global, &V[V_index]) 487 | break; 488 | case 3: 489 | xor_J3(dest, __global, &V[V_index]) 490 | break; 491 | } 492 | } 493 | else 494 | { 495 | // Copy V[j/iMD] into Y, where we'll do our work 496 | // (using Bjorn's initialisation-bug-prevention once more) 497 | // Observe that this copy is pretty essential 498 | __private unsigned int _Y_bytes[ceilDiv(sizeof(T_Block), 4)] = {0}; 499 | __private T_Block* Y = (T_Block*) _Y_bytes; 500 | copyBlock_halfrolled(__private, Y, __global, &V[V_index]) 501 | 502 | // We have to decide where to enter the loop, based on how jumbled V[V_index] is 503 | // i.e. (V_index * invMemoryDensity) % 4 504 | switch(mod(j - nComps,4)){ 505 | case 0: 506 | goto label_j0; 507 | case 1: 508 | goto label_j3; 509 | case 2: 510 | goto label_j2; 511 | case 3: 512 | goto label_j1; 513 | } 514 | 515 | // Could change to nComps-- .. would save an assembly instruction? :) 516 | do { 517 | label_j0: J3_BlockMix(Y); 518 | if (--nComps == 0){ 519 | break; 520 | } 521 | 522 | label_j3: J2_BlockMix_J1(Y); 523 | if (--nComps == 0){ 524 | break; 525 | } 526 | 527 | label_j2: J1_BlockMix_J2(Y); 528 | if (--nComps == 0){ 529 | break; 530 | } 531 | 532 | label_j1: BlockMix_J3(Y); 533 | } while (--nComps > 0); 534 | 535 | 536 | // With Y = V'[j] recovered, we can finish the job off by xoring appropriately. 
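        // (Same dispatch as the nComps == 0 branch above, only xoring from the
        //  recomputed private copy Y instead of straight from global memory.)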
537 | switch(mod(diff,4)){ 538 | case 0: 539 | xorBlock_halfrolled(__private, dest, __private, Y) 540 | break; 541 | case 1: 542 | xor_J1(dest, __private, Y) 543 | break; 544 | case 2: 545 | xor_J2(dest, __private, Y) 546 | break; 547 | case 3: 548 | xor_J3(dest, __private, Y) 549 | break; 550 | } 551 | } 552 | 553 | } 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | // ================================================================================== 564 | // The big one: ROMix kernel 565 | 566 | __kernel void ROMix( __global T_Block* blocksFlat, 567 | __global T_HugeArray* hugeArraysFlat, 568 | __global T_Block* outputsFlat 569 | ) 570 | { 571 | // Get our id and so unflatten our block & huge array 'V', to get pointers 572 | // &arr[i] and arr + i should be equivalent syntax? 573 | __private unsigned int id = get_global_id(0); 574 | __global T_Block* origBlock = &blocksFlat[id]; 575 | __global T_Block* outputBlock = &outputsFlat[id]; 576 | __global T_Block* V = hugeArraysFlat[id].blk; 577 | __global T_Block* curr_V_blk = V; 578 | 579 | // Copy our block into local X : could roll fully 580 | // slightly weird to allow for Bjorn's bug-preventing-initialisation 581 | __private unsigned int _X_bytes[ceilDiv(sizeof(T_Block), 4)] = {0}; 582 | __private T_Block* X = (T_Block*) _X_bytes; 583 | copyBlock_halfrolled(__private, X, __global, origBlock) 584 | 585 | 586 | 587 | // ===================================================== 588 | // 1st loop, fill V with the correct values, in varying states of jumbled-ness: 589 | // Let V' be the correct value. d the invMemoryDensity 590 | // d*i mod 4 || state in V[i] 591 | // ============================================ 592 | // 0 || V'[d*i] 593 | // 1 || J^3(V'[d*i]) 594 | // 2 || J^2(V'[d*i]) 595 | // 3 || J^1(V'[d*i]) 596 | // Now only storing the first in every invMemoryDensity 597 | 598 | #define maybeStore(curr_V_blk, X, _j) \ 599 | /* If due, stores X to curr_V_blk and increments it */ \ 600 | { \ 601 | if (mod(_j,invMemoryDensity) == 0){ \ 602 | copyBlock_halfrolled(__global, curr_V_blk, __private, X); \ 603 | curr_V_blk++; \ 604 | } \ 605 | } 606 | 607 | // Still needs to do all 'iterations' loops, to compute the final X 608 | for (int j = 0; j < iterations; j+=4){ 609 | maybeStore(curr_V_blk, X, j) 610 | J3_BlockMix(X); 611 | 612 | maybeStore(curr_V_blk, X, j+1) 613 | J2_BlockMix_J1(X); 614 | 615 | maybeStore(curr_V_blk, X, j+2) 616 | J1_BlockMix_J2(X); 617 | 618 | maybeStore(curr_V_blk, X, j+3) 619 | BlockMix_J3(X); 620 | } 621 | 622 | #undef maybeStore 623 | 624 | 625 | // ==================================================== 626 | // 2nd loop, similarly X passes through 4 states of jumbled-ness 627 | // Observe that we need to choose our xor based on j-i % 4, 628 | // which adds more complexity compared to the first loop. 629 | 630 | // Moreover we may need to actually recompute the value. 
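    // For reference, the textbook ROMix second loop is roughly:
    //     for (i = 0; i < N; i++) { j = Integerify(X) mod N; X = BlockMix(X ^ V[j]); }
    // Here it is unrolled by 4 because X cycles through the four jumble states, and
    // V[j] may first have to be recomputed from the nearest stored block.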
631 | // => sensibly (in terms of program length) this is in "recover_and_xor_appropriately" 632 | unsigned int j; 633 | for (unsigned int i = 0; i < iterations; i+=4){ 634 | Integerify(j, X) 635 | recover_and_xor_appropriately(X, V, j, j - i); 636 | J3_BlockMix(X); 637 | 638 | Integerify(j, X); 639 | recover_and_xor_appropriately(X, V, j, j - (i+1)); 640 | J2_BlockMix_J1(X); 641 | 642 | Integerify(j, X); 643 | recover_and_xor_appropriately(X, V, j, j - (i+2)); 644 | J1_BlockMix_J2(X); 645 | 646 | Integerify(j, X); 647 | recover_and_xor_appropriately(X, V, j, j - (i+3)); 648 | BlockMix_J3(X); 649 | } 650 | 651 | // Copy to output: could roll fully 652 | copyBlock_halfrolled(__global, outputBlock, __private, X) 653 | } 654 | 655 | 656 | 657 | 658 | 659 | 660 | // =============================================================================== 661 | // For testing, Salsa20's each lump in place 662 | // Same signature as ROMix for ease 663 | __kernel void Salsa20( __global T_Block* blocksFlat, 664 | __global T_HugeArray* hugeArraysFlat, 665 | __global T_Block* outputsFlat) 666 | { 667 | __private unsigned int id = get_global_id(0); 668 | 669 | // Copy locally, initialising first for fear of bugs 670 | __private unsigned int _b[ceilDiv(sizeof(T_Block), 4)] = {0}; 671 | __private T_Block* blk = (T_Block*) _b; 672 | copyBlock_halfrolled(__private, blk, __global, (&blocksFlat[id])) 673 | 674 | // Initialise a zero lump 675 | unsigned int _z[ceilDiv(sizeof(T_Lump64), 4)] = {0}; 676 | T_Lump64* zeroLump = (T_Lump64*)_z; 677 | 678 | // Salsa each lump inPlace 679 | for (int j = 0; j < 2*r; j++) 680 | { 681 | Xor_then_Salsa_20_8_InPlace((&blk->lump[j]), zeroLump); 682 | } 683 | 684 | // Copy to output 685 | __global T_Block* output = &outputsFlat[id]; 686 | copyBlock_halfrolled(__global, output, __private, blk) 687 | } 688 | -------------------------------------------------------------------------------- /Library/worker/generic/sCrypt_Bip38fork.cl: -------------------------------------------------------------------------------- 1 | // Improved OpenCL Scrypt Kernel 2 | // Part of BTCRecover fork jeffersonn-1/btcrecover, licensed under the GNU General Public License v2.0 3 | // 2020 Jefferson Nunn and Gaith 4 | 5 | #define iterations 16384 6 | 7 | #define reorder(B) \ 8 | { \ 9 | __private uint4 tmp[4]; \ 10 | tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); \ 11 | tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); \ 12 | tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w); \ 13 | tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); \ 14 | B[0] = tmp[0]; \ 15 | B[1] = tmp[1]; \ 16 | B[2] = tmp[2]; \ 17 | B[3] = tmp[3]; \ 18 | } \ 19 | 20 | #define undo_reorder(B) \ 21 | { \ 22 | __private uint4 tmp[4]; \ 23 | tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); \ 24 | tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); \ 25 | tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); \ 26 | tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); \ 27 | B[0] = tmp[0]; \ 28 | B[1] = tmp[1]; \ 29 | B[2] = tmp[2]; \ 30 | B[3] = tmp[3]; \ 31 | } \ 32 | 33 | #define copy64(dest, idx_dest, src, idx_src) \ 34 | { \ 35 | dest[idx_dest ] = src[idx_src ]; \ 36 | dest[idx_dest + 1] = src[idx_src + 1]; \ 37 | dest[idx_dest + 2] = src[idx_src + 2]; \ 38 | dest[idx_dest + 3] = src[idx_src + 3]; \ 39 | } \ 40 | 41 | typedef struct { 42 | uint4 buf[64]; 43 | } T_Block; 44 | 45 | void salsa(__private const uint4 Bx[4], __private uint4 B[4]); 46 | void BlockMix(__private T_Block* B); 47 | 48 | void salsa(__private const uint4 Bx[4], __private uint4 B[4]) 49 | { 50 | 
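    // Computes B = Salsa20/8(B ^ Bx) in place: 'reorder' appears to gather the 4x4
    // state into its (wrapped) diagonals so each double round can be written with
    // uint4 swizzles; 'undo_reorder' restores row order before the feed-forward add.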
__private uint4 w[4]; 51 | 52 | w[0] = (B[0] ^= Bx[0]); 53 | w[1] = (B[1] ^= Bx[1]); 54 | w[2] = (B[2] ^= Bx[2]); 55 | w[3] = (B[3] ^= Bx[3]); 56 | 57 | reorder(w); 58 | 59 | /* Rounds 1 + 2 */ 60 | w[0] ^= rotate(w[3] +w[2] , 7U); 61 | w[1] ^= rotate(w[0] +w[3] , 9U); 62 | w[2] ^= rotate(w[1] +w[0] ,13U); 63 | w[3] ^= rotate(w[2] +w[1] ,18U); 64 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 65 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 66 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 67 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 68 | 69 | /* Rounds 3 + 4 */ 70 | w[0] ^= rotate(w[3] +w[2] , 7U); 71 | w[1] ^= rotate(w[0] +w[3] , 9U); 72 | w[2] ^= rotate(w[1] +w[0] ,13U); 73 | w[3] ^= rotate(w[2] +w[1] ,18U); 74 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 75 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 76 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 77 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 78 | 79 | /* Rounds 5 + 6 */ 80 | w[0] ^= rotate(w[3] +w[2] , 7U); 81 | w[1] ^= rotate(w[0] +w[3] , 9U); 82 | w[2] ^= rotate(w[1] +w[0] ,13U); 83 | w[3] ^= rotate(w[2] +w[1] ,18U); 84 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 85 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 86 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 87 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 88 | 89 | /* Rounds 7 + 8 */ 90 | w[0] ^= rotate(w[3] +w[2] , 7U); 91 | w[1] ^= rotate(w[0] +w[3] , 9U); 92 | w[2] ^= rotate(w[1] +w[0] ,13U); 93 | w[3] ^= rotate(w[2] +w[1] ,18U); 94 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 95 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 96 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 97 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 98 | 99 | undo_reorder(w); 100 | 101 | B[0] += w[0]; 102 | B[1] += w[1]; 103 | B[2] += w[2]; 104 | B[3] += w[3]; 105 | } 106 | 107 | void BlockMix(__private T_Block* B) 108 | { 109 | salsa(&B->buf[60], &B->buf[0 ]); 110 | salsa(&B->buf[0 ], &B->buf[4 ]); 111 | salsa(&B->buf[4 ], &B->buf[8 ]); 112 | salsa(&B->buf[8 ], &B->buf[12]); 113 | salsa(&B->buf[12], &B->buf[16]); 114 | salsa(&B->buf[16], &B->buf[20]); 115 | salsa(&B->buf[20], &B->buf[24]); 116 | salsa(&B->buf[24], &B->buf[28]); 117 | salsa(&B->buf[28], &B->buf[32]); 118 | salsa(&B->buf[32], &B->buf[36]); 119 | salsa(&B->buf[36], &B->buf[40]); 120 | salsa(&B->buf[40], &B->buf[44]); 121 | salsa(&B->buf[44], &B->buf[48]); 122 | salsa(&B->buf[48], &B->buf[52]); 123 | salsa(&B->buf[52], &B->buf[56]); 124 | salsa(&B->buf[56], &B->buf[60]); 125 | 126 | __private T_Block Y = *B; 127 | 128 | copy64(B->buf, 0, Y.buf, 0); 129 | copy64(B->buf, 4, Y.buf, 8); 130 | copy64(B->buf, 8, Y.buf, 16); 131 | copy64(B->buf, 12, Y.buf, 24); 132 | copy64(B->buf, 16, Y.buf, 32); 133 | copy64(B->buf, 20, Y.buf, 40); 134 | copy64(B->buf, 24, Y.buf, 48); 135 | copy64(B->buf, 28, Y.buf, 56); 136 | copy64(B->buf, 32, Y.buf, 4); 137 | copy64(B->buf, 36, Y.buf, 12); 138 | copy64(B->buf, 40, Y.buf, 20); 139 | copy64(B->buf, 44, Y.buf, 28); 140 | copy64(B->buf, 48, Y.buf, 36); 141 | copy64(B->buf, 52, Y.buf, 44); 142 | copy64(B->buf, 56, Y.buf, 52); 143 | copy64(B->buf, 60, Y.buf, 60); 144 | } 145 | 146 | __kernel void ROMix(__global T_Block* Xs, 147 | __global T_Block* Vs, 148 | __global T_Block* outputs 149 | ) 150 | { 151 | __private unsigned int id = get_global_id(0); 152 | __private T_Block X = Xs[id]; 153 | __private int i, j, k, v_idx; 154 | 155 | __private int v_idx_offset = id * iterations; 156 | 157 | for (i = 0, v_idx = v_idx_offset; i < iterations; ++i, ++v_idx) 158 | { 159 | Vs[v_idx] = X; 160 | BlockMix(&X); 161 | } 162 | 163 | for (i = 0; i < iterations; ++i) 
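    // Second ROMix loop: j = Integerify(X) mod N. Since iterations (N = 16384) is a
    // power of two, the '& (iterations - 1)' below is an exact substitute for mod.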
164 | { 165 | j = X.buf[60].x & (iterations - 1); 166 | v_idx = v_idx_offset + j; 167 | for (k = 0; k < 64; ++k) 168 | { 169 | X.buf[k] ^= Vs[v_idx].buf[k]; 170 | } 171 | BlockMix(&X); 172 | } 173 | 174 | __global T_Block* output = &outputs[id]; 175 | for (i = 0; i < 64; ++i) 176 | { 177 | output->buf[i] = X.buf[i]; 178 | } 179 | } -------------------------------------------------------------------------------- /Library/worker/generic/sha1.cl: -------------------------------------------------------------------------------- 1 | /* 2 | SHA1 OpenCL Optimized kernel 3 | (c) B. Kerler 2018 4 | MIT License 5 | */ 6 | 7 | /* 8 | (small) Changes: 9 | outbuf and inbuf structs defined using the buffer_structs_template 10 | func_sha1 renamed to hash_main 11 | hash array trimmed to size 5 12 | */ 13 | 14 | #define rotl32(a,n) rotate ((a), (n)) 15 | 16 | #define mod(x,y) ((x)-((x)/(y)*(y))) 17 | 18 | #define F2(x,y,z) ((x) ^ (y) ^ (z)) 19 | #define F1(x,y,z) (bitselect(z,y,x)) 20 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 21 | 22 | #define SHA1M_A 0x67452301u 23 | #define SHA1M_B 0xefcdab89u 24 | #define SHA1M_C 0x98badcfeu 25 | #define SHA1M_D 0x10325476u 26 | #define SHA1M_E 0xc3d2e1f0u 27 | 28 | #define SHA1C00 0x5a827999u 29 | #define SHA1C01 0x6ed9eba1u 30 | #define SHA1C02 0x8f1bbcdcu 31 | #define SHA1C03 0xca62c1d6u 32 | 33 | #define SHA1_STEP(f,a,b,c,d,e,x) \ 34 | { \ 35 | e += K; \ 36 | e += x; \ 37 | e += f (b, c, d); \ 38 | e += rotl32 (a, 5u); \ 39 | b = rotl32 (b, 30u); \ 40 | } 41 | 42 | static void sha1_process2 (const unsigned int *W, unsigned int *digest) 43 | { 44 | unsigned int A = digest[0]; 45 | unsigned int B = digest[1]; 46 | unsigned int C = digest[2]; 47 | unsigned int D = digest[3]; 48 | unsigned int E = digest[4]; 49 | 50 | unsigned int w0_t = W[0]; 51 | unsigned int w1_t = W[1]; 52 | unsigned int w2_t = W[2]; 53 | unsigned int w3_t = W[3]; 54 | unsigned int w4_t = W[4]; 55 | unsigned int w5_t = W[5]; 56 | unsigned int w6_t = W[6]; 57 | unsigned int w7_t = W[7]; 58 | unsigned int w8_t = W[8]; 59 | unsigned int w9_t = W[9]; 60 | unsigned int wa_t = W[10]; 61 | unsigned int wb_t = W[11]; 62 | unsigned int wc_t = W[12]; 63 | unsigned int wd_t = W[13]; 64 | unsigned int we_t = W[14]; 65 | unsigned int wf_t = W[15]; 66 | 67 | #undef K 68 | #define K SHA1C00 69 | 70 | SHA1_STEP (F1, A, B, C, D, E, w0_t); 71 | SHA1_STEP (F1, E, A, B, C, D, w1_t); 72 | SHA1_STEP (F1, D, E, A, B, C, w2_t); 73 | SHA1_STEP (F1, C, D, E, A, B, w3_t); 74 | SHA1_STEP (F1, B, C, D, E, A, w4_t); 75 | SHA1_STEP (F1, A, B, C, D, E, w5_t); 76 | SHA1_STEP (F1, E, A, B, C, D, w6_t); 77 | SHA1_STEP (F1, D, E, A, B, C, w7_t); 78 | SHA1_STEP (F1, C, D, E, A, B, w8_t); 79 | SHA1_STEP (F1, B, C, D, E, A, w9_t); 80 | SHA1_STEP (F1, A, B, C, D, E, wa_t); 81 | SHA1_STEP (F1, E, A, B, C, D, wb_t); 82 | SHA1_STEP (F1, D, E, A, B, C, wc_t); 83 | SHA1_STEP (F1, C, D, E, A, B, wd_t); 84 | SHA1_STEP (F1, B, C, D, E, A, we_t); 85 | SHA1_STEP (F1, A, B, C, D, E, wf_t); 86 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F1, E, A, B, C, D, w0_t); 87 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F1, D, E, A, B, C, w1_t); 88 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F1, C, D, E, A, B, w2_t); 89 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F1, B, C, D, E, A, w3_t); 90 | 91 | #undef K 92 | #define K SHA1C01 93 | 94 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w4_t); 95 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); 
SHA1_STEP (F2, E, A, B, C, D, w5_t); 96 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w6_t); 97 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w7_t); 98 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w8_t); 99 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w9_t); 100 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wa_t); 101 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wb_t); 102 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wc_t); 103 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wd_t); 104 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, A, B, C, D, E, we_t); 105 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wf_t); 106 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w0_t); 107 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w1_t); 108 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w2_t); 109 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w3_t); 110 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w4_t); 111 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w5_t); 112 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w6_t); 113 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w7_t); 114 | 115 | #undef K 116 | #define K SHA1C02 117 | 118 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w8_t); 119 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w9_t); 120 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wa_t); 121 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wb_t); 122 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wc_t); 123 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F0, A, B, C, D, E, wd_t); 124 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F0, E, A, B, C, D, we_t); 125 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wf_t); 126 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w0_t); 127 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w1_t); 128 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w2_t); 129 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w3_t); 130 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w4_t); 131 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w5_t); 132 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w6_t); 133 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w7_t); 134 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w8_t); 135 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w9_t); 136 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wa_t); 137 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP 
(F0, B, C, D, E, A, wb_t); 138 | 139 | #undef K 140 | #define K SHA1C03 141 | 142 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wc_t); 143 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wd_t); 144 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, D, E, A, B, C, we_t); 145 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wf_t); 146 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w0_t); 147 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w1_t); 148 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w2_t); 149 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w3_t); 150 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w4_t); 151 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w5_t); 152 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w6_t); 153 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w7_t); 154 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w8_t); 155 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w9_t); 156 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wa_t); 157 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wb_t); 158 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wc_t); 159 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wd_t); 160 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, C, D, E, A, B, we_t); 161 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wf_t); 162 | 163 | // Macros don't have scope, so this K was being preserved 164 | #undef K 165 | 166 | digest[0] += A; 167 | digest[1] += B; 168 | digest[2] += C; 169 | digest[3] += D; 170 | digest[4] += E; 171 | } 172 | 173 | #define def_hash(funcName, passTag, hashTag) \ 174 | /* The main hashing function */ \ 175 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 176 | { \ 177 | /* pass is only given to SWAP 178 | and hash is just assigned to p, which is only accessed by p[i] = 179 | => both tags irrelevant! 
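       (def_hash is expanded below for all four __global / __private combinations of
        password source and hash destination.)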
*/ \ 180 | \ 181 | int plen=pass_len/4; \ 182 | if (mod(pass_len,4)) plen++; \ 183 | \ 184 | hashTag unsigned int* p = hash; \ 185 | \ 186 | unsigned int W[0x10]={0}; \ 187 | int loops=plen; \ 188 | int curloop=0; \ 189 | unsigned int State[5]={0}; \ 190 | State[0] = 0x67452301; \ 191 | State[1] = 0xefcdab89; \ 192 | State[2] = 0x98badcfe; \ 193 | State[3] = 0x10325476; \ 194 | State[4] = 0xc3d2e1f0; \ 195 | \ 196 | \ 197 | while (loops>0) \ 198 | { \ 199 | W[0x0]=0x0; \ 200 | W[0x1]=0x0; \ 201 | W[0x2]=0x0; \ 202 | W[0x3]=0x0; \ 203 | W[0x4]=0x0; \ 204 | W[0x5]=0x0; \ 205 | W[0x6]=0x0; \ 206 | W[0x7]=0x0; \ 207 | W[0x8]=0x0; \ 208 | W[0x9]=0x0; \ 209 | W[0xA]=0x0; \ 210 | W[0xB]=0x0; \ 211 | W[0xC]=0x0; \ 212 | W[0xD]=0x0; \ 213 | W[0xE]=0x0; \ 214 | W[0xF]=0x0; \ 215 | \ 216 | for (int m=0;loops!=0 && m<16;m++) \ 217 | { \ 218 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 219 | loops--; \ 220 | } \ 221 | \ 222 | if (loops==0 && mod(pass_len,64)!=0) \ 223 | { \ 224 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 225 | int v=mod(pass_len,64); \ 226 | W[v/4]|=SWAP(padding); \ 227 | if ((pass_len&0x3B)!=0x3B) \ 228 | { \ 229 | /* Let's add length */ \ 230 | W[0x0F]=pass_len*8; \ 231 | } \ 232 | } \ 233 | \ 234 | sha1_process2(W,State); \ 235 | curloop++; \ 236 | } \ 237 | \ 238 | if (mod(plen,16)==0) \ 239 | { \ 240 | W[0x0]=0x0; \ 241 | W[0x1]=0x0; \ 242 | W[0x2]=0x0; \ 243 | W[0x3]=0x0; \ 244 | W[0x4]=0x0; \ 245 | W[0x5]=0x0; \ 246 | W[0x6]=0x0; \ 247 | W[0x7]=0x0; \ 248 | W[0x8]=0x0; \ 249 | W[0x9]=0x0; \ 250 | W[0xA]=0x0; \ 251 | W[0xB]=0x0; \ 252 | W[0xC]=0x0; \ 253 | W[0xD]=0x0; \ 254 | W[0xE]=0x0; \ 255 | W[0xF]=0x0; \ 256 | if ((pass_len&0x3B)!=0x3B) \ 257 | { \ 258 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 259 | W[0]|=SWAP(padding); \ 260 | } \ 261 | /* Let's add length */ \ 262 | W[0x0F]=pass_len*8; \ 263 | \ 264 | sha1_process2(W,State); \ 265 | } \ 266 | \ 267 | p[0]=SWAP(State[0]); \ 268 | p[1]=SWAP(State[1]); \ 269 | p[2]=SWAP(State[2]); \ 270 | p[3]=SWAP(State[3]); \ 271 | p[4]=SWAP(State[4]); \ 272 | return; \ 273 | } 274 | 275 | def_hash(hash_global, __global, __global) 276 | def_hash(hash_private, __private, __private) 277 | def_hash(hash_glbl_to_priv, __global, __private) 278 | def_hash(hash_priv_to_glbl, __private, __global) 279 | 280 | #undef mod 281 | 282 | #undef rotl32 283 | #undef F0 284 | #undef F1 285 | #undef F2 286 | 287 | __kernel void hash_main(__global const inbuf * inbuffer, __global outbuf * outbuffer) 288 | { 289 | unsigned int idx = get_global_id(0); 290 | 291 | // unsigned int hash[20/4]={0}; 292 | 293 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 294 | 295 | /* outbuffer[idx].buffer[0]=hash[0]; 296 | outbuffer[idx].buffer[1]=hash[1]; 297 | outbuffer[idx].buffer[2]=hash[2]; 298 | outbuffer[idx].buffer[3]=hash[3]; 299 | outbuffer[idx].buffer[4]=hash[4]; */ 300 | } 301 | -------------------------------------------------------------------------------- /Library/worker/generic/sha256.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Original: 3 | SHA1 OpenCL Optimized kernel 4 | (c) B. Kerler 2018 5 | MIT License 6 | */ 7 | 8 | /* 9 | (small) Changes: 10 | outbuf and inbuf structs defined using the buffer_structs_template 11 | func_sha256 renamed to hash_main 12 | */ 13 | 14 | /* 15 | Modified: hash_main function works for any length inputs. 
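    In particular it adds the 'slidePadding' path below: when pass_len mod 64 >= 56
    the message length field no longer fits in the final data block, so an extra
    padding-only block is processed.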
16 | */ 17 | 18 | #define F1(x,y,z) (bitselect(z,y,x)) 19 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 20 | #define mod(x,y) ((x)-((x)/(y)*(y))) 21 | #define shr32(x,n) ((x) >> (n)) 22 | #define rotl32(a,n) rotate ((a), (n)) 23 | 24 | #define S0(x) (rotl32 ((x), 25u) ^ rotl32 ((x), 14u) ^ shr32 ((x), 3u)) 25 | #define S1(x) (rotl32 ((x), 15u) ^ rotl32 ((x), 13u) ^ shr32 ((x), 10u)) 26 | #define S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) 27 | #define S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u)) 28 | 29 | #define SHA256C00 0x428a2f98u 30 | #define SHA256C01 0x71374491u 31 | #define SHA256C02 0xb5c0fbcfu 32 | #define SHA256C03 0xe9b5dba5u 33 | #define SHA256C04 0x3956c25bu 34 | #define SHA256C05 0x59f111f1u 35 | #define SHA256C06 0x923f82a4u 36 | #define SHA256C07 0xab1c5ed5u 37 | #define SHA256C08 0xd807aa98u 38 | #define SHA256C09 0x12835b01u 39 | #define SHA256C0a 0x243185beu 40 | #define SHA256C0b 0x550c7dc3u 41 | #define SHA256C0c 0x72be5d74u 42 | #define SHA256C0d 0x80deb1feu 43 | #define SHA256C0e 0x9bdc06a7u 44 | #define SHA256C0f 0xc19bf174u 45 | #define SHA256C10 0xe49b69c1u 46 | #define SHA256C11 0xefbe4786u 47 | #define SHA256C12 0x0fc19dc6u 48 | #define SHA256C13 0x240ca1ccu 49 | #define SHA256C14 0x2de92c6fu 50 | #define SHA256C15 0x4a7484aau 51 | #define SHA256C16 0x5cb0a9dcu 52 | #define SHA256C17 0x76f988dau 53 | #define SHA256C18 0x983e5152u 54 | #define SHA256C19 0xa831c66du 55 | #define SHA256C1a 0xb00327c8u 56 | #define SHA256C1b 0xbf597fc7u 57 | #define SHA256C1c 0xc6e00bf3u 58 | #define SHA256C1d 0xd5a79147u 59 | #define SHA256C1e 0x06ca6351u 60 | #define SHA256C1f 0x14292967u 61 | #define SHA256C20 0x27b70a85u 62 | #define SHA256C21 0x2e1b2138u 63 | #define SHA256C22 0x4d2c6dfcu 64 | #define SHA256C23 0x53380d13u 65 | #define SHA256C24 0x650a7354u 66 | #define SHA256C25 0x766a0abbu 67 | #define SHA256C26 0x81c2c92eu 68 | #define SHA256C27 0x92722c85u 69 | #define SHA256C28 0xa2bfe8a1u 70 | #define SHA256C29 0xa81a664bu 71 | #define SHA256C2a 0xc24b8b70u 72 | #define SHA256C2b 0xc76c51a3u 73 | #define SHA256C2c 0xd192e819u 74 | #define SHA256C2d 0xd6990624u 75 | #define SHA256C2e 0xf40e3585u 76 | #define SHA256C2f 0x106aa070u 77 | #define SHA256C30 0x19a4c116u 78 | #define SHA256C31 0x1e376c08u 79 | #define SHA256C32 0x2748774cu 80 | #define SHA256C33 0x34b0bcb5u 81 | #define SHA256C34 0x391c0cb3u 82 | #define SHA256C35 0x4ed8aa4au 83 | #define SHA256C36 0x5b9cca4fu 84 | #define SHA256C37 0x682e6ff3u 85 | #define SHA256C38 0x748f82eeu 86 | #define SHA256C39 0x78a5636fu 87 | #define SHA256C3a 0x84c87814u 88 | #define SHA256C3b 0x8cc70208u 89 | #define SHA256C3c 0x90befffau 90 | #define SHA256C3d 0xa4506cebu 91 | #define SHA256C3e 0xbef9a3f7u 92 | #define SHA256C3f 0xc67178f2u 93 | 94 | __constant uint k_sha256[64] = 95 | { 96 | SHA256C00, SHA256C01, SHA256C02, SHA256C03, 97 | SHA256C04, SHA256C05, SHA256C06, SHA256C07, 98 | SHA256C08, SHA256C09, SHA256C0a, SHA256C0b, 99 | SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f, 100 | SHA256C10, SHA256C11, SHA256C12, SHA256C13, 101 | SHA256C14, SHA256C15, SHA256C16, SHA256C17, 102 | SHA256C18, SHA256C19, SHA256C1a, SHA256C1b, 103 | SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f, 104 | SHA256C20, SHA256C21, SHA256C22, SHA256C23, 105 | SHA256C24, SHA256C25, SHA256C26, SHA256C27, 106 | SHA256C28, SHA256C29, SHA256C2a, SHA256C2b, 107 | SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f, 108 | SHA256C30, SHA256C31, SHA256C32, SHA256C33, 109 | SHA256C34, SHA256C35, SHA256C36, SHA256C37, 110 | SHA256C38, 
SHA256C39, SHA256C3a, SHA256C3b, 111 | SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, 112 | }; 113 | 114 | #define SHA256_STEP(F0a,F1a,a,b,c,d,e,f,g,h,x,K) \ 115 | { \ 116 | h += K; \ 117 | h += x; \ 118 | h += S3 (e); \ 119 | h += F1a (e,f,g); \ 120 | d += h; \ 121 | h += S2 (a); \ 122 | h += F0a (a,b,c); \ 123 | } 124 | 125 | #define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) 126 | 127 | static void sha256_process2 (const unsigned int *W, unsigned int *digest) 128 | { 129 | unsigned int a = digest[0]; 130 | unsigned int b = digest[1]; 131 | unsigned int c = digest[2]; 132 | unsigned int d = digest[3]; 133 | unsigned int e = digest[4]; 134 | unsigned int f = digest[5]; 135 | unsigned int g = digest[6]; 136 | unsigned int h = digest[7]; 137 | 138 | unsigned int w0_t = W[0]; 139 | unsigned int w1_t = W[1]; 140 | unsigned int w2_t = W[2]; 141 | unsigned int w3_t = W[3]; 142 | unsigned int w4_t = W[4]; 143 | unsigned int w5_t = W[5]; 144 | unsigned int w6_t = W[6]; 145 | unsigned int w7_t = W[7]; 146 | unsigned int w8_t = W[8]; 147 | unsigned int w9_t = W[9]; 148 | unsigned int wa_t = W[10]; 149 | unsigned int wb_t = W[11]; 150 | unsigned int wc_t = W[12]; 151 | unsigned int wd_t = W[13]; 152 | unsigned int we_t = W[14]; 153 | unsigned int wf_t = W[15]; 154 | 155 | #define ROUND_EXPAND(i) \ 156 | { \ 157 | w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ 158 | w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ 159 | w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ 160 | w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ 161 | w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ 162 | w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ 163 | w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ 164 | w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ 165 | w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ 166 | w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ 167 | wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ 168 | wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ 169 | wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ 170 | wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ 171 | we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ 172 | wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ 173 | } 174 | 175 | #define ROUND_STEP(i) \ 176 | { \ 177 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ 178 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ 179 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ 180 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ 181 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ 182 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ 183 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ 184 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ 185 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ 186 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ 187 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ 188 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ 189 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ 190 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ 191 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ 192 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ 193 | } 194 | 195 | ROUND_STEP (0); 196 | 197 | 
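    // Rounds 16..63: expand the message schedule 16 words at a time, then run the
    // next 16 compression steps.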
ROUND_EXPAND(); 198 | ROUND_STEP(16); 199 | 200 | ROUND_EXPAND(); 201 | ROUND_STEP(32); 202 | 203 | ROUND_EXPAND(); 204 | ROUND_STEP(48); 205 | 206 | digest[0] += a; 207 | digest[1] += b; 208 | digest[2] += c; 209 | digest[3] += d; 210 | digest[4] += e; 211 | digest[5] += f; 212 | digest[6] += g; 213 | digest[7] += h; 214 | } 215 | 216 | #define def_hash(funcName, passTag, hashTag) \ 217 | /* The main hashing function */ \ 218 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 219 | { \ 220 | int plen=pass_len/4; \ 221 | if (mod(pass_len,4)) plen++; \ 222 | \ 223 | unsigned int slidePadding=0; \ 224 | if (mod(pass_len,64)>=56) slidePadding=1; \ 225 | \ 226 | hashTag unsigned int* p = hash; \ 227 | \ 228 | unsigned int W[0x10]={0}; \ 229 | int loops=plen; \ 230 | int curloop=0; \ 231 | unsigned int State[8]={0}; \ 232 | State[0] = 0x6a09e667; \ 233 | State[1] = 0xbb67ae85; \ 234 | State[2] = 0x3c6ef372; \ 235 | State[3] = 0xa54ff53a; \ 236 | State[4] = 0x510e527f; \ 237 | State[5] = 0x9b05688c; \ 238 | State[6] = 0x1f83d9ab; \ 239 | State[7] = 0x5be0cd19; \ 240 | \ 241 | while (loops>0) \ 242 | { \ 243 | W[0x0]=0x0; \ 244 | W[0x1]=0x0; \ 245 | W[0x2]=0x0; \ 246 | W[0x3]=0x0; \ 247 | W[0x4]=0x0; \ 248 | W[0x5]=0x0; \ 249 | W[0x6]=0x0; \ 250 | W[0x7]=0x0; \ 251 | W[0x8]=0x0; \ 252 | W[0x9]=0x0; \ 253 | W[0xA]=0x0; \ 254 | W[0xB]=0x0; \ 255 | W[0xC]=0x0; \ 256 | W[0xD]=0x0; \ 257 | W[0xE]=0x0; \ 258 | W[0xF]=0x0; \ 259 | \ 260 | for (int m=0;loops!=0 && m<16;m++) \ 261 | { \ 262 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 263 | loops--; \ 264 | } \ 265 | \ 266 | if (loops==0 && mod(pass_len,64)!=0) \ 267 | { \ 268 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 269 | int v=mod(pass_len,64); \ 270 | W[v/4]|=SWAP(padding); \ 271 | if (slidePadding==0) \ 272 | { \ 273 | /* Let's add length */ \ 274 | W[0x0F]=pass_len*8; \ 275 | } \ 276 | } \ 277 | \ 278 | sha256_process2(W,State); \ 279 | curloop++; \ 280 | } \ 281 | \ 282 | if (slidePadding!=0) { \ 283 | W[0x0]=0x0; \ 284 | W[0x1]=0x0; \ 285 | W[0x2]=0x0; \ 286 | W[0x3]=0x0; \ 287 | W[0x4]=0x0; \ 288 | W[0x5]=0x0; \ 289 | W[0x6]=0x0; \ 290 | W[0x7]=0x0; \ 291 | W[0x8]=0x0; \ 292 | W[0x9]=0x0; \ 293 | W[0xA]=0x0; \ 294 | W[0xB]=0x0; \ 295 | W[0xC]=0x0; \ 296 | W[0xD]=0x0; \ 297 | W[0xE]=0x0; \ 298 | W[0x0F]=pass_len*8; \ 299 | \ 300 | sha256_process2(W,State); \ 301 | } else { \ 302 | if (mod(plen,16)==0) \ 303 | { \ 304 | W[0x0]=0x80000000; \ 305 | W[0x1]=0x0; \ 306 | W[0x2]=0x0; \ 307 | W[0x3]=0x0; \ 308 | W[0x4]=0x0; \ 309 | W[0x5]=0x0; \ 310 | W[0x6]=0x0; \ 311 | W[0x7]=0x0; \ 312 | W[0x8]=0x0; \ 313 | W[0x9]=0x0; \ 314 | W[0xA]=0x0; \ 315 | W[0xB]=0x0; \ 316 | W[0xC]=0x0; \ 317 | W[0xD]=0x0; \ 318 | W[0xE]=0x0; \ 319 | W[0x0F]=pass_len*8; \ 320 | \ 321 | sha256_process2(W,State); \ 322 | } \ 323 | } \ 324 | \ 325 | p[0]=SWAP(State[0]); \ 326 | p[1]=SWAP(State[1]); \ 327 | p[2]=SWAP(State[2]); \ 328 | p[3]=SWAP(State[3]); \ 329 | p[4]=SWAP(State[4]); \ 330 | p[5]=SWAP(State[5]); \ 331 | p[6]=SWAP(State[6]); \ 332 | p[7]=SWAP(State[7]); \ 333 | return; \ 334 | } 335 | 336 | def_hash(hash_global, __global, __global) 337 | def_hash(hash_private, __private, __private) 338 | def_hash(hash_glbl_to_priv, __global, __private) 339 | def_hash(hash_priv_to_glbl, __private, __global) 340 | 341 | #undef F0 342 | #undef F1 343 | #undef S0 344 | #undef S1 345 | #undef S2 346 | #undef S3 347 | 348 | #undef mod 349 | #undef shr32 350 | #undef rotl32 351 | 352 | __kernel void hash_main(__global 
const inbuf * inbuffer, __global outbuf * outbuffer) 353 | { 354 | unsigned int idx = get_global_id(0); 355 | // unsigned int hash[32/4]={0}; 356 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 357 | /* outbuffer[idx].buffer[0]=hash[0]; 358 | outbuffer[idx].buffer[1]=hash[1]; 359 | outbuffer[idx].buffer[2]=hash[2]; 360 | outbuffer[idx].buffer[3]=hash[3]; 361 | outbuffer[idx].buffer[4]=hash[4]; 362 | outbuffer[idx].buffer[5]=hash[5]; 363 | outbuffer[idx].buffer[6]=hash[6]; 364 | outbuffer[idx].buffer[7]=hash[7]; */ 365 | } 366 | -------------------------------------------------------------------------------- /Library/worker/generic/sha512.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Original copyright (sha256): 3 | OpenCL Optimized kernel 4 | (c) B. Kerler 2018 5 | MIT License 6 | 7 | Adapted for SHA512 by C.B .. apparently quite a while ago 8 | The moral of the story is always use UL on unsigned longs! 9 | */ 10 | 11 | 12 | 13 | // bitselect is "if c then b else a" for each bit 14 | // so equivalent to (c & b) | ((~c) & a) 15 | #define choose(x,y,z) (bitselect(z,y,x)) 16 | // Cleverly determines majority vote, conditioning on x=z 17 | #define bit_maj(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 18 | 19 | // Hopefully rotate works for long too? 20 | 21 | 22 | 23 | 24 | // ============================================================================== 25 | // ========= S0,S1,s0,s1 ====================================================== 26 | 27 | 28 | #define S0(x) (rotr64(x,28ul) ^ rotr64(x,34ul) ^ rotr64(x,39ul)) 29 | #define S1(x) (rotr64(x,14ul) ^ rotr64(x,18ul) ^ rotr64(x,41ul)) 30 | 31 | #define little_s0(x) (rotr64(x,1ul) ^ rotr64(x,8ul) ^ ((x) >> 7ul)) 32 | #define little_s1(x) (rotr64(x,19ul) ^ rotr64(x,61ul) ^ ((x) >> 6ul)) 33 | 34 | 35 | // ============================================================================== 36 | // ========= MD-pads the input, taken from md5.cl ============================= 37 | // Adapted for unsigned longs 38 | // Note that the padding is still in a distinct unsigned long to the appended length. 39 | 40 | 41 | // 'highBit' macro is (i+1) bytes, all 0 but the last which is 0x80 42 | // where we are thinking Little-endian thoughts. 43 | // Don't forget to call constants longs!! 44 | #define highBit(i) (0x1UL << (8*i + 7)) 45 | #define fBytes(i) (0xFFFFFFFFFFFFFFFFUL >> (8 * (8-i))) 46 | __constant unsigned long padLong[8] = { 47 | highBit(0), highBit(1), highBit(2), highBit(3), 48 | highBit(4), highBit(5), highBit(6), highBit(7) 49 | }; 50 | __constant unsigned long maskLong[8] = { 51 | 0, fBytes(1), fBytes(2), fBytes(3), // strange behaviour for fBytes(0) 52 | fBytes(4), fBytes(5), fBytes(6), fBytes(7) 53 | }; 54 | 55 | #define bs_long hashBlockSize_long64 56 | #define def_md_pad_128(funcName, tag) \ 57 | /* The standard padding, INPLACE, 58 | add a 1 bit, then little-endian original length mod 2^128 (not 64) at the end of a block 59 | RETURN number of blocks */ \ 60 | static int funcName(tag unsigned long *msg, const long msgLen_bytes) \ 61 | { \ 62 | /* Appends the 1 bit to the end, and 0s to the end of the byte */ \ 63 | const unsigned int padLongIndex = ((unsigned int)msgLen_bytes) / 8; \ 64 | const unsigned int overhang = (((unsigned int)msgLen_bytes) - padLongIndex*8); \ 65 | /* Don't assume that there are zeros here! 
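       (The caller's buffer may hold stale data past msgLen_bytes, so the tail long is
        masked down to its valid bytes before the 0x80 pad byte is OR'd in.)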
*/ \ 66 | msg[padLongIndex] &= maskLong[overhang]; \ 67 | msg[padLongIndex] |= padLong[overhang]; \ 68 | \ 69 | /* Previous code was horrible 70 | Now we zero until we reach a multiple of the block size, 71 | Skipping TWO longs to ensure there is room for the length */ \ 72 | msg[padLongIndex + 1] = 0; \ 73 | msg[padLongIndex + 2] = 0; \ 74 | unsigned int i = 0; \ 75 | for (i = padLongIndex + 3; i % bs_long != 0; i++) \ 76 | { \ 77 | msg[i] = 0; \ 78 | } \ 79 | \ 80 | /* Determine the total number of blocks */ \ 81 | int nBlocks = i / bs_long; \ 82 | /* Add the bit length to the end, 128-bit, big endian? (source wikipedia) 83 | Seemingly this does require SWAPing, so perhaps it's little-endian? */ \ 84 | msg[i-2] = 0; /* For clarity */ \ 85 | msg[i-1] = SWAP(msgLen_bytes*8); \ 86 | \ 87 | return nBlocks; \ 88 | }; 89 | 90 | // Define it with the various tags to cheer OpenCL up 91 | def_md_pad_128(md_pad__global, __global) 92 | def_md_pad_128(md_pad__private, __private) 93 | 94 | #undef bs_long 95 | #undef def_md_pad_128 96 | #undef highBit 97 | #undef fBytes 98 | 99 | 100 | 101 | 102 | // ============================================================================== 103 | 104 | __constant unsigned long k_sha256[80] = 105 | { 106 | 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, 0x3956c25bf348b538UL, 107 | 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, 0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 108 | 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL, 0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 109 | 0xc19bf174cf692694UL, 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, 110 | 0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL, 0x983e5152ee66dfabUL, 111 | 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL, 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 112 | 0x06ca6351e003826fUL, 0x142929670a0e6e70UL, 0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 113 | 0x53380d139d95b3dfUL, 0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL, 114 | 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, 0xd192e819d6ef5218UL, 115 | 0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL, 0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 116 | 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL, 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 117 | 0x682e6ff3d6b2b8a3UL, 0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL, 118 | 0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL, 0xca273eceea26619cUL, 119 | 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, 0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 120 | 0x113f9804bef90daeUL, 0x1b710b35131c471bUL, 0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 121 | 0x431d67c49c100d4cUL, 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL 122 | }; 123 | 124 | 125 | #define SHA512_STEP(a,b,c,d,e,f,g,h,x,K) \ 126 | /**/ \ 127 | { \ 128 | h += K + S1(e) + choose(e,f,g) + x; /* h = temp1 */ \ 129 | d += h; \ 130 | h += S0(a) + bit_maj(a,b,c); \ 131 | } 132 | 133 | 134 | static void printAll(unsigned long a, unsigned long b, unsigned long c, unsigned long d, 135 | unsigned long e, unsigned long f, unsigned long g, unsigned long h) 136 | { 137 | printf("a = %lX\n", a); 138 | printf("b = 
%lX\n", b); 139 | printf("c = %lX\n", c); 140 | printf("d = %lX\n", d); 141 | printf("e = %lX\n", e); 142 | printf("f = %lX\n", f); 143 | printf("g = %lX\n", g); 144 | printf("h = %lX\n\n", h); 145 | } 146 | 147 | #define ROUND_STEP(i) \ 148 | /**/ \ 149 | { \ 150 | SHA512_STEP(a, b, c, d, e, f, g, h, W[i + 0], k_sha256[i + 0]); \ 151 | SHA512_STEP(h, a, b, c, d, e, f, g, W[i + 1], k_sha256[i + 1]); \ 152 | SHA512_STEP(g, h, a, b, c, d, e, f, W[i + 2], k_sha256[i + 2]); \ 153 | SHA512_STEP(f, g, h, a, b, c, d, e, W[i + 3], k_sha256[i + 3]); \ 154 | SHA512_STEP(e, f, g, h, a, b, c, d, W[i + 4], k_sha256[i + 4]); \ 155 | SHA512_STEP(d, e, f, g, h, a, b, c, W[i + 5], k_sha256[i + 5]); \ 156 | SHA512_STEP(c, d, e, f, g, h, a, b, W[i + 6], k_sha256[i + 6]); \ 157 | SHA512_STEP(b, c, d, e, f, g, h, a, W[i + 7], k_sha256[i + 7]); \ 158 | SHA512_STEP(a, b, c, d, e, f, g, h, W[i + 8], k_sha256[i + 8]); \ 159 | SHA512_STEP(h, a, b, c, d, e, f, g, W[i + 9], k_sha256[i + 9]); \ 160 | SHA512_STEP(g, h, a, b, c, d, e, f, W[i + 10], k_sha256[i + 10]); \ 161 | SHA512_STEP(f, g, h, a, b, c, d, e, W[i + 11], k_sha256[i + 11]); \ 162 | SHA512_STEP(e, f, g, h, a, b, c, d, W[i + 12], k_sha256[i + 12]); \ 163 | SHA512_STEP(d, e, f, g, h, a, b, c, W[i + 13], k_sha256[i + 13]); \ 164 | SHA512_STEP(c, d, e, f, g, h, a, b, W[i + 14], k_sha256[i + 14]); \ 165 | SHA512_STEP(b, c, d, e, f, g, h, a, W[i + 15], k_sha256[i + 15]); \ 166 | } 167 | 168 | 169 | #define def_hash(funcName, inputTag, hashTag, mdPadFunc, printFromLongFunc) \ 170 | /* The main hashing function */ \ 171 | static void funcName(inputTag unsigned long *input, const unsigned int length, hashTag unsigned long* hash) \ 172 | { \ 173 | /* Do the padding - we weren't previously for some reason */ \ 174 | const unsigned int nBlocks = mdPadFunc(input, (const unsigned long) length); \ 175 | /*if (length == 8){ \ 176 | printf("Padded input: "); \ 177 | printFromLongFunc(input, hashBlockSize_bytes, true); \ 178 | }*/ \ 179 | \ 180 | unsigned long W[0x50]={0}; \ 181 | /* state which is repeatedly processed & added to */ \ 182 | unsigned long State[8]={0}; \ 183 | State[0] = 0x6a09e667f3bcc908UL; \ 184 | State[1] = 0xbb67ae8584caa73bUL; \ 185 | State[2] = 0x3c6ef372fe94f82bUL; \ 186 | State[3] = 0xa54ff53a5f1d36f1UL; \ 187 | State[4] = 0x510e527fade682d1UL; \ 188 | State[5] = 0x9b05688c2b3e6c1fUL; \ 189 | State[6] = 0x1f83d9abfb41bd6bUL; \ 190 | State[7] = 0x5be0cd19137e2179UL; \ 191 | \ 192 | unsigned long a,b,c,d,e,f,g,h; \ 193 | \ 194 | /* loop for each block */ \ 195 | for (int block_i = 0; block_i < nBlocks; block_i++) \ 196 | { \ 197 | /* No need to (re-)initialise W. 
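       (Only W[0..15] are overwritten per block; W[16..79] are recomputed by the
        expansion loop further down.)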
198 | Note that the input pointer is updated */ \ 199 | W[0] = SWAP(input[0]); \ 200 | W[1] = SWAP(input[1]); \ 201 | W[2] = SWAP(input[2]); \ 202 | W[3] = SWAP(input[3]); \ 203 | W[4] = SWAP(input[4]); \ 204 | W[5] = SWAP(input[5]); \ 205 | W[6] = SWAP(input[6]); \ 206 | W[7] = SWAP(input[7]); \ 207 | W[8] = SWAP(input[8]); \ 208 | W[9] = SWAP(input[9]); \ 209 | W[10] = SWAP(input[10]); \ 210 | W[11] = SWAP(input[11]); \ 211 | W[12] = SWAP(input[12]); \ 212 | W[13] = SWAP(input[13]); \ 213 | W[14] = SWAP(input[14]); \ 214 | W[15] = SWAP(input[15]); \ 215 | \ 216 | for (int i = 16; i < 80; i++) \ 217 | { \ 218 | W[i] = W[i-16] + little_s0(W[i-15]) + W[i-7] + little_s1(W[i-2]); \ 219 | } \ 220 | \ 221 | a = State[0]; \ 222 | b = State[1]; \ 223 | c = State[2]; \ 224 | d = State[3]; \ 225 | e = State[4]; \ 226 | f = State[5]; \ 227 | g = State[6]; \ 228 | h = State[7]; \ 229 | \ 230 | /* Note loop is only 5 */ \ 231 | for (int i = 0; i < 80; i += 16) \ 232 | { \ 233 | ROUND_STEP(i) \ 234 | } \ 235 | \ 236 | State[0] += a; \ 237 | State[1] += b; \ 238 | State[2] += c; \ 239 | State[3] += d; \ 240 | State[4] += e; \ 241 | State[5] += f; \ 242 | State[6] += g; \ 243 | State[7] += h; \ 244 | \ 245 | input += hashBlockSize_long64; \ 246 | } \ 247 | \ 248 | hash[0]=SWAP(State[0]); \ 249 | hash[1]=SWAP(State[1]); \ 250 | hash[2]=SWAP(State[2]); \ 251 | hash[3]=SWAP(State[3]); \ 252 | hash[4]=SWAP(State[4]); \ 253 | hash[5]=SWAP(State[5]); \ 254 | hash[6]=SWAP(State[6]); \ 255 | hash[7]=SWAP(State[7]); \ 256 | return; \ 257 | } 258 | 259 | def_hash(hash_global, __global, __global, md_pad__global, printFromLong_glbl_n) 260 | def_hash(hash_private, __private, __private, md_pad__private, printFromLong_n) 261 | def_hash(hash_glbl_to_priv, __global, __private, md_pad__global, printFromLong_glbl_n) 262 | def_hash(hash_priv_to_glbl, __private, __global, md_pad__private, printFromLong_n) 263 | 264 | #undef bit_maj 265 | #undef choose 266 | #undef S0 267 | #undef S1 268 | #undef little_s0 269 | #undef little_s1 270 | 271 | __kernel void hash_main(__global inbuf * inbuffer, __global outbuf * outbuffer) 272 | { 273 | unsigned int idx = get_global_id(0); 274 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 275 | } 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MD5,SHA1,SHA256,HMAC,PBKDF2,SCrypt Bruteforcing tools using OpenCL (GPU, yay!) and Python 2 | (c) B. Kerler and C.B. 2017-2019 3 | 4 | Why 5 | === 6 | - Because bruteforcing PBKDF2/HMAC/SCrypt and hashing MD5/SHA1/SHA256/SHA512 using just CPU sucks. 
7 | - Because Python itself is very slow for bruteforcing 8 | - Because we'd like to bruteforce using Python and not rely on other 9 | tools like Hashcat (sorry Atom :D) and do not want to compile c++ first 10 | 11 | Installation 12 | ============= 13 | - Get python >= 3.7 64-Bit 14 | 15 | Windows: 16 | - Download pyopencl-2018.2.1+cl12-cp37-cp37m-win_amd64.whl from 17 | [Here] (http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl) or use from Installer directory 18 | - Download and install the Win32 OpenCL driver (from Intel) from 19 | [Here] (http://registrationcenter-download.intel.com/akdlm/irc_nas/12512/opencl_runtime_16.1.2_x64_setup.msi) 20 | - Install pyOpenCL using: python -m pip install pyopencl-2018.2.1+cl12-cp37-cp37m-win_amd64.whl 21 | - Install scrypt using: python -m pip install scrypt 22 | 23 | Linux: 24 | ``` 25 | sudo pip3 install numpy pybind11 pycryptodome 26 | sudo apt install libssl-dev libssl 27 | sudo ldconfig 28 | sudo pip3 install scrypt 29 | sudo apt install opencl-dev && sudo pip3 install pyopencl 30 | wget http://registrationcenter-download.intel.com/akdlm/irc_nas/12556/opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz 31 | tar xzvf opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz 32 | cd opencl_runtime_16.1.2_x64_rh_6.4.0.37 33 | ./install_gui.sh 34 | ``` 35 | 36 | Run 37 | === 38 | - To test if Library works correctly, run: 39 | "python test.py" -> to print info 40 | "python test.py 0" -> to run on first platform 41 | - See test.py for example implementation, Library is in Library folder 42 | 43 | Issues 44 | ====== 45 | - Tested with : Intel CPU and GPU, NVIDIA GTX 1080 Ti, AMD 970 (HMAC fails on AMD right now) 46 | 47 | 48 | Published under MIT license 49 | Additional license limitations: No use in commercial products without prior permit. 50 | 51 | Enjoy ! 52 | -------------------------------------------------------------------------------- /examples/bruteforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # (c) 2021 B. Kerler 4 | # MIT License 5 | 6 | import threading 7 | import sys 8 | import hashlib 9 | import argparse 10 | import queue 11 | from time import perf_counter 12 | from binascii import hexlify 13 | from Library import opencl 14 | from Library.passwordutils import passwordutils 15 | 16 | 17 | def verify_set(wordlist, key, salt, hash_val): 18 | for pwd in wordlist: 19 | pw = hashlib.pbkdf2_hmac('SHA256', password=pwd, salt=salt, iterations=10000, dklen=32) 20 | if hash_val in hashlib.sha1(pw).hexdigest()[:8]: 21 | print(f'[+] correct password: {pwd}', flush=True) 22 | return pwd 23 | return b"" 24 | 25 | 26 | def setup_args(): 27 | parser = argparse.ArgumentParser(description='PW Bruteforce-Tool V1.0 (c) B. 
Kerler') 28 | parser.add_argument("-p", "--platform", required=False, help='OpenCL platform id.') 29 | parser.add_argument("-b", "--batch_size", required=False, help='Define batch_size/workgroupsize if necessary.') 30 | parser.add_argument("-m", "--minlen", required=False, help='Define PW minimum length.') 31 | parser.add_argument("-x", "--maxlen", required=False, help='Define PW maximum length.') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | class brute: 37 | def __init__(self): 38 | self.totalthreads = None 39 | self.stop = False 40 | self.passwords=queue.Queue() 41 | self.flag = None 42 | self.key = None 43 | self.salt = None 44 | self.hash_val = None 45 | self.computeunits = None 46 | self.accel = None 47 | self.totalthreads = None 48 | self.iterations = None 49 | self.args = setup_args() 50 | if self.args.batch_size is not None: 51 | self.totalthreads = self.args.batchsize 52 | if self.args.minlen is not None: 53 | self.minlen = self.args.minlen 54 | else: 55 | self.minlen = 8 56 | 57 | if self.args.maxlen is not None: 58 | self.maxlen = self.args.maxlen 59 | else: 60 | self.maxlen = 16 61 | 62 | self.debug = 0 63 | if self.args.platform is not None: 64 | self.platform = self.args.platform 65 | else: 66 | self.platform = 0 67 | 68 | self.opencl_algo = opencl.opencl_algos(self.platform, self.debug, write_combined_file=False, 69 | inv_memory_density=1) 70 | 71 | def verifypws(self): 72 | pwcount = 0 73 | start_time = perf_counter() 74 | while not self.passwords.empty(): 75 | pwlist = [] 76 | for i in range(0, self.totalthreads): 77 | if not self.stop: 78 | pw = self.passwords.get() 79 | pwlist.append(pw) 80 | pwcount += 1 81 | else: 82 | while not self.passwords.empty(): 83 | pw = self.passwords.get() 84 | pwlist.append(pw) 85 | pwcount += 1 86 | break 87 | 88 | """ 89 | Implement your algo here 90 | """ 91 | results = self.opencl_algo.cl_pbkdf2(self.ctx_pbkdf2, pwlist, self.salt, self.iterations, 32) 92 | digests = [] 93 | for result in results: 94 | digests.append(result) 95 | """ 96 | End of implementation 97 | """ 98 | 99 | if len(pwlist) > 0: 100 | elapsed_time = perf_counter() - start_time 101 | calcedpw = self.totalthreads / elapsed_time 102 | print(f"Current try : {pwlist[0].decode('utf-8')}, {calcedpw} PWs/s, " + 103 | f"{self.totalthreads} PWs/Thread, {pwcount} total PWs.") 104 | start_time = perf_counter() 105 | 106 | """ 107 | Implement your verification here 108 | """ 109 | for number, sha in enumerate(digests): 110 | if self.hash_val == sha: 111 | print(f'[+] found password: {pwlist[number]}') 112 | return pwlist[number] 113 | return None 114 | 115 | def init_gcpu(self,salt,hash_val,iterations): 116 | self.salt = salt 117 | self.hash_val = hash_val 118 | self.iterations = iterations 119 | # init opencl instance 120 | self.ctx_pbkdf2=self.opencl_algo.cl_pbkdf2_init("sha256",len(self.salt),32) 121 | 122 | if self.totalthreads is None: 123 | self.computeunits = self.opencl_algo.opencl_ctx.computeunits 124 | self.accel = max(self.computeunits // 4 * 4 // 4, 1) 125 | self.totalthreads = self.opencl_algo.opencl_ctx.workgroupsize * self.accel 126 | print(f"Using Thread size of {self.totalthreads}") 127 | 128 | def stopthread(self): 129 | self.stop = True 130 | 131 | def run(self): 132 | sys.stdin = sys.stdin.detach() 133 | self.threadLock = threading.Lock() 134 | thread1 = passwordutils(self.stopthread, self.threadLock, self.passwords, self.totalthreads, self.minlen, self.maxlen) 135 | #thread2 = passwordutils(self.passwords, self.totalthreads, self.minlen, 
self.maxlen) 136 | #thread3 = passwordutils(self.passwords, self.totalthreads, self.minlen, self.maxlen) 137 | #thread4 = passwordutils(,self.passwords, self.totalthreads, self.minlen, self.maxlen) 138 | thread1.start() 139 | #thread2.start() 140 | #thread3.start() 141 | #thread4.start() 142 | # We wait here for first passwords to arrive 143 | while self.passwords.empty(): 144 | pass 145 | start_time = perf_counter() 146 | res = self.verifypws() 147 | thread1.join() 148 | #thread2.join() 149 | #thread3.join() 150 | #thread4.join() 151 | elapsed_time = perf_counter() - start_time 152 | print(f"Total time : %f" % elapsed_time) 153 | 154 | if res == -1 or res is None: 155 | print("No password found") 156 | exit(0) 157 | 158 | 159 | if __name__ == '__main__': 160 | tb = brute() 161 | salt=b"\x12\x34\x56\x78" 162 | iterations=10000 163 | hash_val=hashlib.pbkdf2_hmac("SHA256",b"testtest",salt,iterations,32) 164 | tb.init_gcpu(salt,hash_val,iterations) 165 | tb.run() 166 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pybind11 3 | pycryptodome 4 | scrypt 5 | pyopencl -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | import sys 6 | import hashlib 7 | import hmac 8 | import scrypt 9 | import functools, operator 10 | from Library import opencl 11 | from Library.opencl_information import opencl_information 12 | from binascii import unhexlify, hexlify 13 | from collections import deque 14 | from hashlib import pbkdf2_hmac 15 | 16 | 17 | # ===================================== Test funcs ============================================= 18 | 19 | def test(hashClass, passwordlist, clresult): 20 | # Generate the correct results using hashlib 21 | correct_res = [] 22 | for pwd in passwordlist: 23 | h = hashClass() 24 | h.update(pwd) 25 | correct_res.append(h.digest()) 26 | 27 | # Determine success and print 28 | correct = [r == c for r, c in zip(clresult, correct_res)] 29 | succ = (len(passwordlist) == len(clresult)) and functools.reduce(operator.and_, correct, True) 30 | if succ: 31 | print("Ok m8!") 32 | else: 33 | print("Failed !") 34 | print(clresult[0]) 35 | print(correct_res[0]) 36 | 37 | 38 | def sha256_test(opencl_algo, passwordlist): 39 | print("Testing sha256 ..") 40 | ctx = opencl_algo.cl_sha256_init() 41 | clresult = opencl_algo.cl_sha256(ctx, passwordlist) 42 | test(hashlib.sha256, passwordlist, clresult) 43 | 44 | 45 | def sha512_test(opencl_algo, passwordlist): 46 | print("Testing sha512 ..") 47 | ctx = opencl_algo.cl_sha512_init() 48 | clresult = opencl_algo.cl_sha512(ctx, passwordlist) 49 | test(hashlib.sha512, passwordlist, clresult) 50 | 51 | 52 | def md5_test(opencl_algo, passwordlist): 53 | print("Testing md5 ..") 54 | ctx = opencl_algo.cl_md5_init() 55 | clresult = opencl_algo.cl_md5(ctx, passwordlist) 56 | test(hashlib.md5, passwordlist, clresult) 57 | 58 | 59 | def sha1_test(opencl_algo, passwordlist): 60 | print("Testing sha1 ..") 61 | ctx = opencl_algo.cl_sha1_init() 62 | clresult = opencl_algo.cl_sha1(ctx, passwordlist) 63 | test(hashlib.sha1, passwordlist, clresult) 64 | 65 | 66 | def hmac_test(passwordlist, salt, hashClass, clResult): 67 | correct_res = [] 68 | for pwd in passwordlist: 69 | 
correct_res.append(hmac.new(pwd, salt, hashClass).digest()) 70 | 71 | # Determine success and print 72 | correct = [r == c for r, c in zip(clResult, correct_res)] 73 | succ = (len(passwordlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 74 | if succ: 75 | print("Ok m9!") 76 | else: 77 | print("Failed !") 78 | print(clResult[0]) 79 | print(correct_res[0]) 80 | 81 | 82 | def md5_hmac_test(opencl_algo, passwordlist, salt): 83 | print("Testing hmac using md5.cl") 84 | ctx = opencl_algo.cl_md5_init("pbkdf2.cl") 85 | clResult = opencl_algo.cl_md5_hmac(ctx, passwordlist, salt) 86 | hmac_test(passwordlist, salt, hashlib.md5, clResult) 87 | 88 | 89 | def sha256_hmac_test(opencl_algo, passwordlist, salt): 90 | print("Testing hmac using sha256.cl") 91 | ctx = opencl_algo.cl_sha256_init("pbkdf2.cl") 92 | clResult = opencl_algo.cl_sha256_hmac(ctx, passwordlist, salt) 93 | hmac_test(passwordlist, salt, hashlib.sha256, clResult) 94 | 95 | 96 | def sha512_hmac_test(opencl_algo, passwordlist, salt): 97 | print("Testing hmac using sha512.cl") 98 | ctx = opencl_algo.cl_sha512_init("pbkdf2.cl") 99 | clResult = opencl_algo.cl_sha512_hmac(ctx, passwordlist, salt) 100 | hmac_test(passwordlist, salt, hashlib.sha512, clResult) 101 | 102 | 103 | def sha1_hmac_test(opencl_algo, passwordlist, salt): 104 | print("Testing hmac using sha1.cl") 105 | ctx = opencl_algo.cl_sha1_init("pbkdf2.cl") 106 | clResult = opencl_algo.cl_sha1_hmac(ctx, passwordlist, salt) 107 | hmac_test(passwordlist, salt, hashlib.sha1, clResult) 108 | 109 | 110 | def pbkdf2_test(passwordlist, salt, hashName, iters, dklen, clResult): 111 | correct_res = [] 112 | for pwd in passwordlist: 113 | correct_res.append(hashlib.pbkdf2_hmac(hashName, pwd, salt, iters, dklen)) 114 | 115 | # Determine success and print 116 | correct = [r == c for r, c in zip(clResult, correct_res)] 117 | succ = (len(passwordlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 118 | if succ: 119 | print("Ok m10!") 120 | else: 121 | print("Failed !") 122 | for i in range(len(passwordlist)): 123 | if clResult[i] == correct_res[i]: 124 | print("#{} succeeded".format(i)) 125 | else: 126 | print(i) 127 | print(clResult[i]) 128 | print(correct_res[i]) 129 | 130 | def pbkdf2_saltlist_test(password, saltlist, hashName, iters, dklen, clResult): 131 | correct_res = [] 132 | for salt in saltlist: 133 | correct_res.append(hashlib.pbkdf2_hmac(hashName, password, salt, iters, dklen)) 134 | 135 | # Determine success and print 136 | correct = [r == c for r, c in zip(clResult, correct_res)] 137 | succ = (len(saltlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 138 | if succ: 139 | print("Ok m10!") 140 | else: 141 | print("Failed !") 142 | for i in range(len(saltlist)): 143 | if clResult[i] == correct_res[i]: 144 | print("#{} Succeeded".format(i)) 145 | else: 146 | print("#{} Failed".format(i)) 147 | print("clResult: ", clResult[i]) 148 | print("Hashlib: ", correct_res[i]) 149 | 150 | def pbkdf2_hmac_md5_test(opencl_algo, passwordlist, salt, iters, dklen): 151 | print("Testing pbkdf2-hmac using md5.cl") 152 | ctx = opencl_algo.cl_pbkdf2_init("md5", len(salt), dklen) 153 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 154 | pbkdf2_test(passwordlist, salt, "md5", iters, dklen, clResult) 155 | 156 | 157 | def pbkdf2_hmac_sha1_test(opencl_algo, passwordlist, salt, iters, dklen): 158 | print("Testing pbkdf2-hmac using sha1.cl") 159 | ctx = opencl_algo.cl_pbkdf2_init("sha1", len(salt), dklen) 160 | 
clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 161 | pbkdf2_test(passwordlist, salt, "sha1", iters, dklen, clResult) 162 | 163 | 164 | def pbkdf2_hmac_sha256_test(opencl_algo, passwordlist, salt, iters, dklen): 165 | print("Testing pbkdf2-hmac using sha256.cl") 166 | ctx = opencl_algo.cl_pbkdf2_init("sha256", len(salt), dklen) 167 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 168 | pbkdf2_test(passwordlist, salt, "sha256", iters, dklen, clResult) 169 | 170 | def pbkdf2_hmac_sha256_speedtest(opencl_algo, passwordlist, salt, iters, dklen): 171 | print("Testing pbkdf2-hmac using sha256.cl") 172 | ctx = opencl_algo.cl_pbkdf2_init("sha256", len(salt), dklen) 173 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 174 | 175 | 176 | def pbkdf2_hmac_sha512_test(opencl_algo, passwordlist, salt, iters, dklen): 177 | print("Testing pbkdf2-hmac using sha512.cl") 178 | ctx = opencl_algo.cl_pbkdf2_init("sha512", len(salt), dklen) 179 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 180 | pbkdf2_test(passwordlist, salt, "sha512", iters, dklen, clResult) 181 | 182 | def pbkdf2_hmac_saltlist_md5_test(opencl_algo, password, saltlist, iters, dklen): 183 | print("Testing pbkdf2-hmac using md5.cl") 184 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("md5",len(password),dklen) 185 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 186 | pbkdf2_saltlist_test(password, saltlist, "md5", iters, dklen, clResult) 187 | 188 | def pbkdf2_hmac_saltlist_sha1_test(opencl_algo, password, saltlist, iters, dklen): 189 | print("Testing pbkdf2-hmac using sha1.cl") 190 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha1", len(password), dklen) 191 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 192 | pbkdf2_saltlist_test(password, saltlist, "sha1", iters, dklen, clResult) 193 | 194 | def pbkdf2_hmac_saltlist_sha256_test(opencl_algo, password, saltlist, iters, dklen): 195 | print("Testing pbkdf2-hmac using sha256.cl") 196 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha256", len(password), dklen) 197 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 198 | pbkdf2_saltlist_test(password, saltlist, "sha256", iters, dklen, clResult) 199 | 200 | def pbkdf2_hmac_saltlist_sha512_test(opencl_algo, password, saltlist, iters, dklen): 201 | print("Testing pbkdf2-hmac using sha512.cl") 202 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha512", len(password), dklen) 203 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 204 | pbkdf2_saltlist_test(password, saltlist, "sha512", iters, dklen, clResult) 205 | 206 | 207 | def scrypt_test(scrypt_opencl_algos, passwords, N_value=15, r_value=3, p_value=1, desired_key_length=32, 208 | hex_salt=unhexlify("DEADBEEFDEADBEEFDEADBEEFDEADBEEF")): 209 | print("Testing scrypt") 210 | correct_res = [] 211 | for pwd in passwords: 212 | v = scrypt.hash(pwd, hex_salt, 1 << N_value, 1 << r_value, 1 << p_value, desired_key_length) 213 | correct_res.append(v) 214 | ctx = scrypt_opencl_algos.cl_scrypt_init(N_value) 215 | clResult = scrypt_opencl_algos.cl_scrypt(ctx, passwords, N_value, r_value, p_value, desired_key_length, hex_salt) 216 | 217 | # Determine success and print 218 | correct = [r == c for r, c in zip(clResult, correct_res)] 219 | succ = (len(passwords) == len(clResult)) and functools.reduce(operator.and_, correct, True) 220 | if succ: 221 | print("Ok m11!") 222 | else: 223 | print("Failed !") 
224 | for i in range(len(passwords)): 225 | if clResult[i] == correct_res[i]: 226 | print("#{} succeeded".format(i)) 227 | else: 228 | print(i) 229 | print(clResult[i]) 230 | print(correct_res[i]) 231 | 232 | 233 | def test_iterations(passwordlist, hashClass, iters, clResult): 234 | hashlib_passwords = [] 235 | for password in passwordlist: 236 | for i in range(iters): 237 | password = hashClass(password).digest() 238 | hashlib_passwords.append(password) 239 | 240 | if clResult == hashlib_passwords: 241 | print("Ok m12!") 242 | else: 243 | print("Failed !") 244 | for i in range(len(passwordlist)): 245 | if clResult[i] == hashlib_passwords[i]: 246 | print("#{} succeeded".format(i)) 247 | else: 248 | print(i) 249 | print(clResult[i]) 250 | print(hashlib_passwords[i]) 251 | 252 | 253 | def hash_iterations_md5_test(opencl_algo, passwordlist, iters): 254 | print("Testing md5 " + str(iters) + " rounds") 255 | ctx = opencl_algo.cl_hash_iterations_init("md5") 256 | 257 | for i in range(len(passwordlist)): 258 | passwordlist[i] = hashlib.md5(passwordlist[i]).digest() 259 | 260 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 4) 261 | 262 | test_iterations(passwordlist, hashlib.md5, iters, clresult) 263 | 264 | 265 | def hash_iterations_sha1_test(opencl_algo, passwordlist, iters): 266 | print("Testing sha1 " + str(iters) + " rounds") 267 | ctx = opencl_algo.cl_hash_iterations_init("sha1") 268 | 269 | for i in range(len(passwordlist)): 270 | passwordlist[i] = hashlib.sha1(passwordlist[i]).digest() 271 | 272 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 273 | 274 | test_iterations(passwordlist, hashlib.sha1, iters, clresult) 275 | 276 | 277 | def hash_iterations_sha256_test(opencl_algo, passwordlist, iters): 278 | print("Testing sha256 " + str(iters) + " rounds") 279 | ctx = opencl_algo.cl_hash_iterations_init("sha256") 280 | 281 | for i in range(len(passwordlist)): 282 | passwordlist[i] = hashlib.sha256(passwordlist[i]).digest() 283 | 284 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 285 | 286 | test_iterations(passwordlist, hashlib.sha256, iters, clresult) 287 | 288 | 289 | def hash_iterations_sha512_test(opencl_algo, passwordlist, iters): 290 | print("Testing sha512 " + str(iters) + " rounds") 291 | ctx = opencl_algo.cl_hash_iterations_init("sha512") 292 | 293 | for i in range(len(passwordlist)): 294 | passwordlist[i] = hashlib.sha512(passwordlist[i]).digest() 295 | 296 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 297 | 298 | test_iterations(passwordlist, hashlib.sha512, iters, clresult) 299 | 300 | 301 | # =========================================================================================== 302 | 303 | def main(argv): 304 | if len(argv) < 2: 305 | print("Implementation tests") 306 | print("-----------------------------------------------------------------") 307 | info = opencl_information() 308 | info.printplatforms() 309 | print("\nPlease run as: python test.py [platform number]") 310 | return 311 | 312 | # Input values to be hashed 313 | passwordlist = [b'password', b'hmm', b'trolololl', b'madness'] 314 | salts = [b"salty123", b"salty12",b"\xd1\x0c\x00\xd2\xfe\x64\x02\x98",b"\x12\x34\x56\x78"] 315 | 316 | platform = int(argv[1]) 317 | debug = 0 318 | write_combined_file = False 319 | opencl_algos = opencl.opencl_algos(platform, debug, write_combined_file, inv_memory_density=1) 320 | # Call the tests 321 | 322 | for salt in salts: 323 | print("Using salt: %s" % salt) 324 | 
md5_test(opencl_algos, passwordlist) 325 | md5_hmac_test(opencl_algos, passwordlist, salt) 326 | pbkdf2_hmac_md5_test(opencl_algos, passwordlist, salt, 1000, 32) 327 | pbkdf2_hmac_md5_test(opencl_algos, passwordlist, salt, 1000, 50) 328 | hash_iterations_md5_test(opencl_algos, passwordlist, 10000) 329 | 330 | sha1_test(opencl_algos, passwordlist) 331 | sha1_hmac_test(opencl_algos, passwordlist, salt) 332 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, 16*b"\x00", 1000, 32) 333 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, salt, 1000, 32) 334 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, salt, 1000, 64) 335 | hash_iterations_sha1_test(opencl_algos, passwordlist, 10000) 336 | 337 | sha256_test(opencl_algos, passwordlist) 338 | sha256_hmac_test(opencl_algos, passwordlist, salt) 339 | pbkdf2_hmac_sha256_test(opencl_algos, passwordlist, salt, 10000, 32) 340 | pbkdf2_hmac_sha256_test(opencl_algos, passwordlist, salt, 10000, 50) 341 | hash_iterations_sha256_test(opencl_algos, passwordlist, 10000) 342 | 343 | sha512_test(opencl_algos, passwordlist) 344 | sha512_hmac_test(opencl_algos, passwordlist, salt) 345 | pbkdf2_hmac_sha512_test(opencl_algos, passwordlist, salt, 1000, 32) 346 | pbkdf2_hmac_sha512_test(opencl_algos, passwordlist, salt, 1000, 50) 347 | hash_iterations_sha512_test(opencl_algos, passwordlist, 10000) 348 | 349 | scrypt_test(opencl_algos, passwordlist, 15, 3, 1, 0x20, salt) 350 | 351 | print("Testing PBKDF2 with SaltList") 352 | pbkdf2_hmac_saltlist_md5_test(opencl_algos, passwordlist[0], salts, 1000, 50) 353 | pbkdf2_hmac_saltlist_sha1_test(opencl_algos, passwordlist[0], salts, 1000, 50) 354 | pbkdf2_hmac_saltlist_sha256_test(opencl_algos, passwordlist[0], salts, 1 << 16, 32) 355 | pbkdf2_hmac_saltlist_sha512_test(opencl_algos, passwordlist[0], salts, 1000, 50) 356 | 357 | """ 358 | from time import perf_counter 359 | start=perf_counter() 360 | for i in range(200000): 361 | passwordlist.append(b"test%04d" % i) 362 | pbkdf2_hmac_sha256_speedtest(opencl_algos,passwordlist,salts[0],1000,50) 363 | end=perf_counter() 364 | print("Time: %f" % (end-start)) 365 | """ 366 | print("Tests have finished.") 367 | 368 | 369 | if __name__ == '__main__': 370 | main(sys.argv) 371 | --------------------------------------------------------------------------------