├── .gitignore ├── LICENSE ├── Library ├── __init__.py ├── buffer_structs.py ├── opencl.py ├── opencl_information.py ├── passwordutils.py └── worker │ └── generic │ ├── buffer_structs_template.cl │ ├── hash_iterations.cl │ ├── hmac_qualcomm.cl │ ├── md5.cl │ ├── pbkdf2.cl │ ├── pbkdf2_sha1_32.cl │ ├── pbkdf2_sha256_32.cl │ ├── sCrypt.cl │ ├── sCrypt_Bip38fork.cl │ ├── sha1.cl │ ├── sha256.cl │ └── sha512.cl ├── README.md ├── examples └── bruteforce.py ├── requirements.txt └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Bjoern Kerler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Library/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bkerler/opencl_brute/c294a158cd56e32c8a05f88f0bebf89466513015/Library/__init__.py -------------------------------------------------------------------------------- /Library/buffer_structs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | ''' 6 | Provides a class for filling in my buffer_structs_template.cl 7 | ''' 8 | 9 | import os 10 | import re 11 | 12 | # Read the template in 13 | template = "" 14 | with open(os.path.join(os.path.dirname(__file__), "worker","generic","buffer_structs_template.cl"), "r") as rf: 15 | template = rf.read() 16 | 17 | class buffer_structs: 18 | def __init__(self): 19 | self.code = "" 20 | self.wordSize = 4 21 | 22 | def setMaxBufferSizes(self, max_in_bytes, max_out_bytes, max_salt_bytes=32, max_ct_bytes=0, max_pwd_bytes=32): 23 | # Ensure each are a multiple of 4 24 | max_in_bytes += (-max_in_bytes % self.wordSize) 25 | max_out_bytes += (-max_out_bytes % self.wordSize) 26 | max_salt_bytes += (-max_salt_bytes % self.wordSize) 27 | max_pwd_bytes += (-max_pwd_bytes % self.wordSize) 28 | 29 | self.inBufferSize_bytes = max_in_bytes 30 | self.outBufferSize_bytes = max_out_bytes 31 | self.saltBufferSize_bytes = max_salt_bytes 32 | self.pwdBufferSize_bytes = max_pwd_bytes 33 | self.inBufferSize = (max_in_bytes + 3) // self.wordSize 34 | self.outBufferSize = (max_out_bytes + 3) // self.wordSize 35 | self.saltBufferSize = (max_salt_bytes + 3) // self.wordSize 36 | self.pwdBufferSize = (max_pwd_bytes + 3) // self.wordSize 37 | self.ctBufferSize_bytes = max_ct_bytes 38 | 39 | def specifyHashSizes(self, hashBlockSize_bits, hashDigestSize_bits): 40 | self.hashBlockSize_bits = hashBlockSize_bits 41 | self.hashDigestSize_bits = hashDigestSize_bits 42 | 43 | def setBufferSizesForHashing(self, hashMaxNumBlocks): 44 | self.setMaxBufferSizes( ((self.hashBlockSize_bits + 7) // 8) * hashMaxNumBlocks, 45 | (self.hashDigestSize_bits + 7) // 8, 46 | 0) 47 | 48 | def ceilToMult(self, n, k): 49 | return n + ((-n) % k) 50 | 51 | def fill_template(self): 52 | rep = { "": str(self.hashBlockSize_bits), 53 | "" : str(self.hashDigestSize_bits), 54 | "" : str(self.inBufferSize_bytes), 55 | "" : str(self.outBufferSize_bytes), 56 | "" : str(self.saltBufferSize_bytes), 57 | "": str(self.pwdBufferSize_bytes), 58 | "" : str(self.ctBufferSize_bytes), 59 | "" : str(self.wordSize) 60 | } 61 | 62 | rep = dict((re.escape(k), v) for k, v in rep.items()) 63 | pattern = re.compile("|".join(rep.keys())) 64 | self.code = pattern.sub(lambda m: rep[re.escape(m.group(0))], template) 65 | 66 | def specifyMD5(self, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 67 | self.specifyHashSizes(512, 128) 68 | maxNumBlocks = 3 69 | self.wordSize = 4 70 | self.setBufferSizesForHashing(maxNumBlocks) 71 | max_out_bytes = self.hashDigestSize_bits // 8 72 | if dklen!=0: 73 | # Adjust output size to be a multiple of the digest 74 | max_out_bytes = 
self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 75 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 76 | self.fill_template() 77 | return max_out_bytes 78 | 79 | def specifySHA1(self, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 80 | self.specifyHashSizes(512,160) 81 | maxNumBlocks = 3 82 | self.wordSize = 4 83 | self.setBufferSizesForHashing(maxNumBlocks) 84 | max_out_bytes = self.hashDigestSize_bits // 8 85 | if dklen!=0: 86 | # Adjust output size to be a multiple of the digest 87 | max_out_bytes = self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 88 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 89 | self.fill_template() 90 | return max_out_bytes 91 | 92 | def specifySHA2(self, hashDigestSize_bits=256, max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0, max_password_bytes = 32): 93 | assert hashDigestSize_bits in [224,256,384,512] 94 | hashBlockSize_bits = 512 95 | if hashDigestSize_bits >= 384: 96 | hashBlockSize_bits = 1024 97 | self.specifyHashSizes(hashBlockSize_bits, hashDigestSize_bits) 98 | if hashDigestSize_bits==512: 99 | maxNumBlocks = 2 100 | self.wordSize = 8 101 | else: 102 | maxNumBlocks = 3 103 | self.wordSize = 4 104 | self.setBufferSizesForHashing(maxNumBlocks) 105 | max_out_bytes = self.hashDigestSize_bits // 8 106 | if dklen!=0: 107 | # Adjust output size to be a multiple of the digest 108 | max_out_bytes = self.ceilToMult(dklen, (self.hashDigestSize_bits // 8)) 109 | 110 | self.setMaxBufferSizes(max_in_bytes, max_out_bytes, max_salt_bytes, max_ct_bytes, max_password_bytes) 111 | #bufStructs.setMaxBufferSizes(128, (bufStructs.hashDigestSize_bits * 2) // 8, 128) 112 | self.fill_template() 113 | return max_out_bytes 114 | 115 | ## sha3 not worth the fuss until I write the .cl's -------------------------------------------------------------------------------- /Library/opencl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | import os 6 | from hashlib import pbkdf2_hmac 7 | from binascii import unhexlify 8 | from collections import deque 9 | from itertools import chain, repeat, zip_longest 10 | import numpy as np 11 | import pyopencl as cl 12 | from Library.buffer_structs import buffer_structs 13 | import os, re, sys, inspect 14 | current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 15 | parent_dir = os.path.dirname(current_dir) 16 | 17 | """ 18 | Original copyright: 19 | Copyright by B.Kerler 2017, PBKDF1_SHA1 and SHA256 PyOpenCl implementation, max 32 chars for password + salt 20 | MIT License 21 | Implementation was confirmed to work with 22 | Intel OpenCL on Intel(R) HD Graphics 520 and Intel(R) Core(TM) i5-6200U CPU and GeForce RTX 3080 23 | """ 24 | """ 25 | Adapted for generalising to more hash functions 26 | Allows any length input (efficiently, by declaring the max in advance) 27 | - salt ditched atm, but hoping to restore it 28 | - pbkdf2 forgotten about for now 29 | """ 30 | 31 | 32 | # Corresponding to opencl (CAN'T BE CHANGED): 33 | r = 8 34 | BLOCK_LEN_BYTES = 128 * r 35 | 36 | 37 | # Little helper, (22,5) -> 5,5,5,5,2. 
itertools is bae 38 | def take_in_chunks(n, d): 39 | assert d > 0 and n >= 0 40 | return chain(repeat(d, n // d), filter(lambda x: x != 0, [n % d])) 41 | 42 | 43 | def printif(b, s): 44 | if b: 45 | print(s) 46 | 47 | 48 | class opencl_interface: 49 | debug = False 50 | inv_memory_density = 1 51 | 52 | # Initialiser for the key properties 53 | # pbkdf related initialisation removed, will reappear somewhere else 54 | def __init__(self, platformNum, debug=0, write_combined_file=False, maxWorkgroupSize=60000, inv_memory_density=1, 55 | N_value=15, openclDevice = 0): 56 | self.workgroupsize = 0 57 | self.computeunits = 0 58 | self.wordSize = None 59 | self.N = None 60 | self.wordType = None 61 | printif(debug, "Using Platform %d:" % platformNum) 62 | devices = cl.get_platforms()[platformNum].get_devices() 63 | self.platform_number = platformNum 64 | # Show devices for the platform, and adjust workgroup size 65 | # Create the context for GPU/CPU 66 | # Adjust workgroup size so that we don't run out of RAM: 67 | # As with bench_sCrypt.py, not really working! 68 | self.sworkgroupsize = self.determine_workgroupsize(N_value) 69 | self.inv_memory_density = inv_memory_density 70 | self.ctx = cl.Context(devices) 71 | self.queue = cl.CommandQueue(self.ctx, devices[openclDevice]) 72 | self.debug = debug 73 | 74 | for device in devices: 75 | printif(debug, '--------------------------------------------------------------------------') 76 | printif(debug, ' Device - Name: ' + device.name) 77 | printif(debug, ' Device - Type: ' + cl.device_type.to_string(device.type)) 78 | printif(debug, ' Device - Compute Units: {0}'.format(device.max_compute_units)) 79 | printif(debug, ' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 80 | printif(debug, ' Device - Global memory size: {}'.format(device.global_mem_size)) 81 | printif(debug, ' Device - Local memory size: {}'.format(device.local_mem_size)) 82 | printif(debug, ' Device - Max clock frequency: {} MHz'.format(device.max_clock_frequency)) 83 | 84 | assert device.endian_little == 1, "DEVICE is not little endian : pretty sure we rely on this!" 85 | if self.workgroupsize == 0: 86 | self.workgroupsize = maxWorkgroupSize 87 | self.workgroupsize = min(self.workgroupsize, device.max_work_group_size) 88 | else: 89 | self.workgroupsize = min(self.workgroupsize, device.max_work_group_size) 90 | 91 | if self.computeunits == 0: 92 | self.computeunits = device.max_compute_units 93 | else: 94 | self.computeunits = min(self.computeunits, device.max_compute_units) 95 | 96 | # if device.max_work_group_size= 2.0: 103 | os.environ['PYOPENCL_BUILD_OPTIONS'] = "-cl-std=CL1.2" 104 | 105 | printif(debug, "\nUsing work group size of %d\n" % self.workgroupsize) 106 | 107 | # Set the debug flags 108 | os.environ['PYOPENCL_COMPILER_OUTPUT'] = str(debug) 109 | self.write_combined_file = write_combined_file 110 | 111 | def compile(self, bufferStructsObj, library_file, footer_file=None, N=15, invMemoryDensity=2): 112 | assert type(N) == int 113 | assert N < 20, "N >= 20 won't fit in a single buffer, so is unsupported. " + \ 114 | "Nothing sane should use 20, is this wickr?" 
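        # Sizing sketch behind the assert above (informational, derived from the constants in this
        # file): for the sCrypt kernels, each work-item's V array is (2**N) * BLOCK_LEN_BYTES
        # = (2**N) * 128 * r bytes with r fixed at 8, i.e. (2**N) KiB. At N = 20 that is already
        # 1 GiB per core, so a single 2**31-byte device buffer could hold at most a couple of
        # cores' worth of V -- hence the cap.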
115 | self.N = N 116 | assert bufferStructsObj is not None, "need to supply a bufferStructsObj : set all to 0 if necessary" 117 | assert bufferStructsObj.code is not None, "bufferStructsObj should be initialised" 118 | bufStructs = bufferStructsObj 119 | self.wordSize = bufStructs.wordSize 120 | 121 | # set the np word type, for use in .run 122 | npType = { 123 | 4: np.uint32, 124 | 8: np.uint64, 125 | } 126 | self.wordType = npType[self.wordSize] 127 | 128 | if footer_file != None: 129 | src = bufStructs.code 130 | else: 131 | src = "" 132 | if library_file: 133 | with open(os.path.join(current_dir, "worker", "generic", library_file), "r") as rf: 134 | src += rf.read() 135 | 136 | if footer_file: 137 | with open(os.path.join(current_dir, "worker", "generic", footer_file), "r") as rf: 138 | src += "\n" + rf.read() 139 | 140 | # Standardise to using no \r's, move to bytes to stop trickery 141 | src = src.encode("ascii") 142 | src = src.replace(b"\r\n", b"\n") 143 | 144 | # Debugging 145 | if self.write_combined_file: 146 | with open("combined_" + library_file, "wb") as wf: 147 | wf.write(src) 148 | 149 | # Convert back to text! 150 | src = src.decode("ascii") 151 | 152 | # Check that it starts with 2 newlines, for adding our defines 153 | if src.startswith("\n\n"): 154 | src = "\n\n" + src 155 | src = src[len("\n\n"):] 156 | # Prepend define N and invMemoryDensity 157 | defines = "#define N {}\n#define invMemoryDensity {}\n".format(N, invMemoryDensity) 158 | src = defines + src 159 | 160 | # Kernel function instantiation. Build returns self. 161 | prg = cl.Program(self.ctx, src).build() 162 | return prg 163 | 164 | # Forms the input buffer of derived keys 165 | # Returns the buffer and number in the buffer, <= n (iter may be exhausted) 166 | def make_input_buffer(self, dkIter, n): 167 | inpArray = bytearray() 168 | numEaten = n 169 | 170 | for i in range(n): 171 | try: 172 | dk = dkIter.__next__() 173 | except StopIteration: 174 | # Correct the chunk size and break 175 | numEaten = i 176 | break 177 | 178 | assert len(dk) == BLOCK_LEN_BYTES 179 | # , "Derived key input is length {}, when we expected {}".format(len(dk), BLOCK_LEN_BYTES) 180 | 181 | inpArray.extend(dk) 182 | 183 | # pyopencl doesn't like empty buffers, so just cheer it up 184 | # (making the buffer larger isn't an issue) 185 | if len(inpArray) == 0: 186 | inpArray = b"\x00" 187 | 188 | inp_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=inpArray) 189 | 190 | return inp_g, numEaten 191 | 192 | def run(self, bufStructs, func, pwdIter, salt=b"", paddedLenFunc=None, rtnPwds=None): 193 | # PaddedLenFunc is just for checking: lower bound with original length if not supplied 194 | wordType=self.wordType 195 | wordSize=self.wordSize 196 | ctx=self.ctx 197 | queue=self.queue 198 | hashBlockSize_bits=bufStructs.hashBlockSize_bits 199 | if not paddedLenFunc: 200 | paddedLenFunc = lambda x, bs: x 201 | 202 | # Checks on password list : not possible now we have iters! 
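        # Worked example of the padding check further down (informational): with the module-level
        # mdpad_64_func supplied as paddedLenFunc and a 64-byte hash block,
        #   mdpad_64_func(55, 64) == 64    # 55 bytes + 1 pad byte + 8 length bytes fit one block
        #   mdpad_64_func(56, 64) == 128   # one byte more spills into a second block
        # so a password is only rejected if its *padded* length exceeds inBufferSize_bytes.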
203 | 204 | inBufSize_bytes = bufStructs.inBufferSize_bytes 205 | outBufSize_bytes = bufStructs.outBufferSize_bytes 206 | outBufferSize = bufStructs.outBufferSize 207 | saltBufferSize_bytes = bufStructs.saltBufferSize_bytes 208 | 209 | # Main loop is taking chunks of at most the workgroup size 210 | while True: 211 | # Moved to bytearray initially, avoiding copying and above all 212 | # 'np.append' which is horrific 213 | pwArray = bytearray() 214 | 215 | # For each password in our chunk, process it into pwArray, with length first 216 | # Notice that this lines up with the struct declared in the .cl file! 217 | chunkSize = self.workgroupsize 218 | for i in range(self.workgroupsize): 219 | try: 220 | pw = pwdIter.__next__() 221 | # Since we take a iterator, we feed the passwords back if requested 222 | if rtnPwds is not None: 223 | rtnPwds.append(pw) 224 | except StopIteration: 225 | # Correct the chunk size and break 226 | chunkSize = i 227 | break 228 | 229 | pwLen = len(pw) 230 | # Now passing hash block size as a parameter.. could be None? 231 | assert paddedLenFunc(pwLen, hashBlockSize_bits // 8) <= inBufSize_bytes, \ 232 | "password #" + str(i) + ", '" + pw.decode() + "' (length " + str( 233 | pwLen) + ") exceeds the input buffer (length " + str(inBufSize_bytes) + ") when padded" 234 | 235 | # Add the length to our pwArray, then pad with 0s to struct size 236 | # prev code was np.array([pwLen], dtype=np.uint32), this ultimately is equivalent 237 | pwArray.extend(pwLen.to_bytes(wordSize, 'little')+pw+(b"\x00"* (inBufSize_bytes - pwLen))) 238 | 239 | if chunkSize == 0: 240 | break 241 | # print("Chunksize = {}".format(chunkSize)) 242 | 243 | # Convert the pwArray into a numpy array, just the once. 244 | # Declare the numpy array for the digest output 245 | pwArray = np.frombuffer(pwArray, dtype=wordType) 246 | result = np.zeros(outBufferSize * chunkSize, dtype=wordType) 247 | 248 | # Make the salty array, with length at the front 249 | saltLen = len(salt) 250 | saltArray = bytearray(saltLen.to_bytes(wordSize, 'little')+salt+(b"\x00" * (saltBufferSize_bytes - saltLen))) 251 | saltArray = np.frombuffer(saltArray, dtype=wordType) 252 | assert saltArray.nbytes - wordSize == saltBufferSize_bytes, "Salt doesn't fit in the " \ 253 | "buffer! " 254 | 255 | # Allocate memory for variables on the device 256 | pass_g = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=pwArray) 257 | salt_g = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=saltArray) 258 | result_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, result.nbytes) 259 | 260 | # print("=========== Initial buffers ==============") 261 | # print(" pass_g.nbytes = {}".format(pwArray.nbytes)) 262 | # print(" salt_g.nbytes = {}".format(saltArray.nbytes)) 263 | # print(" result_g.nbytes = {}".format(result.nbytes)) 264 | 265 | # Call Kernel. Automatically takes care of block/grid distribution 266 | pwdim = (chunkSize,) 267 | 268 | # Main function callback : could adapt to pass further data 269 | func(self, pwdim, pass_g, salt_g, result_g) 270 | # self.prg.hmac_main(self.queue, pwdim, None, pass_g, salt_g, result_g) 271 | 272 | # Read the results back into our array of int32s, then hexlify 273 | # Some inefficiency here, unavoidable using hexlify 274 | cl.enqueue_copy(queue, result, result_g) 275 | 276 | # Chop up into the individual hash digests, then trim to necessary hash length. 
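        # result is a flat numpy array of wordType words; each digest occupies
        # outBufSize_bytes // wordSize of those words, and bytes() on a contiguous slice
        # returns the raw bytes in host order (the constructor asserts the device is
        # little endian, so no byte swapping is needed here).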
277 | 278 | # Yield this block of results 279 | yield [bytes(result[i:i + outBufSize_bytes // wordSize]) 280 | for i in range(0, len(result), outBufSize_bytes // wordSize)] 281 | 282 | # No main return 283 | return None 284 | 285 | def run_saltlist(self, bufStructs, func, saltIter, password = b"", paddedLenFunc=None, rtnSalts=None): 286 | # PaddedLenFunc is just for checking: lower bound with original length if not supplied 287 | wordType=self.wordType 288 | wordSize=self.wordSize 289 | ctx=self.ctx 290 | queue=self.queue 291 | hashBlockSize_bits=bufStructs.hashBlockSize_bits 292 | if not paddedLenFunc: 293 | paddedLenFunc = lambda x, bs: x 294 | 295 | # Checks on password list : not possible now we have iters! 296 | 297 | inBufSize_bytes = bufStructs.inBufferSize_bytes 298 | outBufSize_bytes = bufStructs.outBufferSize_bytes 299 | outBufferSize = bufStructs.outBufferSize 300 | 301 | # Main loop is taking chunks of at most the workgroup size 302 | while True: 303 | # Moved to bytearray initially, avoiding copying and above all 304 | # 'np.append' which is horrific 305 | saltArray = bytearray() 306 | 307 | # For each password in our chunk, process it into pwArray, with length first 308 | # Notice that this lines up with the struct declared in the .cl file! 309 | chunkSize = self.workgroupsize 310 | for i in range(self.workgroupsize): 311 | try: 312 | salt = saltIter.__next__() 313 | # Since we take a iterator, we feed the passwords back if requested 314 | if rtnSalts != None: 315 | rtnSalts.append(salt) 316 | except StopIteration: 317 | # Correct the chunk size and break 318 | chunkSize = i 319 | break 320 | 321 | saltLen = len(salt) 322 | # Now passing hash block size as a parameter.. could be None? 323 | assert paddedLenFunc(saltLen, hashBlockSize_bits // 8) <= inBufSize_bytes, \ 324 | "salt #" + str(i) + ", '" + salt.decode() + "' (length " + str( 325 | saltLen) + ") exceeds the input buffer (length " + str(inBufSize_bytes) + ") when padded" 326 | 327 | # Add the length to our saltLen, then pad with 0s to struct size 328 | # prev code was np.array([saltLen], dtype=np.uint32), this ultimately is equivalent 329 | saltArray.extend((saltLen).to_bytes(self.wordSize, 'little')) 330 | saltArray.extend(salt) 331 | saltArray.extend([0] * (inBufSize_bytes - saltLen)) 332 | 333 | if chunkSize == 0: 334 | break 335 | # print("Chunksize = {}".format(chunkSize)) 336 | 337 | # Convert the pwArray into a numpy array, just the once. 338 | # Declare the numpy array for the digest output 339 | saltArray = np.frombuffer(saltArray, dtype=self.wordType) 340 | result = np.zeros(bufStructs.outBufferSize * chunkSize, dtype=self.wordType) 341 | 342 | # Make the salty array, with length at the front 343 | pwLen = len(password) 344 | pwArray = bytearray((pwLen).to_bytes(self.wordSize, 'little')) 345 | pwArray.extend(password) 346 | ##saltArray.extend(b"\x00" * ((-saltLen) % 4)) 347 | pwArray.extend(b"\x00" * (bufStructs.pwdBufferSize_bytes - pwLen)) 348 | pwArray = np.frombuffer(pwArray, dtype=self.wordType) 349 | assert pwArray.nbytes - self.wordSize == bufStructs.pwdBufferSize_bytes, "Salt doesn't fit in the buffer!" 
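        # Layout note: pwArray mirrors the pwdbuf struct in buffer_structs_template.cl --
        # one word holding the length in bytes, then the password itself, zero-padded out to
        # pwdBufferSize_bytes; the assert above checks exactly that total size.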
350 | 351 | # Allocate memory for variables on the device 352 | pass_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=pwArray) 353 | salt_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=saltArray) 354 | result_g = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, result.nbytes) 355 | 356 | # print("=========== Initial buffers ==============") 357 | # print(" pass_g.nbytes = {}".format(pwArray.nbytes)) 358 | # print(" salt_g.nbytes = {}".format(saltArray.nbytes)) 359 | # print(" result_g.nbytes = {}".format(result.nbytes)) 360 | 361 | # Call Kernel. Automatically takes care of block/grid distribution 362 | pwdim = (chunkSize,) 363 | 364 | # Main function callback : could adapt to pass further data 365 | func(self, pwdim, pass_g, salt_g, result_g) 366 | ##self.prg.hmac_main(self.queue, pwdim, None, pass_g, salt_g, result_g) 367 | 368 | # Read the results back into our array of int32s, then hexlify 369 | # Some inefficiency here, unavoidable using hexlify 370 | cl.enqueue_copy(self.queue, result, result_g) 371 | 372 | # hexvalue = hexlify(result) 373 | 374 | # Chop up into the individual hash digests, then trim to necessary hash length. 375 | results = [] 376 | # for i in range(0, len(hexvalue), outBufSize_hs): 377 | # hexRes = hexvalue[i:i + outBufSize_hs].decode() 378 | # results.append(hexRes) 379 | 380 | for i in range(0, len(result), outBufSize_bytes // bufStructs.wordSize): 381 | v = bytes(result[i:i + outBufSize_bytes // bufStructs.wordSize]) 382 | results.append(v) 383 | 384 | # Yield this block of results 385 | yield results 386 | 387 | # No main return 388 | return None 389 | 390 | def determine_workgroupsize(self, N_value=15): 391 | devices = cl.get_platforms()[self.platform_number].get_devices() 392 | wgSize = 0 393 | for device in devices: 394 | # Actually adjust based on invMemoryDensity! 395 | N_blocks_bytes = (1 << N_value) * BLOCK_LEN_BYTES // self.inv_memory_density 396 | memoryForOneCore = BLOCK_LEN_BYTES * 2 + N_blocks_bytes # input, output & V 397 | 398 | ## ! Restrict to 98% of avaiable memory 399 | coresOnDevice = (int(0.98 * device.global_mem_size) // memoryForOneCore) 400 | percentUsage = 100 * memoryForOneCore * coresOnDevice / device.global_mem_size 401 | percentUsage = str(percentUsage)[:4] 402 | if self.debug == 1: 403 | print("Using {} cores on device with global memory {}, = {}%".format( 404 | coresOnDevice, device.global_mem_size, percentUsage 405 | )) 406 | wgSize += coresOnDevice 407 | 408 | if self.debug == 1: 409 | print("Workgroup size determined as {}".format(wgSize)) 410 | 411 | return wgSize 412 | 413 | def run_scrypt(self, sprg, kernelCall, dkIter): 414 | N_blocks_bytes = (1 << self.N) * BLOCK_LEN_BYTES 415 | 416 | # no. of cores' memory that we can fit into a single buffer 417 | # (seemingly anyway, why isn't it 2^31?) 418 | # note: this is NOT the workgroupsize, nor does it bound it 419 | maxGangSize = (1 << 31) // N_blocks_bytes 420 | assert maxGangSize > 0, "Uh-oh we couldn't fit a single core's V in a buffer." 421 | 422 | # A. Before the loop we produce our huge buffers, once only. 423 | # B. Also make our output buffers & numpys now, just once, to save work 424 | # Note these will be atleast big enough throughout the loop: sometimes they'll have extra room. 
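        # Example of the gang split below (values per the take_in_chunks helper at the top of
        # this file): with sworkgroupsize = 22 and maxGangSize = 5 the chunks are 5, 5, 5, 5, 2,
        # so five V buffers and five output buffers are created, each sized for its own gang.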
425 | largeBuffers = [] 426 | outBuffers = [] 427 | outNumpys = [] 428 | outSizes = [] 429 | for gangSize in take_in_chunks(self.sworkgroupsize, maxGangSize): 430 | # Produce the large buffer for storing this gang's V arrays 431 | # No longer producing a big bytes object in Python 432 | 433 | # arr = np.frombuffer(bytes(gangSize * N_blocks_bytes), dtype=np.uint32) 434 | # Why is this read only? 435 | arr_g = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=gangSize * N_blocks_bytes) 436 | largeBuffers.append(arr_g) 437 | 438 | # Produce the gang's output buffer and (small) numpy array to copy out to 439 | nBytes = BLOCK_LEN_BYTES * gangSize 440 | result = np.zeros(nBytes // 4, dtype=np.uint32) 441 | assert nBytes == result.nbytes 442 | result_g = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, nBytes) 443 | outBuffers.append(result_g) 444 | outNumpys.append(result) 445 | 446 | # No output from round 0! 447 | outSizes.append(0) 448 | 449 | # ! For minimal latency, we only block just before our next calls to the kernels: 450 | # there is basically no work between the two. 451 | 452 | # Main loop is taking chunks of workgroup size, 453 | # or less if we exhaust the input derived keys iter 454 | iterActive = True 455 | while iterActive: 456 | # 1. Make New SMALL input buffers (derived key buffer, was password & salt) 457 | # if we exhaust dkIter, continue producing 'empty' input buffers, but mark to leave the main loop 458 | newInputs = [] 459 | inCounts = [] 460 | for gangSize in take_in_chunks(self.sworkgroupsize, maxGangSize): 461 | input_g, numEaten = self.make_input_buffer(dkIter, gangSize) 462 | iterActive = (numEaten == gangSize) # note gangSize > 0, so once False this will persist 463 | newInputs.append(input_g) 464 | inCounts.append(numEaten) 465 | 466 | # 2. (BLOCKING) wait for all our workers to finish (should be at similar times), 467 | # and copy output buffers out to numpy (minimal time loss here, could use 468 | # 2 sets of output buffers instead) 469 | # Note we may well have copied too much: this is dealt with in 4. below 470 | for outSize, outNumpy, outBuf in zip_longest(outSizes, outNumpys, outBuffers): 471 | if outSize > 0: 472 | cl.enqueue_copy(self.queue, outNumpy, outBuf) # is_blocking defaults to true :) 473 | 474 | # print("Calling kernels..") 475 | # 3. (NON-BLOCKING) queue the kernel calls 476 | for input_g, arr_g, result_g, inCount in zip_longest(newInputs, largeBuffers, outBuffers, inCounts): 477 | if inCount > 0: 478 | dim = (inCount,) 479 | # print("inCount = {}".format(inCount)) 480 | # print("All sizes in bytes (hopefully):") 481 | # print("input_g.size = {}".format(input_g.size)) 482 | # print("arr_g.size = {}".format(arr_g.size)) 483 | # print("result_g.size = {}".format(result_g.size)) 484 | # print("\nOpenCL code now:\n") 485 | kernelCall(sprg, (self.queue, dim, None, input_g, arr_g, result_g)) 486 | # print("Kernels running..") 487 | 488 | # 4. 
Process the outputs from the last round, yielding now (while the GPUs are busy) 489 | # Also copy the input counts across to output sizes, for the next loop / final processing below 490 | for i, (outNumpy, inCount) in enumerate(zip_longest(outNumpys, inCounts)): 491 | outSize = outSizes[i] 492 | 493 | assert outSize % BLOCK_LEN_BYTES == 0 494 | outBytes = outNumpy.tobytes() 495 | for j in range(0, outSize, BLOCK_LEN_BYTES): 496 | yield outBytes[j:j + BLOCK_LEN_BYTES] 497 | 498 | outSizes[i] = inCount * BLOCK_LEN_BYTES 499 | 500 | # Note that if exiting here then we've updated the outSizes & called the functions 501 | # Just remains to capture & process the output.. 502 | 503 | # print("Dropped out of loop") 504 | # Do a final loop of processing output (3 & 2) 505 | for outBuf, outNumpy, outSize in zip_longest(outBuffers, outNumpys, outSizes): 506 | # (BLOCKING) Copy! 507 | cl.enqueue_copy(self.queue, outNumpy, outBuf) 508 | 509 | assert outSize % BLOCK_LEN_BYTES == 0 510 | outBytes = outNumpy.tobytes() 511 | for i in range(0, outSize, BLOCK_LEN_BYTES): 512 | yield outBytes[i:i + BLOCK_LEN_BYTES] 513 | 514 | 515 | def mdpad_128_func(pwdLen, blockSize): 516 | llen = (pwdLen + 1 + 16) 517 | llen += (-llen) % blockSize 518 | return llen 519 | 520 | 521 | def mdpad_64_func(pwdLen, blockSize): 522 | # both parameters in bytes 523 | # length appended as a 64-bit integer 524 | llen = (pwdLen + 1 + 8) 525 | llen += (-llen) % blockSize 526 | return llen 527 | 528 | 529 | def concat(ll): 530 | return [obj for lval in ll for obj in lval] 531 | 532 | 533 | class opencl_algos: 534 | def __init__(self, platform, debug, write_combined_file, inv_memory_density=1, openclDevice = 0): 535 | if not debug: 536 | debug = 0 537 | self.opencl_ctx = opencl_interface(platform, debug, write_combined_file, openclDevice = openclDevice) 538 | self.platform_number = platform 539 | self.inv_memory_density = inv_memory_density 540 | self.max_out_bytes=0 541 | 542 | def cl_scrypt_init(self, N_value=15, forceAltKernel = None): 543 | # Initialise the openCL context & compile, with both debugging settings off 544 | debug = 0 545 | bufStructs = buffer_structs() 546 | if forceAltKernel: 547 | print("Loading Alternative sCrypt Kernel:", forceAltKernel) 548 | sprg = self.opencl_ctx.compile(bufStructs, forceAltKernel, None, N=N_value, invMemoryDensity=self.inv_memory_density) 549 | else: 550 | sprg=self.opencl_ctx.compile(bufStructs, "sCrypt.cl", None, N=N_value, invMemoryDensity=self.inv_memory_density) 551 | return [sprg,bufStructs] 552 | 553 | def cl_scrypt(self, ctx, passwords, N_value=15, r_value=3, p_value=1, desired_key_length=32, 554 | hex_salt=unhexlify("DEADBEEFDEADBEEFDEADBEEFDEADBEEF")): 555 | 556 | def get_dk_iter(p, salt, pwdIter, rtnPwds=None): 557 | # r fixed as 8 in the OpenCL 558 | r_val = 8 559 | blockSize = 128 * r_val 560 | for pwd in pwdIter: 561 | if rtnPwds is not None: 562 | rtnPwds.append(pwd) 563 | # Get derived key, then split into p chunks and yield 564 | dk = pbkdf2_hmac("sha256", pwd, salt, 1, dklen=blockSize * p) 565 | 566 | # Yield 567 | for i in range(p): 568 | yield dk[i * blockSize: (i + 1) * blockSize] 569 | 570 | sprg = ctx[0] 571 | 572 | # Our callback with the kernel name 573 | # Debugging: calls Salsa20 574 | def kernel_call(snprg, params): 575 | return snprg.ROMix(*params) # prg.ROMix(*params) 576 | 577 | # Derived key iter: yields p keys for each password 578 | passwordList = deque() 579 | dkIter = get_dk_iter(1 << p_value, hex_salt, passwords, passwordList) 580 | 581 | result = [] 582 | # 
Main call. 583 | group = [] 584 | result_append = result.append 585 | passwordList_popleft = passwordList.popleft 586 | for singleOutput in self.opencl_ctx.run_scrypt(sprg, kernel_call, dkIter): 587 | group.append(singleOutput) 588 | if len(group) == 1 << p_value: 589 | expensiveSalt = b"".join(group) 590 | commonPwd = passwordList_popleft() 591 | sCryptResult = pbkdf2_hmac("sha256", commonPwd, expensiveSalt, 1, desired_key_length) 592 | 593 | # For now print out for debugging 594 | # print("Password={}".format(commonPwd)) 595 | # print("sCrypt={}".format(hexlify(sCryptResult).decode().upper())) 596 | # result.append("{}".format(hexlify(sCryptResult).decode().lower())) 597 | result_append(sCryptResult) 598 | group = [] 599 | return result 600 | 601 | # def mdPadLenFunc(self, pwdLen): 602 | # l = (pwdLen + 1 + 8) 603 | # l += (64 - (l % 64)) % 64 604 | # return l 605 | 606 | def cl_sha512_init(self, option="", max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0): 607 | bufStructs = buffer_structs() 608 | bufStructs.specifySHA2(512, max_in_bytes, max_salt_bytes, dklen, max_ct_bytes) 609 | assert bufStructs.wordSize == 8 # set when you specify sha512 610 | prg = self.opencl_ctx.compile(bufStructs, 'sha512.cl', option) 611 | return [prg, bufStructs] 612 | 613 | def cl_sha512(self, ctx, passwordlist): 614 | # self.cl_sha512_init() 615 | prg = ctx[0] 616 | bufStructs = ctx[1] 617 | 618 | def func(s, pwdim, pass_g, salt_g, result_g): 619 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 620 | 621 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_128_func)) 622 | 623 | def cl_sha256_init(self, option="", max_in_bytes=128, max_salt_bytes=32, dklen=0, max_ct_bytes=0): 624 | bufStructs = buffer_structs() 625 | bufStructs.specifySHA2(256, max_in_bytes, max_salt_bytes, dklen, max_ct_bytes) 626 | assert bufStructs.wordSize == 4 # set when you specify sha256 627 | prg = self.opencl_ctx.compile(bufStructs, 'sha256.cl', option) 628 | return [prg, bufStructs] 629 | 630 | def cl_sha256(self, ctx, passwordlist): 631 | # self.cl_sha256_init() 632 | prg = ctx[0] 633 | bufStructs = ctx[1] 634 | 635 | def func(s, pwdim, pass_g, salt_g, result_g): 636 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 637 | 638 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 639 | 640 | def cl_md5_init(self, option=""): 641 | bufStructs = buffer_structs() 642 | bufStructs.specifyMD5() 643 | assert bufStructs.wordSize == 4 # set when you specify md5 644 | prg = self.opencl_ctx.compile(bufStructs, 'md5.cl', option) 645 | return [prg, bufStructs] 646 | 647 | def cl_md5(self, ctx, passwordlist): 648 | # self.cl_md5_init() 649 | prg = ctx[0] 650 | bufStructs = ctx[1] 651 | 652 | def func(s, pwdim, pass_g, salt_g, result_g): 653 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 654 | 655 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 656 | 657 | def cl_sha1_init(self, option=""): 658 | bufStructs = buffer_structs() 659 | bufStructs.specifySHA1() 660 | assert bufStructs.wordSize == 4 # set when you specify sha1 661 | prg = self.opencl_ctx.compile(bufStructs, 'sha1.cl', option) 662 | return [prg, bufStructs] 663 | 664 | def cl_sha1(self, ctx, passwordlist): 665 | # self.cl_sha1_init() 666 | prg = ctx[0] 667 | bufStructs = ctx[1] 668 | 669 | def func(s, pwdim, pass_g, salt_g, result_g): 670 | prg.hash_main(s.queue, pwdim, None, pass_g, result_g) 671 | 672 | return 
concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 673 | 674 | # =========================================================================================== 675 | 676 | def cl_hmac(self, ctx, passwordlist, salt): 677 | prg = ctx[0] 678 | bufStructs = ctx[1] 679 | 680 | def func(s, pwdim, pass_g, salt_g, result_g): 681 | prg.hmac_main(s.queue, pwdim, None, pass_g, salt_g, result_g) 682 | 683 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), salt)) 684 | 685 | def cl_md5_hmac(self, ctx, passwordlist, salt): 686 | # self.cl_md5_init("pbkdf2.cl") 687 | return self.cl_hmac(ctx, passwordlist, salt) 688 | 689 | def cl_sha1_hmac(self, ctx, passwordlist, salt): 690 | # self.cl_sha1_init("pbkdf2.cl") 691 | return self.cl_hmac(ctx, passwordlist, salt) 692 | 693 | def cl_sha256_hmac(self, ctx, passwordlist, salt): 694 | # self.cl_sha256_init("pbkdf2.cl") 695 | return self.cl_hmac(ctx, passwordlist, salt) 696 | 697 | def cl_sha512_hmac(self, ctx, passwordlist, salt): 698 | # self.cl_sha512_init("pbkdf2.cl") 699 | return self.cl_hmac(ctx, passwordlist, salt) 700 | 701 | # =========================================================================================== 702 | 703 | def cl_pbkdf2(self, ctx, passwordlist, salt, iters, dklen): 704 | prg = ctx[0] 705 | bufStructs = ctx[1] 706 | 707 | def func(s, pwdim, pass_g, salt_g, result_g): 708 | prg.pbkdf2(s.queue, pwdim, None, pass_g, salt_g, result_g, 709 | iters.to_bytes(4, 'little'), dklen.to_bytes(4, 'little')) # ! iters, dklen are always ints 710 | 711 | result = concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), salt)) 712 | if dklen != self.max_out_bytes: 713 | # We may have made more space for a multiple of the digest size 714 | result = [hexRes[:dklen] for hexRes in result] 715 | return result 716 | 717 | def cl_pbkdf2_init(self, rtype, saltlen, dklen): 718 | bufStructs = buffer_structs() 719 | if rtype == "md5": 720 | self.max_out_bytes = bufStructs.specifyMD5(128, saltlen, dklen) 721 | # hmac is defined in with pbkdf2, as a kernel function 722 | prg = self.opencl_ctx.compile(bufStructs, "md5.cl", "pbkdf2.cl") 723 | elif rtype == "sha1": 724 | if saltlen < 32 and dklen < 32: 725 | dklen=32 726 | self.max_out_bytes = bufStructs.specifySHA1(32, saltlen, dklen) 727 | prg = self.opencl_ctx.compile(bufStructs, "pbkdf2_sha1_32.cl", None) 728 | else: 729 | self.max_out_bytes = bufStructs.specifySHA1(128, saltlen, dklen) 730 | prg = self.opencl_ctx.compile(bufStructs, "sha1.cl", "pbkdf2.cl") 731 | elif rtype == "sha256": 732 | if saltlen <= 64 and dklen <= 64: 733 | dklen=64 734 | self.max_out_bytes = bufStructs.specifySHA2(256, 128, saltlen, dklen) 735 | if saltlen <= 64 and dklen <= 64: 736 | prg = self.opencl_ctx.compile(bufStructs, "pbkdf2_sha256_32.cl", None) 737 | else: 738 | prg = self.opencl_ctx.compile(bufStructs, "sha256.cl", "pbkdf2.cl") 739 | elif rtype == "sha512": 740 | self.max_out_bytes = bufStructs.specifySHA2(512, 256, saltlen, dklen) 741 | prg = self.opencl_ctx.compile(bufStructs, "sha512.cl", "pbkdf2.cl") 742 | else: 743 | assert ("Error on hash type, unknown !!!") 744 | return [prg, bufStructs] 745 | 746 | # =========================================================================================== 747 | 748 | def cl_pbkdf2_saltlist(self, ctx, password, saltlist, iters, dklen): 749 | prg = ctx[0] 750 | bufStructs = ctx[1] 751 | def func(s, pwdim, pass_g, salt_g, result_g): 752 | prg.pbkdf2_saltlist(s.queue, pwdim, None, pass_g, salt_g, result_g, 753 | 
(iters).to_bytes(4, 'little'), (dklen).to_bytes(4, 'little')) # ! iters, dklen are always ints 754 | 755 | result = concat(self.opencl_ctx.run_saltlist(bufStructs, func, iter(saltlist), password)) 756 | if dklen != self.max_out_bytes: 757 | # We may have made more space for a multiple of the digest size 758 | result = [hexRes[:dklen] for hexRes in result] 759 | return result 760 | 761 | def cl_pbkdf2_saltlist_init(self, type, pwdlen, dklen): 762 | bufStructs = buffer_structs() 763 | if type == "md5": 764 | self.max_out_bytes = bufStructs.specifyMD5(max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 765 | ## hmac is defined in with pbkdf2, as a kernel function 766 | prg=self.opencl_ctx.compile(bufStructs, "md5.cl", "pbkdf2.cl") 767 | elif type == "sha1": 768 | self.max_out_bytes = bufStructs.specifySHA1(max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 769 | ## hmac is defined in with pbkdf2, as a kernel function 770 | prg=self.opencl_ctx.compile(bufStructs, "sha1.cl", "pbkdf2.cl") 771 | elif type == "sha256": 772 | self.max_out_bytes = bufStructs.specifySHA2(hashDigestSize_bits=256, max_in_bytes=128, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 773 | prg=self.opencl_ctx.compile(bufStructs, "sha256.cl", "pbkdf2.cl") 774 | elif type == "sha512": 775 | self.max_out_bytes = bufStructs.specifySHA2(hashDigestSize_bits=512, max_in_bytes=256, max_salt_bytes=128, dklen=dklen, max_password_bytes=pwdlen) 776 | prg=self.opencl_ctx.compile(bufStructs, "sha512.cl", "pbkdf2.cl") 777 | else: 778 | assert ("Error on hash type, unknown !!!") 779 | return [prg, bufStructs] 780 | 781 | # =========================================================================================== 782 | 783 | def cl_hash_iterations(self, ctx, passwordlist, iters, hash_size): 784 | prg = ctx[0] 785 | bufStructs = ctx[1] 786 | def func(s, pwdim, pass_g, salt_g, result_g): 787 | prg.hash_iterations(s.queue, pwdim, None, pass_g, result_g, iters.to_bytes(4, 'little'), hash_size.to_bytes(4, 'little')) # ! iters are always ints 788 | 789 | return concat(self.opencl_ctx.run(bufStructs, func, iter(passwordlist), b"", mdpad_64_func)) 790 | 791 | def cl_hash_iterations_init(self, type): 792 | bufStructs = buffer_structs() 793 | if type == "md5": 794 | self.max_out_bytes = bufStructs.specifyMD5() 795 | ## hmac is defined in with pbkdf2, as a kernel function 796 | prg=self.opencl_ctx.compile(bufStructs, "md5.cl", "hash_iterations.cl") 797 | elif type == "sha1": 798 | self.max_out_bytes = bufStructs.specifySHA1() 799 | ## hmac is defined in with pbkdf2, as a kernel function 800 | prg=self.opencl_ctx.compile(bufStructs, "sha1.cl", "hash_iterations.cl") 801 | elif type == "sha256": 802 | self.max_out_bytes = bufStructs.specifySHA2() 803 | prg=self.opencl_ctx.compile(bufStructs, "sha256.cl", "hash_iterations.cl") 804 | elif type == "sha512": 805 | self.max_out_bytes = bufStructs.specifySHA2(512, 256, 0, 64) 806 | prg=self.opencl_ctx.compile(bufStructs, "sha512.cl", "hash_iterations.cl") 807 | else: 808 | assert ("Error on hash type, unknown !!!") 809 | return [prg, bufStructs] 810 | -------------------------------------------------------------------------------- /Library/opencl_information.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. 
Kerler 2017-2021 4 | # MIT License 5 | ''' 6 | Original copyright: 7 | Copyright by B.Kerler 2017, PBKDF1_SHA1 and SHA256 PyOpenCl implementation, max 32 chars for password + salt 8 | MIT License 9 | Implementation was confirmed to work with Intel OpenCL on Intel(R) HD Graphics 520 and Intel(R) Core(TM) i5-6200U CPU 10 | ''' 11 | ''' 12 | Refactored out of 'opencl.py' 13 | ''' 14 | 15 | import pyopencl as cl 16 | 17 | class opencl_information: 18 | def __init__(self): 19 | pass 20 | 21 | def printplatforms(self): 22 | for i,platformNum in enumerate(cl.get_platforms()): 23 | print('Platform %d - Name %s, Vendor %s' %(i,platformNum.name,platformNum.vendor)) 24 | 25 | def printfullinfo(self): 26 | print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices') 27 | for i,platformNum in enumerate(cl.get_platforms()): 28 | print('=' * 60) 29 | print('Platform %d - Name: ' %i + platformNum.name) 30 | print('Platform %d - Vendor: ' %i + platformNum.vendor) 31 | print('Platform %d - Version: ' %i + platformNum.version) 32 | print('Platform %d - Profile: ' %i + platformNum.profile) 33 | 34 | for device in platformNum.get_devices(): 35 | print(' ' + '-' * 56) 36 | print(' Device - Name: ' + device.name) 37 | print(' Device - Type: ' + cl.device_type.to_string(device.type)) 38 | print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency)) 39 | print(' Device - Compute Units: {0}'.format(device.max_compute_units)) 40 | print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size / 1024.0)) 41 | print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size / 1024.0)) 42 | print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size / 1073741824.0)) 43 | print(' Device - Max Buffer/Image Size: {0:.0f} MB'.format(device.max_mem_alloc_size / 1048576.0)) 44 | print(' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 45 | print('\n') -------------------------------------------------------------------------------- /Library/passwordutils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | from time import sleep 4 | from queue import Queue 5 | 6 | class passwordutils(threading.Thread): 7 | def __init__(self, stop, threadLock, passwords:Queue, totalthreads:int, minlen=4, maxlen=16): 8 | threading.Thread.__init__(self) 9 | self.minlen = minlen 10 | self.stop = stop 11 | self.maxlen = maxlen 12 | self.passwords = passwords 13 | self.threadLock = threadLock 14 | self.totalthreads = totalthreads 15 | # We start the password generator here as a thread 16 | 17 | def run(self): 18 | global threadLock 19 | try: 20 | while True: 21 | self.threadLock.acquire() 22 | buff = sys.stdin.readline() 23 | self.threadLock.release() 24 | if buff == b"\n": 25 | continue 26 | elif buff == b"": 27 | self.threadLock.acquire() 28 | self.stop() 29 | self.threadLock.release() 30 | while not self.passwords.empty(): 31 | sleep(1) 32 | return 33 | h = buff.rstrip() 34 | if self.maxlen < len(h) < self.minlen: 35 | continue 36 | self.threadLock.acquire() 37 | self.passwords.put(h) 38 | self.threadLock.release() 39 | while self.passwords.qsize() > self.totalthreads: 40 | if self.passwords.empty(): 41 | sleep(1) 42 | break 43 | sleep(0.02) 44 | 45 | except KeyboardInterrupt: 46 | sys.stdout.flush() 47 | pass 48 | return None 49 | -------------------------------------------------------------------------------- /Library/worker/generic/buffer_structs_template.cl: 
-------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | 18 | // All important now, defining whether we're working with unsigned ints or longs 19 | #define wordSize 20 | 21 | // Practical sizes of buffers, in words. 22 | #define inBufferSize ceilDiv(, wordSize) 23 | #define outBufferSize ceilDiv(, wordSize) 24 | #define pwdBufferSize ceilDiv(, wordSize) 25 | #define saltBufferSize ceilDiv(, wordSize) 26 | #define ctBufferSize ceilDiv(, wordSize) 27 | 28 | // 29 | #define hashBlockSize_bytes ceilDiv(, 8) /* Needs to be a multiple of 4, or 8 when we work with unsigned longs */ 30 | #define hashDigestSize_bytes ceilDiv(, 8) 31 | 32 | // just Size always implies _word 33 | #define hashBlockSize ceilDiv(hashBlockSize_bytes, wordSize) 34 | #define hashDigestSize ceilDiv(hashDigestSize_bytes, wordSize) 35 | 36 | 37 | // Ultimately hoping to faze out the Size_int32/long64, 38 | // in favour of just size (_word implied) 39 | #if wordSize == 4 40 | #define hashBlockSize_int32 hashBlockSize 41 | #define hashDigestSize_int32 hashDigestSize 42 | #define word unsigned int 43 | 44 | unsigned int SWAP (unsigned int val) 45 | { 46 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 47 | } 48 | 49 | #elif wordSize == 8 50 | // Initially for use in SHA-512 51 | #define hashBlockSize_long64 hashBlockSize 52 | #define hashDigestSize_long64 hashDigestSize 53 | #define word unsigned long 54 | #define rotl64(a,n) (rotate ((a), (n))) 55 | #define rotr64(a,n) (rotate ((a), (64ul-n))) 56 | 57 | unsigned long SWAP (const unsigned long val) 58 | { 59 | // ab cd ef gh -> gh ef cd ab using the 32 bit trick 60 | unsigned long tmp = (rotr64(val & 0x0000FFFF0000FFFFUL, 16UL) | rotl64(val & 0xFFFF0000FFFF0000UL, 16UL)); 61 | 62 | // Then see this as g- e- c- a- and -h -f -d -b to swap within the pairs, 63 | // gh ef cd ab -> hg fe dc ba 64 | return (rotr64(tmp & 0xFF00FF00FF00FF00UL, 8UL) | rotl64(tmp & 0x00FF00FF00FF00FFUL, 8UL)); 65 | } 66 | #endif 67 | 68 | 69 | 70 | // ==== Define the structs with the right word size ===== 71 | // Helpful & more cohesive to have the lengths of structures as words too, 72 | // (rather than unsigned int for both) 73 | typedef struct { 74 | word length; // in bytes 75 | word buffer[inBufferSize]; 76 | } inbuf; 77 | 78 | typedef struct { 79 | word buffer[outBufferSize]; 80 | } outbuf; 81 | 82 | // Salt buffer, used by pbkdf2 & pbe 83 | typedef struct { 84 | word length; // in bytes 85 | word buffer[saltBufferSize]; 86 | } saltbuf; 87 | 88 | // Password buffer, used by pbkdf2 & pbe 89 | typedef struct { 90 | word length; // in bytes 91 | word buffer[pwdBufferSize]; 92 | } pwdbuf; 93 | 94 | // ciphertext buffer, used in pbe. 
95 | // no code relating to this in the opencl.py core, dealt with in signal_pbe_mac.cl as it's a special case 96 | typedef struct { 97 | word length; // in bytes 98 | word buffer[ctBufferSize]; 99 | } ctbuf; 100 | 101 | 102 | 103 | 104 | // ========== Debugging function ============ 105 | 106 | #ifdef DEBUG 107 | #if DEBUG 108 | #define mod(x,y) ((x)-((x)/(y)*(y))) 109 | #define def_printFromWord(tag, funcName, end) \ 110 | /* For printing the string of bytes stored in an array of words. 111 | Option to print hex. */ \ 112 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 113 | { \ 114 | for (int j = 0; j < len_bytes; j++){ \ 115 | word v = arr[j / wordSize]; \ 116 | word r = mod(j,wordSize) * 8; \ 117 | /* Prints little endian, since that's what we use */ \ 118 | v = (v >> r) & 0xFF; \ 119 | if (hex) { \ 120 | printf("%02x", v); \ 121 | } else { \ 122 | printf("%c", (char)v); \ 123 | } \ 124 | } \ 125 | printf(end); \ 126 | } 127 | 128 | def_printFromWord(__private, printFromWord, "") 129 | def_printFromWord(__global, printFromWord_glbl, "") 130 | def_printFromWord(__private, printFromWord_n, "\n") 131 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 132 | 133 | #endif 134 | #endif -------------------------------------------------------------------------------- /Library/worker/generic/hash_iterations.cl: -------------------------------------------------------------------------------- 1 | // Extremely basic (but useful) script to perform a certain number of hashing iterations when used with a pre-existing 2 | // hasing library which is called via hash_main. (Useful for some cryptocurrency wallets which use custom key stretching) 3 | // 4 | // Generally speaking, this this function will take a hash (and maybe salted) password as the input, with this initial 5 | // hash happening in the calling application. This means that the input and output will always be the same size and that 6 | // we don't need to worry about padding, etc... 7 | // 8 | // Originally created for BTCRecover by Stephen Rothery, available at https://github.com/3rdIteration/btcrecover 9 | __kernel void hash_iterations(__global inbuf *inbuffer, __global outbuf *outbuffer, __private unsigned int iters, __private unsigned int hash_size) 10 | { 11 | unsigned int idx = get_global_id(0); 12 | 13 | // Iterate through and has the input as many times as required 14 | for (unsigned int j = 0; j < iters; j++){ 15 | hash_main(inbuffer, outbuffer); 16 | 17 | // Copy the output from the hash back in to the input... 18 | for (unsigned int i = 0; i < hash_size; i++){ 19 | inbuffer[idx].buffer[i] = outbuffer[idx].buffer[i]; 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /Library/worker/generic/hmac_qualcomm.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Qualcomm HMAC OpenCL Optimized kernel 3 | (c) B. 
Kerler 2018-2021 4 | MIT License 5 | */ 6 | 7 | /* 8 | pbkdf2 and HMAC implementation 9 | requires implementation of PRF (pseudo-random function), 10 | probably using HMAC and an implementation of hash_main 11 | */ 12 | /* 13 | REQ: outBuf.buffer must have space for ceil(dkLen / PRF_output_bytes) * PRF_output_bytes 14 | REQ: PRF implementation MUST allow that output may be the salt (m in hmac) 15 | inBuffer / pwdBuffer / the like are not const to allow for padding 16 | */ 17 | 18 | // Determine (statically) the actual required buffer size 19 | // Just allowing for MD padding: 64 bits for int, 1 for the 1-pad = 3 int32s. 20 | #define sizeForHash(reqSize) (ceilDiv((reqSize) + 2 + 1, hashBlockSize_int32) * hashBlockSize_int32) 21 | 22 | /* Swaps between little and big-endian*/ 23 | #define swapEndian(x) (rotate((x) & 0x00FF00FF, 24U) | rotate((x) & 0xFF00FF00, 8U)) 24 | 25 | __constant const unsigned int opad = 0x5c5c5c5c; 26 | __constant const unsigned int ipad = 0x36363636; 27 | __constant const unsigned int xoredPad = opad ^ ipad; 28 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 29 | #define m_buffer_size (saltBufferSize + 1) 30 | 31 | static void hmac(__global unsigned int *K, const unsigned int K_len_bytes, 32 | const unsigned int *m, const unsigned int m_len_bytes, unsigned int *output) 33 | { 34 | // REQ: If K_len_bytes isn't divisible by 4, final int should be clean (0s to the end) 35 | // REQ: s digestSize is a multiple of 4 bytes 36 | 37 | /* Declare the space for input to the last hash function: 38 | Compute and write K_ ^ opad to the first block of this. This will be the only place that we store K_ */ 39 | 40 | #define size_2 sizeForHash(hashBlockSize_int32 + hashDigestSize_int32) 41 | unsigned int input_2[size_2] = {0}; 42 | #undef size_2 43 | 44 | int end; 45 | if (K_len_bytes <= hashBlockSize_bytes) 46 | { 47 | end = (K_len_bytes + 3) / 4; 48 | // XOR with opad and slightly pad with zeros.. 49 | for (int j = 0; j < end; j++){ 50 | input_2[j] = K[j] ^ opad; 51 | } 52 | } else { 53 | end = hashDigestSize_int32; 54 | // Hash K to get K'. XOR with opad.. 55 | hash_glbl_to_priv(K, K_len_bytes, input_2); 56 | for (int j = 0; j < hashDigestSize_int32; j++){ 57 | input_2[j] ^= opad; 58 | } 59 | } 60 | // And if short, pad with 0s to the BLOCKsize, completing xor with opad 61 | for (int j = end; j < hashBlockSize_int32; j++){ 62 | input_2[j] = opad; 63 | } 64 | 65 | // Copy K' ^ ipad into the first block. 66 | // Be careful: hash needs a whole block after the end. ceilDiv from buffer_structs 67 | #define size_1 sizeForHash(hashBlockSize_int32 + m_buffer_size) 68 | 69 | // K' ^ ipad into the first block 70 | unsigned int input_1[size_1] = {0}; 71 | #undef size_1 72 | for (int j = 0; j < hashBlockSize_int32; j++){ 73 | input_1[j] = input_2[j]^xoredPad; 74 | } 75 | 76 | // Slightly inefficient copying m in.. 77 | int m_len_int32 = (m_len_bytes + 3) / 4; 78 | for (int j = 0; j < m_len_int32; j++){ 79 | input_1[hashBlockSize_int32 + j] = m[j]; 80 | } 81 | 82 | // Hash input1 into the second half of input2 83 | int leng = hashBlockSize_bytes + m_len_bytes; 84 | hash_private(input_1, leng, input_2 + hashBlockSize_int32); 85 | 86 | // Hash input2 into output! 87 | hash_private(input_2, hashBlockSize_bytes + hashDigestSize_bytes, output); 88 | } 89 | 90 | #undef sizeForHash 91 | 92 | // Might as well be very clean 93 | #undef swapEndian 94 | 95 | // Exposing HMAC in the same way. Useful for testing atleast. 
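/*
  Reference summary of the hmac() routine above (the standard RFC 2104 construction; this
  comment adds no functionality):
      HMAC(K, m) = H( (K' ^ opad) || H( (K' ^ ipad) || m ) )
  where K' is K zero-padded to the hash block size (or H(K), zero-padded, if K is longer than
  a block), opad is 0x5c repeated and ipad is 0x36 repeated. In the code above, input_1 holds
  (K' ^ ipad) || m and input_2 holds (K' ^ opad) || H(input_1), which is hashed into the output.
*/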
96 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 97 | { 98 | int counter=0; 99 | int i=0; 100 | int j=0; 101 | unsigned int idx = get_global_id(0); 102 | unsigned int pwdLen_bytes = inbuffer[idx].length; 103 | __global unsigned int *pwdBuffer = inbuffer[idx].buffer; 104 | 105 | // Copy salt just to cheer the compiler up 106 | int saltLen_bytes = saltbuffer[0].length; 107 | int saltLen_int32 = ceilDiv(saltLen_bytes, 4); 108 | unsigned int personal_salt[saltBufferSize] = {0}; 109 | 110 | for (j = 0; j < saltLen_int32; j++){ 111 | personal_salt[j] = saltbuffer[0].buffer[j]; 112 | } 113 | 114 | // Call hmac, with local 115 | unsigned int out[hashDigestSize_int32]; 116 | 117 | unsigned int V[hashDigestSize_int32]={0}; 118 | for (counter=0;counter<10000;counter++) 119 | { 120 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 121 | for (j=0;j> (8*(wordSize-4)); 117 | if (overhang>0) 118 | { 119 | salt[saltLastI] |= be_callI << overhang; 120 | salt[saltLastI+1] = be_callI >> ((8*wordSize)-overhang); 121 | } 122 | else 123 | { 124 | salt[saltLastI]=be_callI; 125 | } 126 | 127 | // Make initial call, copy into output 128 | // This copy is avoidable, but only with __global / __private macro stuff 129 | word u[PRF_output_size] = {0}; 130 | // +4 is correct even for 64 bit 131 | PRF(pwd, pwdLen_bytes, salt, saltLen_bytes + 4, u); 132 | for (unsigned int j = 0; j < PRF_output_size; j++){ 133 | output[j] = u[j]; 134 | } 135 | 136 | #define xor(x,acc) \ 137 | /* xors PRF output x onto acc*/ \ 138 | { \ 139 | for (int k = 0; k < PRF_output_size; k++){ \ 140 | acc[k] ^= x[k]; \ 141 | } \ 142 | } 143 | 144 | // Perform all the iterations, reading salt from- AND writing to- u. 145 | for (unsigned int j = 1; j < iters; j++){ 146 | PRF(pwd, pwdLen_bytes, u, PRF_output_bytes, u); 147 | xor(u,output); 148 | } 149 | } 150 | 151 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 152 | __private unsigned int iters, __private unsigned int dkLen_bytes) 153 | { 154 | 155 | unsigned int idx = get_global_id(0); 156 | word pwdLen_bytes = inbuffer[idx].length; 157 | __global word *pwdBuffer = inbuffer[idx].buffer; 158 | __global word *currOutBuffer = outbuffer[idx].buffer; 159 | 160 | // Copy salt so that we can write our integer into the last 4 bytes 161 | word saltLen_bytes = saltbuffer[0].length; 162 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 163 | word personal_salt[saltBufferSize+2] = {0}; 164 | 165 | for (int j = 0; j < saltLen; j++){ 166 | personal_salt[j] = saltbuffer[0].buffer[j]; 167 | } 168 | 169 | // Determine the number of calls to F that we need to make 170 | unsigned int nBlocks = ceilDiv(dkLen_bytes, PRF_output_bytes); 171 | for (unsigned int j = 1; j <= nBlocks; j++) 172 | { 173 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, j, currOutBuffer); 174 | currOutBuffer += PRF_output_size; 175 | } 176 | } 177 | 178 | 179 | // Exposing HMAC in the same way. Useful for testing atleast. 
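/*
  Summary of the PBKDF2 scheme implemented by F() and the pbkdf2 kernel above (RFC 2898;
  informational comment only):
      U_1 = PRF(P, S || INT_BE32(i)),   U_j = PRF(P, U_{j-1})
      T_i = U_1 ^ U_2 ^ ... ^ U_iters
      DK  = T_1 || T_2 || ... || T_n,   n = ceil(dkLen / PRF_output_bytes)
  with PRF = HMAC over the underlying hash; the host side (Library/opencl.py) trims the final
  output back down to the requested dklen when it is not a multiple of the digest size.
*/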
180 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 181 | { 182 | unsigned int idx = get_global_id(0); 183 | word pwdLen_bytes = inbuffer[idx].length; 184 | __global word *pwdBuffer = inbuffer[idx].buffer; 185 | 186 | // Copy salt just to cheer the compiler up 187 | int saltLen_bytes = (int)saltbuffer[0].length; 188 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 189 | word personal_salt[saltBufferSize] = {0}; 190 | 191 | for (int j = 0; j < saltLen; j++){ 192 | personal_salt[j] = saltbuffer[0].buffer[j]; 193 | } 194 | 195 | // Call hmac, with local 196 | word out[hashDigestSize]; 197 | 198 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 199 | 200 | for (int j = 0; j < hashDigestSize; j++){ 201 | outbuffer[idx].buffer[j] = out[j]; 202 | } 203 | } 204 | 205 | // A modified version of the pbkdf2 kernel that allows you to use these kernels in a situation where you have a password 206 | // and are attempting to brute-force the salt. (So this kernel takes a single password and an array of salts 207 | // 208 | // Originally created for BTCRecover by Stephen Rothery, available at https://github.com/3rdIteration/btcrecover 209 | // MIT License 210 | 211 | __kernel void pbkdf2_saltlist(__global const pwdbuf *pwdbuffer_arg, __global inbuf *inbuffer, __global outbuf *outbuffer, 212 | __private unsigned int iters, __private unsigned int dkLen_bytes) 213 | { 214 | 215 | unsigned int idx = get_global_id(0); 216 | word pwdLen_bytes = pwdbuffer_arg[0].length; 217 | __global word *pwdBuffer = pwdbuffer_arg[0].buffer; 218 | __global word *currOutBuffer = outbuffer[idx].buffer; 219 | 220 | // Copy salt so that we can write our integer into the last 4 bytes 221 | word saltLen_bytes = inbuffer[idx].length; 222 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 223 | word personal_salt[saltBufferSize+2] = {0}; 224 | 225 | 226 | for (int j = 0; j < saltLen; j++){ 227 | personal_salt[j] = inbuffer[idx].buffer[j]; 228 | } 229 | 230 | // Determine the number of calls to F that we need to make 231 | unsigned int nBlocks = ceilDiv(dkLen_bytes, PRF_output_bytes); 232 | for (unsigned int j = 1; j <= nBlocks; j++) 233 | { 234 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, iters, j, currOutBuffer); 235 | currOutBuffer += PRF_output_size; 236 | } 237 | } 238 | 239 | -------------------------------------------------------------------------------- /Library/worker/generic/pbkdf2_sha1_32.cl: -------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | 18 | // All important now, defining whether we're working with unsigned ints or longs 19 | #define wordSize 4 20 | 21 | // Practical sizes of buffers, in words. 
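// (Concretely, in this fixed-size SHA1 variant: a 128-byte password buffer and a
//  40-byte output buffer, i.e. room for two 20-byte SHA1 blocks of derived key.)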
22 | #define inBufferSize ceilDiv(128, wordSize) 23 | #define outBufferSize ceilDiv(40, wordSize) 24 | #define saltBufferSize ceilDiv(8, wordSize) 25 | #define ctBufferSize ceilDiv(0, wordSize) 26 | 27 | // 28 | #define hashBlockSize_bytes ceilDiv(512, 8) /* Needs to be a multiple of 4, or 8 when we work with unsigned longs */ 29 | #define hashDigestSize_bytes ceilDiv(160, 8) 30 | 31 | // just Size always implies _word 32 | #define hashBlockSize ceilDiv(hashBlockSize_bytes, wordSize) 33 | #define hashDigestSize ceilDiv(hashDigestSize_bytes, wordSize) 34 | 35 | 36 | // Ultimately hoping to faze out the Size_int32/long64, 37 | // in favour of just size (_word implied) 38 | #if wordSize == 4 39 | #define hashBlockSize_int32 hashBlockSize 40 | #define hashDigestSize_int32 hashDigestSize 41 | #define word unsigned int 42 | 43 | unsigned int SWAP (unsigned int val) 44 | { 45 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 46 | } 47 | 48 | #elif wordSize == 8 49 | // Initially for use in SHA-512 50 | #define hashBlockSize_long64 hashBlockSize 51 | #define hashDigestSize_long64 hashDigestSize 52 | #define word unsigned long 53 | #define rotl64(a,n) (rotate ((a), (n))) 54 | #define rotr64(a,n) (rotate ((a), (64ul-n))) 55 | 56 | unsigned long SWAP (const unsigned long val) 57 | { 58 | // ab cd ef gh -> gh ef cd ab using the 32 bit trick 59 | unsigned long tmp = (rotr64(val & 0x0000FFFF0000FFFFUL, 16UL) | rotl64(val & 0xFFFF0000FFFF0000UL, 16UL)); 60 | 61 | // Then see this as g- e- c- a- and -h -f -d -b to swap within the pairs, 62 | // gh ef cd ab -> hg fe dc ba 63 | return (rotr64(tmp & 0xFF00FF00FF00FF00UL, 8UL) | rotl64(tmp & 0x00FF00FF00FF00FFUL, 8UL)); 64 | } 65 | #endif 66 | 67 | 68 | 69 | // ==== Define the structs with the right word size ===== 70 | // Helpful & more cohesive to have the lengths of structures as words too, 71 | // (rather than unsigned int for both) 72 | typedef struct { 73 | word length; // in bytes 74 | word buffer[inBufferSize]; 75 | } inbuf; 76 | 77 | typedef struct { 78 | word buffer[outBufferSize]; 79 | } outbuf; 80 | 81 | // Salt buffer, used by pbkdf2 & pbe 82 | typedef struct { 83 | word length; // in bytes 84 | word buffer[saltBufferSize]; 85 | } saltbuf; 86 | 87 | // ciphertext buffer, used in pbe. 88 | // no code relating to this in the opencl.py core, dealt with in signal_pbe_mac.cl as it's a special case 89 | typedef struct { 90 | word length; // in bytes 91 | word buffer[ctBufferSize]; 92 | } ctbuf; 93 | 94 | 95 | 96 | 97 | // ========== Debugging function ============ 98 | 99 | #ifdef DEBUG 100 | #if DEBUG 101 | 102 | #define def_printFromWord(tag, funcName, end) \ 103 | /* For printing the string of bytes stored in an array of words. 104 | Option to print hex. 
*/ \ 105 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 106 | { \ 107 | for (int j = 0; j < len_bytes; j++){ \ 108 | word v = arr[j / wordSize]; \ 109 | word r = mod(j,wordSize) * 8; \ 110 | /* Prints little endian, since that's what we use */ \ 111 | v = (v >> r) & 0xFF; \ 112 | if (hex) { \ 113 | printf("%02x", v); \ 114 | } else { \ 115 | printf("%c", (char)v); \ 116 | } \ 117 | } \ 118 | printf(end); \ 119 | } 120 | 121 | def_printFromWord(__private, printFromWord, "") 122 | def_printFromWord(__global, printFromWord_glbl, "") 123 | def_printFromWord(__private, printFromWord_n, "\n") 124 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 125 | 126 | #endif 127 | #endif/* 128 | PBKDF2 SHA1 OpenCL Optimized kernel, limited to max. 32 chars for salt and password 129 | (c) B. Kerler 2017 130 | MIT License 131 | */ 132 | 133 | #define rotl32(a,n) rotate ((a), (n)) 134 | 135 | #define mod(x,y) x-(x/y*y) 136 | 137 | #define F2(x,y,z) ((x) ^ (y) ^ (z)) 138 | #define F1(x,y,z) (bitselect(z,y,x)) 139 | #define F0(x,y,z) (bitselect (x, y, (x ^ z))) 140 | 141 | #define SHA1M_A 0x67452301u 142 | #define SHA1M_B 0xefcdab89u 143 | #define SHA1M_C 0x98badcfeu 144 | #define SHA1M_D 0x10325476u 145 | #define SHA1M_E 0xc3d2e1f0u 146 | 147 | #define SHA1C00 0x5a827999u 148 | #define SHA1C01 0x6ed9eba1u 149 | #define SHA1C02 0x8f1bbcdcu 150 | #define SHA1C03 0xca62c1d6u 151 | 152 | #define SHA1_STEP(f,a,b,c,d,e,x) \ 153 | { \ 154 | e += K; \ 155 | e += x; \ 156 | e += f (b, c, d); \ 157 | e += rotl32 (a, 5u); \ 158 | b = rotl32 (b, 30u); \ 159 | } 160 | 161 | static void sha1_process2 (const unsigned int *W, unsigned int *digest) 162 | { 163 | unsigned int A = digest[0]; 164 | unsigned int B = digest[1]; 165 | unsigned int C = digest[2]; 166 | unsigned int D = digest[3]; 167 | unsigned int E = digest[4]; 168 | 169 | unsigned int w0_t = W[0]; 170 | unsigned int w1_t = W[1]; 171 | unsigned int w2_t = W[2]; 172 | unsigned int w3_t = W[3]; 173 | unsigned int w4_t = W[4]; 174 | unsigned int w5_t = W[5]; 175 | unsigned int w6_t = W[6]; 176 | unsigned int w7_t = W[7]; 177 | unsigned int w8_t = W[8]; 178 | unsigned int w9_t = W[9]; 179 | unsigned int wa_t = W[10]; 180 | unsigned int wb_t = W[11]; 181 | unsigned int wc_t = W[12]; 182 | unsigned int wd_t = W[13]; 183 | unsigned int we_t = W[14]; 184 | unsigned int wf_t = W[15]; 185 | 186 | #undef K 187 | #define K SHA1C00 188 | 189 | SHA1_STEP (F1, A, B, C, D, E, w0_t); 190 | SHA1_STEP (F1, E, A, B, C, D, w1_t); 191 | SHA1_STEP (F1, D, E, A, B, C, w2_t); 192 | SHA1_STEP (F1, C, D, E, A, B, w3_t); 193 | SHA1_STEP (F1, B, C, D, E, A, w4_t); 194 | SHA1_STEP (F1, A, B, C, D, E, w5_t); 195 | SHA1_STEP (F1, E, A, B, C, D, w6_t); 196 | SHA1_STEP (F1, D, E, A, B, C, w7_t); 197 | SHA1_STEP (F1, C, D, E, A, B, w8_t); 198 | SHA1_STEP (F1, B, C, D, E, A, w9_t); 199 | SHA1_STEP (F1, A, B, C, D, E, wa_t); 200 | SHA1_STEP (F1, E, A, B, C, D, wb_t); 201 | SHA1_STEP (F1, D, E, A, B, C, wc_t); 202 | SHA1_STEP (F1, C, D, E, A, B, wd_t); 203 | SHA1_STEP (F1, B, C, D, E, A, we_t); 204 | SHA1_STEP (F1, A, B, C, D, E, wf_t); 205 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F1, E, A, B, C, D, w0_t); 206 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F1, D, E, A, B, C, w1_t); 207 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F1, C, D, E, A, B, w2_t); 208 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F1, B, C, D, E, A, w3_t); 209 | 210 | #undef K 211 | #define K SHA1C01 212 | 213 
| w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w4_t); 214 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w5_t); 215 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w6_t); 216 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w7_t); 217 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w8_t); 218 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w9_t); 219 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wa_t); 220 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wb_t); 221 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wc_t); 222 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wd_t); 223 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, A, B, C, D, E, we_t); 224 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wf_t); 225 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w0_t); 226 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w1_t); 227 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w2_t); 228 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w3_t); 229 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w4_t); 230 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w5_t); 231 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w6_t); 232 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w7_t); 233 | 234 | #undef K 235 | #define K SHA1C02 236 | 237 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w8_t); 238 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w9_t); 239 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wa_t); 240 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wb_t); 241 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wc_t); 242 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F0, A, B, C, D, E, wd_t); 243 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F0, E, A, B, C, D, we_t); 244 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wf_t); 245 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w0_t); 246 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w1_t); 247 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w2_t); 248 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w3_t); 249 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w4_t); 250 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w5_t); 251 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w6_t); 252 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w7_t); 253 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w8_t); 254 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w9_t); 255 | wa_t 
= rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wa_t); 256 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wb_t); 257 | 258 | #undef K 259 | #define K SHA1C03 260 | 261 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wc_t); 262 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wd_t); 263 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, D, E, A, B, C, we_t); 264 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wf_t); 265 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w0_t); 266 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w1_t); 267 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w2_t); 268 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w3_t); 269 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w4_t); 270 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w5_t); 271 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w6_t); 272 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w7_t); 273 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w8_t); 274 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w9_t); 275 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wa_t); 276 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wb_t); 277 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wc_t); 278 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wd_t); 279 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, C, D, E, A, B, we_t); 280 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wf_t); 281 | 282 | digest[0] += A; 283 | digest[1] += B; 284 | digest[2] += C; 285 | digest[3] += D; 286 | digest[4] += E; 287 | } 288 | 289 | static void F(__global const unsigned int *pass, const unsigned int pass_len, unsigned int *salt, const unsigned int salt_len, const unsigned int iter, __global unsigned int* hash, unsigned int hash_len) 290 | { 291 | int plen=pass_len/4; 292 | if (mod(pass_len,4)) plen++; 293 | 294 | int slen=salt_len/4; 295 | if (mod(salt_len,4)) slen++; 296 | 297 | __global unsigned int* p = hash; 298 | 299 | unsigned int ipad[16]; 300 | ipad[0x0]=0x36363636; 301 | ipad[0x1]=0x36363636; 302 | ipad[0x2]=0x36363636; 303 | ipad[0x3]=0x36363636; 304 | ipad[0x4]=0x36363636; 305 | ipad[0x5]=0x36363636; 306 | ipad[0x6]=0x36363636; 307 | ipad[0x7]=0x36363636; 308 | ipad[0x8]=0x36363636; 309 | ipad[0x9]=0x36363636; 310 | ipad[0xA]=0x36363636; 311 | ipad[0xB]=0x36363636; 312 | ipad[0xC]=0x36363636; 313 | ipad[0xD]=0x36363636; 314 | ipad[0xE]=0x36363636; 315 | ipad[0xF]=0x36363636; 316 | 317 | unsigned int opad[16]; 318 | opad[0x0]=0x5C5C5C5C; 319 | opad[0x1]=0x5C5C5C5C; 320 | opad[0x2]=0x5C5C5C5C; 321 | opad[0x3]=0x5C5C5C5C; 322 | opad[0x4]=0x5C5C5C5C; 323 | opad[0x5]=0x5C5C5C5C; 324 | opad[0x6]=0x5C5C5C5C; 325 | opad[0x7]=0x5C5C5C5C; 326 | opad[0x8]=0x5C5C5C5C; 327 | opad[0x9]=0x5C5C5C5C; 328 | opad[0xA]=0x5C5C5C5C; 329 | opad[0xB]=0x5C5C5C5C; 330 | opad[0xC]=0x5C5C5C5C; 331 | opad[0xD]=0x5C5C5C5C; 332 | opad[0xE]=0x5C5C5C5C; 333 | opad[0xF]=0x5C5C5C5C; 
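    // The password is xored into both pads next, and the two half-finished SHA1
    // states (stateipad / stateopad below) are computed once up front, so each
    // iteration of the main loop only needs two sha1_process2() calls instead of
    // re-hashing the padded key blocks every time.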
334 | 335 | for (int m=0;msha256_update(state,W,ilenor,wposr,ipad,0x40); 350 | unsigned int W[0x10]={0}; 351 | W[0]=ipad[0]; 352 | W[1]=ipad[1]; 353 | W[2]=ipad[2]; 354 | W[3]=ipad[3]; 355 | W[4]=ipad[4]; 356 | W[5]=ipad[5]; 357 | W[6]=ipad[6]; 358 | W[7]=ipad[7]; 359 | W[8]=ipad[8]; 360 | W[9]=ipad[9]; 361 | W[10]=ipad[10]; 362 | W[11]=ipad[11]; 363 | W[12]=ipad[12]; 364 | W[13]=ipad[13]; 365 | W[14]=ipad[14]; 366 | W[15]=ipad[15]; 367 | sha1_process2(W,stateipad); 368 | 369 | // precompute ipad 370 | unsigned int stateopad[5]={0}; 371 | stateopad[0] = 0x67452301; 372 | stateopad[1] = 0xefcdab89; 373 | stateopad[2] = 0x98badcfe; 374 | stateopad[3] = 0x10325476; 375 | stateopad[4] = 0xc3d2e1f0; 376 | 377 | //->sha1_update(state,W,ilenor,wposr,ipad,0x40); 378 | W[0]=opad[0]; 379 | W[1]=opad[1]; 380 | W[2]=opad[2]; 381 | W[3]=opad[3]; 382 | W[4]=opad[4]; 383 | W[5]=opad[5]; 384 | W[6]=opad[6]; 385 | W[7]=opad[7]; 386 | W[8]=opad[8]; 387 | W[9]=opad[9]; 388 | W[10]=opad[10]; 389 | W[11]=opad[11]; 390 | W[12]=opad[12]; 391 | W[13]=opad[13]; 392 | W[14]=opad[14]; 393 | W[15]=opad[15]; 394 | sha1_process2(W,stateopad); 395 | 396 | unsigned int counter = 1; 397 | unsigned int state[5]={0}; 398 | 399 | unsigned int tkeylen=hash_len; 400 | unsigned int cplen=0; 401 | while(tkeylen>0) 402 | { 403 | if(tkeylen > 20) cplen = 20; 404 | else cplen=tkeylen; 405 | 406 | //hmac_sha1_init(state,W,ileno,wpos,ipad,opad,pwd); 407 | //->sha1_init(state,W,ileno,wpos); 408 | //->sha1_update(state,W,ileno,wpos,ipad,0x40); 409 | state[0] = stateipad[0]; 410 | state[1] = stateipad[1]; 411 | state[2] = stateipad[2]; 412 | state[3] = stateipad[3]; 413 | state[4] = stateipad[4]; 414 | //hmac_sha1_update(state,W,ileno,wpos,ipad,opad,salt,salt_len); 415 | //->sha1_update(state,W,ileno,wpos,salt,salt_len); 416 | //hmac_sha1_update(state,W,ileno,wpos,ipad,opad,itmp,4); 417 | //->sha1_update(state,W,ileno,wpos,itmp,4); 418 | W[0]=0; 419 | W[1]=0; 420 | W[2]=0; 421 | W[3]=0; 422 | W[4]=0; 423 | W[5]=0; 424 | W[6]=0; 425 | W[7]=0; 426 | W[8]=0; 427 | W[9]=0; 428 | W[10]=0; 429 | W[11]=0; 430 | W[12]=0; 431 | W[13]=0; 432 | W[14]=0; 433 | for (int m=0;msha1_finish(state,W,ileno,&opad[0x10]); 448 | sha1_process2(W,state); 449 | 450 | //sha1(opad,0x54,digtmp); 451 | //->sha1_init(state,W,ileno,wpos); 452 | //->sha1_update(state,W,ileno,wpos,opad,0x54); 453 | //->sha1_finish(state,W,ileno,digtmp); 454 | 455 | W[0]=state[0]; 456 | W[1]=state[1]; 457 | W[2]=state[2]; 458 | W[3]=state[3]; 459 | W[4]=state[4]; 460 | W[5]=0x80000000; 461 | W[6]=0x0; 462 | W[7]=0x0; 463 | W[8]=0x0; 464 | W[9]=0; 465 | W[10]=0; 466 | W[11]=0; 467 | W[12]=0; 468 | W[13]=0; 469 | W[14]=0; 470 | W[15]=0x54*8; 471 | 472 | state[0]=stateopad[0]; 473 | state[1]=stateopad[1]; 474 | state[2]=stateopad[2]; 475 | state[3]=stateopad[3]; 476 | state[4]=stateopad[4]; 477 | 478 | //sha256_finish(state,W,ileno,digtmp); 479 | sha1_process2(W,state); 480 | 481 | p[0]=W[0]=state[0]; 482 | p[1]=W[1]=state[1]; 483 | p[2]=W[2]=state[2]; 484 | p[3]=W[3]=state[3]; 485 | p[4]=W[4]=state[4]; 486 | 487 | for(int j = 1; j < iter; j++) 488 | { 489 | //hmac_sha1(pwd,digtmp,32,digtmp); 490 | //->sha1_init(state,W,ilenor,wposr); 491 | //->sha1_update(state,W,ilenor,wposr,digtmp,32); 492 | //->sha1_finish(state,W,ileno,&opad[0x10]); 493 | 494 | W[5]=0x80000000; //Padding 495 | W[6]=0; 496 | W[7]=0; 497 | W[8]=0; 498 | W[9]=0; 499 | W[10]=0; 500 | W[11]=0; 501 | W[12]=0; 502 | W[13]=0; 503 | W[14]=0; 504 | W[15]=0x54*8; 505 | state[0] = stateipad[0]; 506 | state[1] = stateipad[1]; 507 
| state[2] = stateipad[2]; 508 | state[3] = stateipad[3]; 509 | state[4] = stateipad[4]; 510 | sha1_process2(W,state); 511 | 512 | unsigned int M[0x10]={0}; 513 | M[0]=state[0]; 514 | M[1]=state[1]; 515 | M[2]=state[2]; 516 | M[3]=state[3]; 517 | M[4]=state[4]; 518 | M[5]=0x80000000; //Padding 519 | M[6]=0; 520 | M[7]=0; 521 | M[8]=0; 522 | M[9]=0; 523 | M[10]=0; 524 | M[11]=0; 525 | M[12]=0; 526 | M[13]=0; 527 | M[14]=0; 528 | M[15]=0x54*8; 529 | 530 | //->sha1_init(state,W,ilenor,wposr); 531 | //->sha1_update(state,W,ilenor,wposr,opad,0x60); 532 | state[0] = stateopad[0]; 533 | state[1] = stateopad[1]; 534 | state[2] = stateopad[2]; 535 | state[3] = stateopad[3]; 536 | state[4] = stateopad[4]; 537 | 538 | //->sha1_finish(state,W,ilenor,digtmp); 539 | sha1_process2(M,state); 540 | 541 | W[0]=state[0]; 542 | W[1]=state[1]; 543 | W[2]=state[2]; 544 | W[3]=state[3]; 545 | W[4]=state[4]; 546 | 547 | p[0] ^= state[0]; 548 | p[1] ^= state[1]; 549 | p[2] ^= state[2]; 550 | p[3] ^= state[3]; 551 | p[4] ^= state[4]; 552 | } 553 | 554 | p[0]=SWAP(p[0]); 555 | p[1]=SWAP(p[1]); 556 | p[2]=SWAP(p[2]); 557 | p[3]=SWAP(p[3]); 558 | p[4]=SWAP(p[4]); 559 | 560 | tkeylen-= cplen; 561 | counter++; 562 | p+= cplen/4; 563 | } 564 | return; 565 | } 566 | 567 | 568 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 569 | __private unsigned int iters, __private unsigned int dkLen_bytes) 570 | { 571 | 572 | unsigned int idx = get_global_id(0); 573 | word pwdLen_bytes = inbuffer[idx].length; 574 | __global word *pwdBuffer = inbuffer[idx].buffer; 575 | __global word *currOutBuffer = outbuffer[idx].buffer; 576 | 577 | // Copy salt so that we can write our integer into the last 4 bytes 578 | word personal_salt[32/4] = {0}; 579 | personal_salt[0] = saltbuffer[0].buffer[0]; 580 | personal_salt[1] = saltbuffer[0].buffer[1]; 581 | personal_salt[2] = saltbuffer[0].buffer[2]; 582 | personal_salt[3] = saltbuffer[0].buffer[3]; 583 | personal_salt[4] = saltbuffer[0].buffer[4]; 584 | personal_salt[5] = saltbuffer[0].buffer[5]; 585 | personal_salt[6] = saltbuffer[0].buffer[6]; 586 | personal_salt[7] = saltbuffer[0].buffer[7]; 587 | 588 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, currOutBuffer,32); 589 | } 590 | -------------------------------------------------------------------------------- /Library/worker/generic/pbkdf2_sha256_32.cl: -------------------------------------------------------------------------------- 1 | /* 2 | In- and out- buffer structures (of int32), with variable sizes, for hashing. 3 | These allow indexing just using just get_global_id(0) 4 | Variables tagged with <..> are replaced, so we can specify just enough room for the data. 5 | These are: 6 | - hashBlockSize_bits : The hash's block size in Bits 7 | - inMaxNumBlocks : per hash operation 8 | - hashDigestSize_bits : The hash's digest size in Bits 9 | 10 | Originally adapted from Bjorn Kerler's sha256.cl 11 | MIT License 12 | */ 13 | #define DEBUG 1 14 | 15 | // All macros left defined for usage in the program 16 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 17 | // All important now, defining whether we're working with unsigned ints or longs 18 | #define wordSize 4 19 | 20 | // Practical sizes of buffers, in words. 
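// (Concretely, in this fixed-size SHA256 variant: a 128-byte password buffer, a
//  16-word / 64-byte output buffer - up to two SHA256 blocks of derived key - and
//  an 8-word / 32-byte salt buffer.)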
21 | #define inBufferSize ceilDiv(128, wordSize) 22 | 23 | 24 | // Ultimately hoping to faze out the Size_int32/long64, 25 | // in favour of just size (_word implied) 26 | #define word unsigned int 27 | 28 | unsigned int SWAP (unsigned int val) 29 | { 30 | return (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); 31 | } 32 | 33 | // ==== Define the structs with the right word size ===== 34 | // Helpful & more cohesive to have the lengths of structures as words too, 35 | // (rather than unsigned int for both) 36 | typedef struct { 37 | word length; // in bytes 38 | word buffer[inBufferSize]; 39 | } inbuf; 40 | 41 | typedef struct { 42 | word buffer[16]; 43 | } outbuf; 44 | 45 | // Salt buffer, used by pbkdf2 & pbe 46 | typedef struct { 47 | word length; // in bytes 48 | word buffer[8]; 49 | } saltbuf; 50 | 51 | 52 | // ========== Debugging function ============ 53 | 54 | #ifdef DEBUG 55 | #if DEBUG 56 | 57 | #define def_printFromWord(tag, funcName, end) \ 58 | /* For printing the string of bytes stored in an array of words. 59 | Option to print hex. */ \ 60 | static void funcName(tag const word *arr, const unsigned int len_bytes, const bool hex)\ 61 | { \ 62 | for (int j = 0; j < len_bytes; j++){ \ 63 | word v = arr[j / wordSize]; \ 64 | word r = (j % wordSize) * 8; \ 65 | /* Prints little endian, since that's what we use */ \ 66 | v = (v >> r) & 0xFF; \ 67 | if (hex) { \ 68 | printf("%02x", v); \ 69 | } else { \ 70 | printf("%c", (char)v); \ 71 | } \ 72 | } \ 73 | printf(end); \ 74 | } 75 | 76 | def_printFromWord(__private, printFromWord, "") 77 | def_printFromWord(__global, printFromWord_glbl, "") 78 | def_printFromWord(__private, printFromWord_n, "\n") 79 | def_printFromWord(__global, printFromWord_glbl_n, "\n") 80 | 81 | #endif 82 | #endif/* 83 | Original: 84 | SHA1 OpenCL Optimized kernel 85 | (c) B. 
Kerler 2018 86 | MIT License 87 | */ 88 | 89 | /* 90 | (small) Changes: 91 | outbuf and inbuf structs defined using the buffer_structs_template 92 | func_sha256 renamed to hash_main 93 | */ 94 | 95 | #define F1(x,y,z) (bitselect(z,y,x)) 96 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 97 | #define mod(x,y) ((x)-((x)/(y)*(y))) 98 | #define shr32(x,n) ((x) >> (n)) 99 | #define rotl32(a,n) rotate ((a), (n)) 100 | 101 | #define S0(x) (rotl32 ((x), 25u) ^ rotl32 ((x), 14u) ^ shr32 ((x), 3u)) 102 | #define S1(x) (rotl32 ((x), 15u) ^ rotl32 ((x), 13u) ^ shr32 ((x), 10u)) 103 | #define S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) 104 | #define S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u)) 105 | 106 | #define SHA256C00 0x428a2f98u 107 | #define SHA256C01 0x71374491u 108 | #define SHA256C02 0xb5c0fbcfu 109 | #define SHA256C03 0xe9b5dba5u 110 | #define SHA256C04 0x3956c25bu 111 | #define SHA256C05 0x59f111f1u 112 | #define SHA256C06 0x923f82a4u 113 | #define SHA256C07 0xab1c5ed5u 114 | #define SHA256C08 0xd807aa98u 115 | #define SHA256C09 0x12835b01u 116 | #define SHA256C0a 0x243185beu 117 | #define SHA256C0b 0x550c7dc3u 118 | #define SHA256C0c 0x72be5d74u 119 | #define SHA256C0d 0x80deb1feu 120 | #define SHA256C0e 0x9bdc06a7u 121 | #define SHA256C0f 0xc19bf174u 122 | #define SHA256C10 0xe49b69c1u 123 | #define SHA256C11 0xefbe4786u 124 | #define SHA256C12 0x0fc19dc6u 125 | #define SHA256C13 0x240ca1ccu 126 | #define SHA256C14 0x2de92c6fu 127 | #define SHA256C15 0x4a7484aau 128 | #define SHA256C16 0x5cb0a9dcu 129 | #define SHA256C17 0x76f988dau 130 | #define SHA256C18 0x983e5152u 131 | #define SHA256C19 0xa831c66du 132 | #define SHA256C1a 0xb00327c8u 133 | #define SHA256C1b 0xbf597fc7u 134 | #define SHA256C1c 0xc6e00bf3u 135 | #define SHA256C1d 0xd5a79147u 136 | #define SHA256C1e 0x06ca6351u 137 | #define SHA256C1f 0x14292967u 138 | #define SHA256C20 0x27b70a85u 139 | #define SHA256C21 0x2e1b2138u 140 | #define SHA256C22 0x4d2c6dfcu 141 | #define SHA256C23 0x53380d13u 142 | #define SHA256C24 0x650a7354u 143 | #define SHA256C25 0x766a0abbu 144 | #define SHA256C26 0x81c2c92eu 145 | #define SHA256C27 0x92722c85u 146 | #define SHA256C28 0xa2bfe8a1u 147 | #define SHA256C29 0xa81a664bu 148 | #define SHA256C2a 0xc24b8b70u 149 | #define SHA256C2b 0xc76c51a3u 150 | #define SHA256C2c 0xd192e819u 151 | #define SHA256C2d 0xd6990624u 152 | #define SHA256C2e 0xf40e3585u 153 | #define SHA256C2f 0x106aa070u 154 | #define SHA256C30 0x19a4c116u 155 | #define SHA256C31 0x1e376c08u 156 | #define SHA256C32 0x2748774cu 157 | #define SHA256C33 0x34b0bcb5u 158 | #define SHA256C34 0x391c0cb3u 159 | #define SHA256C35 0x4ed8aa4au 160 | #define SHA256C36 0x5b9cca4fu 161 | #define SHA256C37 0x682e6ff3u 162 | #define SHA256C38 0x748f82eeu 163 | #define SHA256C39 0x78a5636fu 164 | #define SHA256C3a 0x84c87814u 165 | #define SHA256C3b 0x8cc70208u 166 | #define SHA256C3c 0x90befffau 167 | #define SHA256C3d 0xa4506cebu 168 | #define SHA256C3e 0xbef9a3f7u 169 | #define SHA256C3f 0xc67178f2u 170 | 171 | __constant uint k_sha256[64] = 172 | { 173 | SHA256C00, SHA256C01, SHA256C02, SHA256C03, 174 | SHA256C04, SHA256C05, SHA256C06, SHA256C07, 175 | SHA256C08, SHA256C09, SHA256C0a, SHA256C0b, 176 | SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f, 177 | SHA256C10, SHA256C11, SHA256C12, SHA256C13, 178 | SHA256C14, SHA256C15, SHA256C16, SHA256C17, 179 | SHA256C18, SHA256C19, SHA256C1a, SHA256C1b, 180 | SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f, 181 | SHA256C20, SHA256C21, SHA256C22, SHA256C23, 182 | 
SHA256C24, SHA256C25, SHA256C26, SHA256C27, 183 | SHA256C28, SHA256C29, SHA256C2a, SHA256C2b, 184 | SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f, 185 | SHA256C30, SHA256C31, SHA256C32, SHA256C33, 186 | SHA256C34, SHA256C35, SHA256C36, SHA256C37, 187 | SHA256C38, SHA256C39, SHA256C3a, SHA256C3b, 188 | SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, 189 | }; 190 | 191 | #define SHA256_STEP(F0a,F1a,a,b,c,d,e,f,g,h,x,K) \ 192 | { \ 193 | h += K; \ 194 | h += x; \ 195 | h += S3 (e); \ 196 | h += F1a (e,f,g); \ 197 | d += h; \ 198 | h += S2 (a); \ 199 | h += F0a (a,b,c); \ 200 | } 201 | 202 | #define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) 203 | 204 | static void sha256_process2 (const unsigned int *W, unsigned int *digest) 205 | { 206 | unsigned int a = digest[0]; 207 | unsigned int b = digest[1]; 208 | unsigned int c = digest[2]; 209 | unsigned int d = digest[3]; 210 | unsigned int e = digest[4]; 211 | unsigned int f = digest[5]; 212 | unsigned int g = digest[6]; 213 | unsigned int h = digest[7]; 214 | 215 | unsigned int w0_t = W[0]; 216 | unsigned int w1_t = W[1]; 217 | unsigned int w2_t = W[2]; 218 | unsigned int w3_t = W[3]; 219 | unsigned int w4_t = W[4]; 220 | unsigned int w5_t = W[5]; 221 | unsigned int w6_t = W[6]; 222 | unsigned int w7_t = W[7]; 223 | unsigned int w8_t = W[8]; 224 | unsigned int w9_t = W[9]; 225 | unsigned int wa_t = W[10]; 226 | unsigned int wb_t = W[11]; 227 | unsigned int wc_t = W[12]; 228 | unsigned int wd_t = W[13]; 229 | unsigned int we_t = W[14]; 230 | unsigned int wf_t = W[15]; 231 | 232 | #define ROUND_EXPAND(i) \ 233 | { \ 234 | w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ 235 | w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ 236 | w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ 237 | w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ 238 | w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ 239 | w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ 240 | w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ 241 | w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ 242 | w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ 243 | w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ 244 | wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ 245 | wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ 246 | wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ 247 | wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ 248 | we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ 249 | wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ 250 | } 251 | 252 | #define ROUND_STEP(i) \ 253 | { \ 254 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ 255 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ 256 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ 257 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ 258 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ 259 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ 260 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ 261 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ 262 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ 263 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ 264 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ 265 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ 266 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ 267 | SHA256_STEP (F0, F1, d, e, 
f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ 268 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ 269 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ 270 | } 271 | 272 | ROUND_STEP (0); 273 | 274 | ROUND_EXPAND(); 275 | ROUND_STEP(16); 276 | 277 | ROUND_EXPAND(); 278 | ROUND_STEP(32); 279 | 280 | ROUND_EXPAND(); 281 | ROUND_STEP(48); 282 | 283 | digest[0] += a; 284 | digest[1] += b; 285 | digest[2] += c; 286 | digest[3] += d; 287 | digest[4] += e; 288 | digest[5] += f; 289 | digest[6] += g; 290 | digest[7] += h; 291 | } 292 | 293 | #define def_hash(funcName, passTag, hashTag) \ 294 | /* The main hashing function */ \ 295 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 296 | { \ 297 | int plen=pass_len/4; \ 298 | if (mod(pass_len,4)) plen++; \ 299 | \ 300 | hashTag unsigned int* p = hash; \ 301 | \ 302 | unsigned int W[0x10]={0}; \ 303 | int loops=plen; \ 304 | int curloop=0; \ 305 | unsigned int State[8]={0}; \ 306 | State[0] = 0x6a09e667; \ 307 | State[1] = 0xbb67ae85; \ 308 | State[2] = 0x3c6ef372; \ 309 | State[3] = 0xa54ff53a; \ 310 | State[4] = 0x510e527f; \ 311 | State[5] = 0x9b05688c; \ 312 | State[6] = 0x1f83d9ab; \ 313 | State[7] = 0x5be0cd19; \ 314 | \ 315 | while (loops>0) \ 316 | { \ 317 | W[0x0]=0x0; \ 318 | W[0x1]=0x0; \ 319 | W[0x2]=0x0; \ 320 | W[0x3]=0x0; \ 321 | W[0x4]=0x0; \ 322 | W[0x5]=0x0; \ 323 | W[0x6]=0x0; \ 324 | W[0x7]=0x0; \ 325 | W[0x8]=0x0; \ 326 | W[0x9]=0x0; \ 327 | W[0xA]=0x0; \ 328 | W[0xB]=0x0; \ 329 | W[0xC]=0x0; \ 330 | W[0xD]=0x0; \ 331 | W[0xE]=0x0; \ 332 | W[0xF]=0x0; \ 333 | \ 334 | for (int m=0;loops!=0 && m<16;m++) \ 335 | { \ 336 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 337 | loops--; \ 338 | } \ 339 | \ 340 | if (loops==0 && mod(pass_len,64)!=0) \ 341 | { \ 342 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 343 | int v=mod(pass_len,64); \ 344 | W[v/4]|=SWAP(padding); \ 345 | if ((pass_len&0x3B)!=0x3B) \ 346 | { \ 347 | /* Let's add length */ \ 348 | W[0x0F]=pass_len*8; \ 349 | } \ 350 | } \ 351 | \ 352 | sha256_process2(W,State); \ 353 | curloop++; \ 354 | } \ 355 | \ 356 | if (mod(plen,16)==0) \ 357 | { \ 358 | W[0x0]=0x0; \ 359 | W[0x1]=0x0; \ 360 | W[0x2]=0x0; \ 361 | W[0x3]=0x0; \ 362 | W[0x4]=0x0; \ 363 | W[0x5]=0x0; \ 364 | W[0x6]=0x0; \ 365 | W[0x7]=0x0; \ 366 | W[0x8]=0x0; \ 367 | W[0x9]=0x0; \ 368 | W[0xA]=0x0; \ 369 | W[0xB]=0x0; \ 370 | W[0xC]=0x0; \ 371 | W[0xD]=0x0; \ 372 | W[0xE]=0x0; \ 373 | W[0xF]=0x0; \ 374 | if ((pass_len&0x3B)!=0x3B) \ 375 | { \ 376 | word padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 377 | W[0]|=SWAP(padding); \ 378 | } \ 379 | /* Let's add length */ \ 380 | W[0x0F]=pass_len*8; \ 381 | \ 382 | sha256_process2(W,State); \ 383 | } \ 384 | \ 385 | p[0]=SWAP(State[0]); \ 386 | p[1]=SWAP(State[1]); \ 387 | p[2]=SWAP(State[2]); \ 388 | p[3]=SWAP(State[3]); \ 389 | p[4]=SWAP(State[4]); \ 390 | p[5]=SWAP(State[5]); \ 391 | p[6]=SWAP(State[6]); \ 392 | p[7]=SWAP(State[7]); \ 393 | return; \ 394 | } 395 | 396 | def_hash(hash_global, __global, __global) 397 | def_hash(hash_private, __private, __private) 398 | def_hash(hash_glbl_to_priv, __global, __private) 399 | def_hash(hash_priv_to_glbl, __private, __global) 400 | 401 | #undef F0 402 | #undef F1 403 | #undef S0 404 | #undef S1 405 | #undef S2 406 | #undef S3 407 | 408 | #undef mod 409 | #undef shr32 410 | #undef rotl32 411 | 412 | __kernel void hash_main(__global const inbuf * inbuffer, __global outbuf * outbuffer) 413 | { 414 | 
unsigned int idx = get_global_id(0); 415 | // unsigned int hash[32/4]={0}; 416 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 417 | } 418 | /* 419 | pbkdf2 and HMAC implementation 420 | requires implementation of PRF (pseudo-random function), 421 | probably using HMAC and an implementation of hash_main 422 | */ 423 | /* 424 | REQ: outBuf.buffer must have space for ceil(dkLen / PRF_output_bytes) * PRF_output_bytes 425 | REQ: PRF implementation MUST allow that output may be the salt (m in hmac) 426 | inBuffer / pwdBuffer / the like are not const to allow for padding 427 | */ 428 | 429 | // Determine (statically) the actual required buffer size 430 | // Correct for both 64 & 32 bit 431 | // Just allowing for MD padding: 2 words for length, 1 for the 1-pad = 3 words 432 | #define sizeForHash(reqSize) (ceilDiv((reqSize) + 2 + 1, 16) * 16) 433 | 434 | __constant const unsigned int opad = 0x5c5c5c5c; 435 | __constant const unsigned int ipad = 0x36363636; 436 | 437 | __constant const word xoredPad = opad ^ ipad; 438 | 439 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 440 | // #define m_buffer_size (8 + 1) 441 | 442 | static void hmac(__global word *K, const word K_len_bytes, 443 | const word *m, const word m_len_bytes, word *output) 444 | { 445 | // REQ: If K_len_bytes isn't divisible by 4/8, final word should be clean (0s to the end) 446 | // REQ: s digestSize is a multiple of 4/8 bytes 447 | 448 | /* Declare the space for input to the last hash function: 449 | Compute and write K_ ^ opad to the first block of this. This will be the only place that we store K_ */ 450 | 451 | word input_2[16 + 8] = {0}; 452 | word end; 453 | if (K_len_bytes <= 64) 454 | { 455 | end = ceilDiv(K_len_bytes, wordSize); 456 | // XOR with opad and slightly pad with zeros.. 457 | input_2[0] = K[0] ^ opad; 458 | input_2[1] = K[1] ^ opad; 459 | input_2[2] = K[2] ^ opad; 460 | input_2[3] = K[3] ^ opad; 461 | input_2[4] = K[4] ^ opad; 462 | input_2[5] = K[5] ^ opad; 463 | input_2[6] = K[6] ^ opad; 464 | input_2[7] = K[7] ^ opad; 465 | input_2[8] = K[8] ^ opad; 466 | input_2[9] = K[9] ^ opad; 467 | input_2[0xA] = K[0xA] ^ opad; 468 | input_2[0xB] = K[0xB] ^ opad; 469 | input_2[0xC] = K[0xC] ^ opad; 470 | input_2[0xD] = K[0xD] ^ opad; 471 | input_2[0xE] = K[0xE] ^ opad; 472 | input_2[0xF] = K[0xF] ^ opad; 473 | } else { 474 | end = 8; 475 | // Hash K to get K'. XOR with opad.. 476 | hash_glbl_to_priv(K, K_len_bytes, input_2); 477 | input_2[0] ^= opad; 478 | input_2[1] ^= opad; 479 | input_2[2] ^= opad; 480 | input_2[3] ^= opad; 481 | input_2[4] ^= opad; 482 | input_2[5] ^= opad; 483 | input_2[6] ^= opad; 484 | input_2[7] ^= opad; 485 | input_2[8] = opad; 486 | input_2[9] = opad; 487 | input_2[0xA] = opad; 488 | input_2[0xB] = opad; 489 | input_2[0xC] = opad; 490 | input_2[0xD] = opad; 491 | input_2[0xE] = opad; 492 | input_2[0xF] = opad; 493 | } 494 | // Copy K' ^ ipad into the first block. 495 | // Be careful: hash needs a whole block after the end. 
ceilDiv from buffer_structs 496 | // Slightly ugly: large enough for hmac_main usage, and tight for pbkdf2 497 | // #define m_buffer_size (8 + 1) 498 | // K' ^ ipad into the first block 499 | word input_1[16 + 9] = {0}; 500 | 501 | input_1[0] = input_2[0]^xoredPad; 502 | input_1[1] = input_2[1]^xoredPad; 503 | input_1[2] = input_2[2]^xoredPad; 504 | input_1[3] = input_2[3]^xoredPad; 505 | input_1[4] = input_2[4]^xoredPad; 506 | input_1[5] = input_2[5]^xoredPad; 507 | input_1[6] = input_2[6]^xoredPad; 508 | input_1[7] = input_2[7]^xoredPad; 509 | input_1[8] = input_2[8]^xoredPad; 510 | input_1[9] = input_2[9]^xoredPad; 511 | input_1[0xA] = input_2[0xA]^xoredPad; 512 | input_1[0xB] = input_2[0xB]^xoredPad; 513 | input_1[0xC] = input_2[0xC]^xoredPad; 514 | input_1[0xD] = input_2[0xD]^xoredPad; 515 | input_1[0xE] = input_2[0xE]^xoredPad; 516 | input_1[0xF] = input_2[0xF]^xoredPad; 517 | 518 | 519 | // Slightly inefficient copying m in.. 520 | word m_len_word = ceilDiv(m_len_bytes, wordSize); 521 | for (int j = 0; j < m_len_word; j++){ 522 | input_1[16 + j] = m[j]; 523 | } 524 | 525 | // Hash input1 into the second half of input2 526 | word leng = 64 + m_len_bytes; 527 | hash_private(input_1, leng, input_2 + 16); 528 | 529 | // Hash input2 into output! 530 | hash_private(input_2, 64 + 32, output); 531 | } 532 | 533 | #undef sizeForHash 534 | 535 | 536 | // PRF 537 | // Our PRF is the hmac using the hash. Commas remove need for bracketing 538 | #define PRF(pwd, pwdLen_bytes, salt, saltLen_bytes, output) \ 539 | hmac(pwd, pwdLen_bytes, salt, saltLen_bytes, output) 540 | 541 | 542 | static void F(__global word *pwd, const word pwdLen_bytes, 543 | word *salt, const word saltLen_bytes, 544 | const unsigned int iters, unsigned int callI, 545 | __global word *output) 546 | { 547 | // ASSUMPTION: salt array has wordSize bytes more room 548 | // Note salt is not const, so we can efficiently tweak the end of it 549 | 550 | // Add the integer to the end of the salt 551 | // NOTE! Always adding callI as just a u32 552 | //word overhang = saltLen_bytes % wordSize; 553 | word overhang=((saltLen_bytes)-((saltLen_bytes)/(wordSize)*(wordSize))); 554 | overhang *= 8; // convert to bits 555 | word saltLastI = saltLen_bytes / wordSize; 556 | 557 | // ! Crucial line: BE, moved as if it's a u32 but still within the word 558 | word be_callI = SWAP((word)callI) >> (8*(wordSize-4)); 559 | if (overhang>0) 560 | { 561 | salt[saltLastI] |= be_callI << overhang; 562 | salt[saltLastI+1] = be_callI >> ((8*wordSize)-overhang); 563 | } 564 | else 565 | { 566 | salt[saltLastI]=be_callI; 567 | } 568 | 569 | // Make initial call, copy into output 570 | // This copy is avoidable, but only with __global / __private macro stuff 571 | word u[8] = {0}; 572 | // +4 is correct even for 64 bit 573 | PRF(pwd, pwdLen_bytes, salt, saltLen_bytes + 4, u); 574 | output[0] = u[0]; 575 | output[1] = u[1]; 576 | output[2] = u[2]; 577 | output[3] = u[3]; 578 | output[4] = u[4]; 579 | output[5] = u[5]; 580 | output[6] = u[6]; 581 | output[7] = u[7]; 582 | 583 | // Perform all the iterations, reading salt from- AND writing to- u. 
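    // Each PRF call below is a full HMAC-SHA256 over the 64-byte padded key block plus
    // the 32-byte previous output, i.e. four sha256_process2 compressions per iteration;
    // unlike pbkdf2_sha1_32.cl, no ipad/opad midstate is precomputed here, which keeps
    // the code generic at some cost in speed.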
584 | for (unsigned int j = 1; j < iters; j++){ 585 | PRF(pwd, pwdLen_bytes, u, 32, u); 586 | output[0]^=u[0]; 587 | output[1]^=u[1]; 588 | output[2]^=u[2]; 589 | output[3]^=u[3]; 590 | output[4]^=u[4]; 591 | output[5]^=u[5]; 592 | output[6]^=u[6]; 593 | output[7]^=u[7]; 594 | } 595 | } 596 | 597 | __kernel void pbkdf2(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer, 598 | __private unsigned int iters, __private unsigned int dkLen_bytes) 599 | { 600 | 601 | unsigned int idx = get_global_id(0); 602 | word pwdLen_bytes = inbuffer[idx].length; 603 | __global word *pwdBuffer = inbuffer[idx].buffer; 604 | __global word *currOutBuffer = outbuffer[idx].buffer; 605 | 606 | // Copy salt so that we can write our integer into the last 4 bytes 607 | word saltLen_bytes = saltbuffer[0].length; 608 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 609 | word personal_salt[8+2] = {0}; 610 | 611 | personal_salt[0] = saltbuffer[0].buffer[0]; 612 | personal_salt[1] = saltbuffer[0].buffer[1]; 613 | personal_salt[2] = saltbuffer[0].buffer[2]; 614 | personal_salt[3] = saltbuffer[0].buffer[3]; 615 | personal_salt[4] = saltbuffer[0].buffer[4]; 616 | personal_salt[5] = saltbuffer[0].buffer[5]; 617 | personal_salt[6] = saltbuffer[0].buffer[6]; 618 | personal_salt[7] = saltbuffer[0].buffer[7]; 619 | 620 | // Determine the number of calls to F that we need to make 621 | unsigned int nBlocks = ceilDiv(dkLen_bytes, 32); 622 | for (unsigned int j = 1; j <= nBlocks; j++) 623 | { 624 | F(pwdBuffer, pwdLen_bytes, personal_salt, saltbuffer[0].length, iters, j, currOutBuffer); 625 | currOutBuffer += 8; 626 | } 627 | } 628 | 629 | 630 | // Exposing HMAC in the same way. Useful for testing atleast. 631 | __kernel void hmac_main(__global inbuf *inbuffer, __global const saltbuf *saltbuffer, __global outbuf *outbuffer) 632 | { 633 | unsigned int idx = get_global_id(0); 634 | word pwdLen_bytes = inbuffer[idx].length; 635 | __global word *pwdBuffer = inbuffer[idx].buffer; 636 | 637 | // Copy salt just to cheer the compiler up 638 | int saltLen_bytes = (int)saltbuffer[0].length; 639 | int saltLen = ceilDiv(saltLen_bytes, wordSize); 640 | word personal_salt[8] = {0}; 641 | 642 | personal_salt[0] = saltbuffer[0].buffer[0]; 643 | personal_salt[1] = saltbuffer[0].buffer[1]; 644 | personal_salt[2] = saltbuffer[0].buffer[2]; 645 | personal_salt[3] = saltbuffer[0].buffer[3]; 646 | personal_salt[4] = saltbuffer[0].buffer[4]; 647 | personal_salt[5] = saltbuffer[0].buffer[5]; 648 | personal_salt[6] = saltbuffer[0].buffer[6]; 649 | personal_salt[7] = saltbuffer[0].buffer[7]; 650 | 651 | // Call hmac, with local 652 | word out[8]; 653 | 654 | hmac(pwdBuffer, pwdLen_bytes, personal_salt, saltLen_bytes, out); 655 | 656 | outbuffer[idx].buffer[0] = out[0]; 657 | outbuffer[idx].buffer[1] = out[1]; 658 | outbuffer[idx].buffer[2] = out[2]; 659 | outbuffer[idx].buffer[3] = out[3]; 660 | outbuffer[idx].buffer[4] = out[4]; 661 | outbuffer[idx].buffer[5] = out[5]; 662 | outbuffer[idx].buffer[6] = out[6]; 663 | outbuffer[idx].buffer[7] = out[7]; 664 | } -------------------------------------------------------------------------------- /Library/worker/generic/sCrypt.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Scrypt OpenCL Optimized kernel 3 | (c) C.B. and B. Kerler 2018-2019 4 | MIT License 5 | */ 6 | 7 | // [Lines 1 and 2 are for defining N and invMemoryDensity, and must be blank] 8 | 9 | /* 10 | sCrypt kernel.. 
or just ROMix really, for use with my sBrute PyOpenCL core 11 | Originally adapted from Bjorn Kerler's opencl_brute 12 | 13 | Follows the variable names of Wikipedia's pseudocode: 14 | https://en.wikipedia.org/wiki/Scrypt#Algorithm 15 | Function/macro convention is F(output, input_1, input_2, ..), i.e. output first. 16 | Generally work with pointers. 17 | 18 | === Design choices & reasoning ================================================= 19 | 20 | > initial and final pbkdf2s are left to python for a few reasons: 21 | - vastly simpler cl code, hopefully giving us better optimisation 22 | - reduced bugs 23 | - simpler parallelisation across the parameter 'p' 24 | - not a burden on python: work is tiny.. 25 | & the special sBrute python core is careful that any such work happens while the GPUs are busy 26 | 27 | > salsa20 is sort of in-place 28 | - fundamentally needs to copy the input internally 29 | - does (hopefully) make savings by having input = output, making the algo: 30 | orig_input <- input 31 | Process(input) // in-place 32 | input ^= orig_input 33 | where the last line should be faster than output = input ^ orig_input 34 | 35 | > JUMBLES! 36 | - jumble(Y0|Y1|..|Y_2r-1) = Y0|Y2|..|Y_2r-2 | Y1|Y3|..|Y_2r-1, 37 | which is effectively performed at the end of BlockMix in the original definition 38 | - jumble is of order 4, i.e. jumble^4 = id 39 | - we want to avoid doing this copying.. 40 | - naturally we unroll the loop in BlockMix, so reordering the input is free 41 | => all this leads to us working in 4 different states of "jumbled-ness" throughout the program 42 | - indeed our V[j]s are jumbled j % 4 times. 43 | - xoring the V[j]'s back onto a (somewhat jumbled) X in the 2nd loop effectively requires a function call 44 | 45 | > Salsa function is long, so can't be macro-ed and called lots of times. 46 | - We could have kept the BlockMix loop, 47 | but this would require reading the jumble index from an array each iteration 48 | - Instead we make Salsa a void Function 49 | - Also an xor loop is moved into Salsa, so that we can unroll it, 50 | at the small cost of an extra parameter 51 | 52 | > All values except our huge V array are kept locally. 53 | - V[j] is accessed and xored onto a local array. 54 | 55 | > After a long battle, the Salsa20/8's 4-pairs-of-rounds loop is unrolled. 56 | - Program size should still be fine. 57 | 58 | > using "= {0}" to initialise local arrays is the classic fix copied from Bjorn Kerler's code: 59 | seems to be necessary to actually make the program work, even though it should have no effect. 60 | 61 | 62 | === FIN ======================================================================== 63 | */ 64 | 65 | 66 | 67 | 68 | // =========================================================================== 69 | // 1 / memory density 70 | #ifndef invMemoryDensity 71 | #define invMemoryDensity 1 72 | #endif 73 | #define iMD_is_pow_2 (!(invMemoryDensity & (invMemoryDensity - 1)) && invMemoryDensity) 74 | 75 | 76 | // sCrypt constants : 77 | // - p irrelevant to us 78 | // - r below cannot be changed (without altering the program) 79 | // > makes the 'jumble' operation order 4 80 | // - N can be changed if necessary, up until we run out of buffer space (so maybe <= 20?) 81 | #ifndef N 82 | #define N 15 // <= 20?
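// (With N = 15 each work item's V array is 2^15 blocks of 1024 bytes = 32 MiB,
//  divided by invMemoryDensity; N = 20 would already need 1 GiB per work item at
//  density 1, hence the "<= 20?" above.)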
83 | #endif 84 | 85 | #define mod(x,y) ((x)-((x)/(y)*(y))) 86 | 87 | #define r 8 // CAN'T BE CHANGED 88 | 89 | // derivatives of constants :s 90 | #define blockSize_bytes (128 * r) // 1024 91 | #define ceilDiv(n,d) (((n) + (d) - 1) / (d)) 92 | #define blockSize_int32 ceilDiv(blockSize_bytes, 4) // 256 93 | #define iterations (1 << N) 94 | 95 | // Useful struct for internal processing: a lump of 64 bytes (sort of an atomic unit) 96 | typedef struct { 97 | unsigned int buffer[16]; // 64 bytes 98 | } T_Lump64; 99 | 100 | // Comfy Block struct 101 | typedef struct { 102 | T_Lump64 lump[2*r]; // 1024 bytes 103 | } T_Block; 104 | 105 | // Struct for the large V array which needs to be pseduo-randomly accessed. 106 | // Now restricted in length by invMemoryDensity 107 | typedef struct { 108 | T_Block blk[ceilDiv(iterations, invMemoryDensity)]; 109 | } T_HugeArray; 110 | 111 | 112 | 113 | 114 | 115 | 116 | // =========================================================================== 117 | // Simple macros 118 | // Lump & Block macros take pointers 119 | 120 | #define copy16_unrolled(dest,src) \ 121 | /* dest[i] = src[i] for i in [0..16) */ \ 122 | { \ 123 | dest[0] = src[0]; \ 124 | dest[1] = src[1]; \ 125 | dest[2] = src[2]; \ 126 | dest[3] = src[3]; \ 127 | dest[4] = src[4]; \ 128 | dest[5] = src[5]; \ 129 | dest[6] = src[6]; \ 130 | dest[7] = src[7]; \ 131 | dest[8] = src[8]; \ 132 | dest[9] = src[9]; \ 133 | dest[10] = src[10]; \ 134 | dest[11] = src[11]; \ 135 | dest[12] = src[12]; \ 136 | dest[13] = src[13]; \ 137 | dest[14] = src[14]; \ 138 | dest[15] = src[15]; \ 139 | } 140 | 141 | #define xor16_unrolled(dest,src) \ 142 | /* dest[i] ^= src[i] for i in [0..16) */ \ 143 | { \ 144 | dest[0] ^= src[0]; \ 145 | dest[1] ^= src[1]; \ 146 | dest[2] ^= src[2]; \ 147 | dest[3] ^= src[3]; \ 148 | dest[4] ^= src[4]; \ 149 | dest[5] ^= src[5]; \ 150 | dest[6] ^= src[6]; \ 151 | dest[7] ^= src[7]; \ 152 | dest[8] ^= src[8]; \ 153 | dest[9] ^= src[9]; \ 154 | dest[10] ^= src[10]; \ 155 | dest[11] ^= src[11]; \ 156 | dest[12] ^= src[12]; \ 157 | dest[13] ^= src[13]; \ 158 | dest[14] ^= src[14]; \ 159 | dest[15] ^= src[15]; \ 160 | } 161 | 162 | #define add16_unrolled(dest, src) \ 163 | /* dest[i] += src[i] for i in [0..16) */ \ 164 | { \ 165 | dest[0] += src[0]; \ 166 | dest[1] += src[1]; \ 167 | dest[2] += src[2]; \ 168 | dest[3] += src[3]; \ 169 | dest[4] += src[4]; \ 170 | dest[5] += src[5]; \ 171 | dest[6] += src[6]; \ 172 | dest[7] += src[7]; \ 173 | dest[8] += src[8]; \ 174 | dest[9] += src[9]; \ 175 | dest[10] += src[10]; \ 176 | dest[11] += src[11]; \ 177 | dest[12] += src[12]; \ 178 | dest[13] += src[13]; \ 179 | dest[14] += src[14]; \ 180 | dest[15] += src[15]; \ 181 | } 182 | 183 | #define copyLump64_unrolled(dest, src) \ 184 | /* &dest = &src */ \ 185 | { \ 186 | copy16_unrolled(dest->buffer, src->buffer) \ 187 | } 188 | 189 | #define xorLump64_unrolled(dest, src) \ 190 | /* &dest ^= &src */ \ 191 | { \ 192 | xor16_unrolled(dest->buffer, src->buffer) \ 193 | } 194 | 195 | #define copyBlock_halfrolled(destTag, dest, srcTag, src) \ 196 | /* [destTag] &dest = [srcTag] &src, copying lumps of 64 in a loop */ \ 197 | { \ 198 | destTag T_Lump64* _CB_d; \ 199 | srcTag T_Lump64* _CB_s; \ 200 | for (int i = 2*r - 1; i >= 0; i--) \ 201 | { \ 202 | _CB_d = &(dest)->lump[i]; \ 203 | _CB_s = &(src)->lump[i]; \ 204 | copyLump64_unrolled(_CB_d, _CB_s) \ 205 | } \ 206 | } 207 | 208 | #define xorBlock_halfrolled(destTag, dest, srcTag, src) \ 209 | /* [destTag] &dest ^= [srcTag] &src, xoring lumps of 64 in a 
loop */ \ 210 | { \ 211 | destTag T_Lump64* _XB_d; \ 212 | srcTag T_Lump64* _XB_s; \ 213 | for (int i = 2*r - 1; i >= 0; i--) \ 214 | { \ 215 | _XB_d = &(dest)->lump[i]; \ 216 | _XB_s = &(src)->lump[i]; \ 217 | xorLump64_unrolled(_XB_d, _XB_s) \ 218 | } \ 219 | } 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | // ========================================================================== 228 | // Debug printing macros 229 | 230 | #define printLump(lump) \ 231 | /* Takes the object not a pointer */ \ 232 | { \ 233 | for (int j = 0; j < 16; j++){ \ 234 | printf("%08X", lump.buffer[j]); \ 235 | } \ 236 | } 237 | 238 | #define printBlock(blk) \ 239 | /* Takes a pointer */ \ 240 | { \ 241 | for (int i = 0; i < 2*r; i++) \ 242 | { \ 243 | printLump(blk->lump[i]) \ 244 | } \ 245 | } 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | // =========================================================================== 254 | // Salsa 20/8 255 | // Adapted from https://en.wikipedia.org/wiki/Salsa20#Structure 256 | 257 | 258 | // Rotation synonym and quarter round for Salsa20 259 | #define rotl32(a,n) rotate((a), (n)) 260 | #define quarterRound(a, b, c, d) \ 261 | /**/ \ 262 | { \ 263 | b ^= rotl32(a + d, 7u); \ 264 | c ^= rotl32(b + a, 9u); \ 265 | d ^= rotl32(c + b, 13u); \ 266 | a ^= rotl32(d + c, 18u); \ 267 | } 268 | 269 | #define pairOfRounds(x) \ 270 | /* Pinched from wikipedia */ \ 271 | { \ 272 | /* Odd round */ \ 273 | quarterRound(x[ 0], x[ 4], x[ 8], x[12]); \ 274 | quarterRound(x[ 5], x[ 9], x[13], x[ 1]); \ 275 | quarterRound(x[10], x[14], x[ 2], x[ 6]); \ 276 | quarterRound(x[15], x[ 3], x[ 7], x[11]); \ 277 | /* Even round */ \ 278 | quarterRound(x[ 0], x[ 1], x[ 2], x[ 3]); \ 279 | quarterRound(x[ 5], x[ 6], x[ 7], x[ 4]); \ 280 | quarterRound(x[10], x[11], x[ 8], x[ 9]); \ 281 | quarterRound(x[15], x[12], x[13], x[14]); \ 282 | } 283 | 284 | // Function not a macro (see 'design choices' at the top) 285 | // Xors X onto lump then computes lump <- Salsa20/8(lump) 286 | void Xor_then_Salsa_20_8_InPlace(__private T_Lump64* lump, __private T_Lump64* X) 287 | { 288 | // Includes xoring here, to allow for unrolling (at expense of an extra param) 289 | xorLump64_unrolled(lump, X) 290 | 291 | // Copy input into x (lowercase) for processing 292 | unsigned int x[16] = {0}; 293 | copy16_unrolled(x, lump->buffer) 294 | 295 | // Do the 8 rounds 296 | // After much internal conflict I have unrolled this loop of 4 297 | pairOfRounds(x) 298 | pairOfRounds(x) 299 | pairOfRounds(x) 300 | pairOfRounds(x) 301 | 302 | // Add x to original input, and store into output.. which is the input :) 303 | add16_unrolled(lump->buffer, x) 304 | } 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | // ==================================================================================== 313 | // BlockMix variants 314 | // Nomenclature of the variants is composition: f_g_h(x) = f(g(h(x))) 315 | 316 | 317 | #define BlockMixLoopBody(_B_i, _BMLB_X) \ 318 | /* My heavily adapted BlockMix loop body */ \ 319 | { \ 320 | /* _B_i = _B_i ^ _BMLB_X 321 | _B_i = Salsa20(_B_i) 322 | _BMLB_X = _B_i (as pointers) 323 | [ Doesn't increment i ] 324 | */ \ 325 | Xor_then_Salsa_20_8_InPlace(_B_i, _BMLB_X);\ 326 | _BMLB_X = _B_i; \ 327 | } 328 | 329 | #define _BlockMix_Generic(B, \ 330 | i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 331 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15) \ 332 | /* Takes {i_0, .. , i_15} a permutation of {0, .. , 15}, the order of indices 333 | i_0 = 0 implied. 
*/ \ 334 | { \ 335 | /* Don't even need to copy to _BM_X, can just point! */ \ 336 | /* Start with _BM_X = B[2r-1] (indexing across blocks of 64 bytes) */ \ 337 | __private T_Lump64* _BM_X = &B->lump[i_15]; \ 338 | __private T_Lump64* _BM_B_i; \ 339 | \ 340 | /* i_0 = 0 */ \ 341 | BlockMixLoopBody(&B->lump[0], _BM_X)\ 342 | _BM_B_i = &B->lump[i_1]; \ 343 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 344 | _BM_B_i = &B->lump[i_2]; \ 345 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 346 | _BM_B_i = &B->lump[i_3]; \ 347 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 348 | \ 349 | _BM_B_i = &B->lump[i_4]; \ 350 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 351 | _BM_B_i = &B->lump[i_5]; \ 352 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 353 | _BM_B_i = &B->lump[i_6]; \ 354 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 355 | _BM_B_i = &B->lump[i_7]; \ 356 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 357 | \ 358 | _BM_B_i = &B->lump[i_8]; \ 359 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 360 | _BM_B_i = &B->lump[i_9]; \ 361 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 362 | _BM_B_i = &B->lump[i_10]; \ 363 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 364 | _BM_B_i = &B->lump[i_11]; \ 365 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 366 | \ 367 | _BM_B_i = &B->lump[i_12]; \ 368 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 369 | _BM_B_i = &B->lump[i_13]; \ 370 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 371 | _BM_B_i = &B->lump[i_14]; \ 372 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 373 | _BM_B_i = &B->lump[i_15]; \ 374 | BlockMixLoopBody(_BM_B_i, _BM_X) \ 375 | } 376 | 377 | 378 | #define BlockMix_J3(B) \ 379 | /* 3 jumbles then a BlockMix */ \ 380 | { \ 381 | _BlockMix_Generic(B, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15) \ 382 | } 383 | 384 | #define J1_BlockMix_J2(B) \ 385 | /* Jumble twice, BlockMixes, then jumbles. */ \ 386 | { \ 387 | _BlockMix_Generic(B, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) \ 388 | } 389 | 390 | #define J2_BlockMix_J1(B) \ 391 | /* Jumbles, BlockMixes, then 2 jumbles. */ \ 392 | { \ 393 | _BlockMix_Generic(B, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) \ 394 | } 395 | 396 | #define J3_BlockMix(B) \ 397 | /* BlockMix followed by 3 jumbles (i.e. a jumble-inverse) */ \ 398 | { \ 399 | _BlockMix_Generic(B, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) \ 400 | } 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | // =============================================================================== 410 | // Integerify: gets it's own section 411 | 412 | #define Integerify(j, block) \ 413 | /* Observe that the last 64 bytes is the last lump */ \ 414 | /* Correct regardless of the jumbled-ness of the block! */ \ 415 | /* Requires N <= 32 */ \ 416 | { \ 417 | j = mod(block->lump[15].buffer[0],iterations); \ 418 | } 419 | 420 | 421 | 422 | 423 | 424 | 425 | // =============================================================================== 426 | // Xoring methods for the 4 states of jumbled-ness 427 | // Culminates in the 'recover_and_xor_appropriately' function, which selects the correct one. 428 | 429 | #define _xor_generic(dest, srcTag, src, \ 430 | i_0, i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 431 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15) \ 432 | /* dest ^= perm(src), xor permuted source on, k -> i_k the permutation. 
433 | requires src disjoint from dest : guaranteed by address spaces */ \ 434 | { \ 435 | __private T_Lump64* _XB_d; \ 436 | srcTag T_Lump64* _XB_s; \ 437 | const int perm[16] = {i_0, i_1, i_2, i_3, i_4, i_5, i_6, i_7, \ 438 | i_8, i_9, i_10, i_11, i_12, i_13, i_14, i_15}; \ 439 | for (int i = 2*r - 1; i >= 0; i--) \ 440 | { \ 441 | _XB_d = &(dest)->lump[i]; \ 442 | /* Select perm index instead of index */ \ 443 | _XB_s = &(src)->lump[perm[i]]; \ 444 | xorLump64_unrolled(_XB_d, _XB_s) \ 445 | } \ 446 | } 447 | 448 | #define xor_J1(dest, srcTag, src) \ 449 | { \ 450 | _xor_generic(dest, srcTag, src, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) \ 451 | } 452 | 453 | #define xor_J2(dest, srcTag, src) \ 454 | { \ 455 | _xor_generic(dest, srcTag, src, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) \ 456 | } 457 | 458 | #define xor_J3(dest, srcTag, src) \ 459 | { \ 460 | _xor_generic(dest, srcTag, src, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15) \ 461 | } 462 | 463 | // Chooses the appropriate xoring based on the supplied value diff, which is modded by 4 464 | // diff is such that jumble^diff(inp) is 'equally jumbled' as out 465 | // diff will be pseudorandom, so case statement should maximise efficiency. 466 | // Now also recomputes V'[j] from V[j // density] 467 | void recover_and_xor_appropriately(__private T_Block* dest, __global T_Block* V, 468 | unsigned int j, unsigned int diff){ 469 | 470 | // Number of computations to make. 471 | int nComps = mod(j,invMemoryDensity); 472 | int V_index = j / invMemoryDensity; 473 | 474 | if (nComps == 0){ 475 | label_nComps_is_zero: 476 | // Do the xoring directly from the global block V[V_index] 477 | // Basically the old "xor_appropriately" 478 | switch(mod(diff,4)){ 479 | case 0: 480 | xorBlock_halfrolled(__private, dest, __global, &V[V_index]) 481 | break; 482 | case 1: 483 | xor_J1(dest, __global, &V[V_index]) 484 | break; 485 | case 2: 486 | xor_J2(dest, __global, &V[V_index]) 487 | break; 488 | case 3: 489 | xor_J3(dest, __global, &V[V_index]) 490 | break; 491 | } 492 | } 493 | else 494 | { 495 | // Copy V[j/iMD] into Y, where we'll do our work 496 | // (using Bjorn's initialisation-bug-prevention once more) 497 | // Observe that this copy is pretty essential 498 | __private unsigned int _Y_bytes[ceilDiv(sizeof(T_Block), 4)] = {0}; 499 | __private T_Block* Y = (T_Block*) _Y_bytes; 500 | copyBlock_halfrolled(__private, Y, __global, &V[V_index]) 501 | 502 | // We have to decide where to enter the loop, based on how jumbled V[V_index] is 503 | // i.e. (V_index * invMemoryDensity) % 4 504 | switch(mod(j - nComps,4)){ 505 | case 0: 506 | goto label_j0; 507 | case 1: 508 | goto label_j3; 509 | case 2: 510 | goto label_j2; 511 | case 3: 512 | goto label_j1; 513 | } 514 | 515 | // Could change to nComps-- .. would save an assembly instruction? :) 516 | do { 517 | label_j0: J3_BlockMix(Y); 518 | if (--nComps == 0){ 519 | break; 520 | } 521 | 522 | label_j3: J2_BlockMix_J1(Y); 523 | if (--nComps == 0){ 524 | break; 525 | } 526 | 527 | label_j2: J1_BlockMix_J2(Y); 528 | if (--nComps == 0){ 529 | break; 530 | } 531 | 532 | label_j1: BlockMix_J3(Y); 533 | } while (--nComps > 0); 534 | 535 | 536 | // With Y = V'[j] recovered, we can finish the job off by xoring appropriately. 
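        // (Same dispatch as the nComps == 0 branch above, only xoring from the
        //  recomputed private copy Y instead of straight from global memory.)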
537 | switch(mod(diff,4)){ 538 | case 0: 539 | xorBlock_halfrolled(__private, dest, __private, Y) 540 | break; 541 | case 1: 542 | xor_J1(dest, __private, Y) 543 | break; 544 | case 2: 545 | xor_J2(dest, __private, Y) 546 | break; 547 | case 3: 548 | xor_J3(dest, __private, Y) 549 | break; 550 | } 551 | } 552 | 553 | } 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | // ================================================================================== 564 | // The big one: ROMix kernel 565 | 566 | __kernel void ROMix( __global T_Block* blocksFlat, 567 | __global T_HugeArray* hugeArraysFlat, 568 | __global T_Block* outputsFlat 569 | ) 570 | { 571 | // Get our id and so unflatten our block & huge array 'V', to get pointers 572 | // &arr[i] and arr + i should be equivalent syntax? 573 | __private unsigned int id = get_global_id(0); 574 | __global T_Block* origBlock = &blocksFlat[id]; 575 | __global T_Block* outputBlock = &outputsFlat[id]; 576 | __global T_Block* V = hugeArraysFlat[id].blk; 577 | __global T_Block* curr_V_blk = V; 578 | 579 | // Copy our block into local X : could roll fully 580 | // slightly weird to allow for Bjorn's bug-preventing-initialisation 581 | __private unsigned int _X_bytes[ceilDiv(sizeof(T_Block), 4)] = {0}; 582 | __private T_Block* X = (T_Block*) _X_bytes; 583 | copyBlock_halfrolled(__private, X, __global, origBlock) 584 | 585 | 586 | 587 | // ===================================================== 588 | // 1st loop, fill V with the correct values, in varying states of jumbled-ness: 589 | // Let V' be the correct value. d the invMemoryDensity 590 | // d*i mod 4 || state in V[i] 591 | // ============================================ 592 | // 0 || V'[d*i] 593 | // 1 || J^3(V'[d*i]) 594 | // 2 || J^2(V'[d*i]) 595 | // 3 || J^1(V'[d*i]) 596 | // Now only storing the first in every invMemoryDensity 597 | 598 | #define maybeStore(curr_V_blk, X, _j) \ 599 | /* If due, stores X to curr_V_blk and increments it */ \ 600 | { \ 601 | if (mod(_j,invMemoryDensity) == 0){ \ 602 | copyBlock_halfrolled(__global, curr_V_blk, __private, X); \ 603 | curr_V_blk++; \ 604 | } \ 605 | } 606 | 607 | // Still needs to do all 'iterations' loops, to compute the final X 608 | for (int j = 0; j < iterations; j+=4){ 609 | maybeStore(curr_V_blk, X, j) 610 | J3_BlockMix(X); 611 | 612 | maybeStore(curr_V_blk, X, j+1) 613 | J2_BlockMix_J1(X); 614 | 615 | maybeStore(curr_V_blk, X, j+2) 616 | J1_BlockMix_J2(X); 617 | 618 | maybeStore(curr_V_blk, X, j+3) 619 | BlockMix_J3(X); 620 | } 621 | 622 | #undef maybeStore 623 | 624 | 625 | // ==================================================== 626 | // 2nd loop, similarly X passes through 4 states of jumbled-ness 627 | // Observe that we need to choose our xor based on j-i % 4, 628 | // which adds more complexity compared to the first loop. 629 | 630 | // Moreover we may need to actually recompute the value. 
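    // For reference, the textbook ROMix second loop is roughly:
    //     for (i = 0; i < N; i++) { j = Integerify(X) mod N; X = BlockMix(X ^ V[j]); }
    // Here it is unrolled by 4 because X cycles through the four jumble states, and
    // V[j] may first have to be recomputed from the nearest stored block.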
631 | // => sensibly (in terms of program length) this is in "recover_and_xor_appropriately" 632 | unsigned int j; 633 | for (unsigned int i = 0; i < iterations; i+=4){ 634 | Integerify(j, X) 635 | recover_and_xor_appropriately(X, V, j, j - i); 636 | J3_BlockMix(X); 637 | 638 | Integerify(j, X); 639 | recover_and_xor_appropriately(X, V, j, j - (i+1)); 640 | J2_BlockMix_J1(X); 641 | 642 | Integerify(j, X); 643 | recover_and_xor_appropriately(X, V, j, j - (i+2)); 644 | J1_BlockMix_J2(X); 645 | 646 | Integerify(j, X); 647 | recover_and_xor_appropriately(X, V, j, j - (i+3)); 648 | BlockMix_J3(X); 649 | } 650 | 651 | // Copy to output: could roll fully 652 | copyBlock_halfrolled(__global, outputBlock, __private, X) 653 | } 654 | 655 | 656 | 657 | 658 | 659 | 660 | // =============================================================================== 661 | // For testing, Salsa20's each lump in place 662 | // Same signature as ROMix for ease 663 | __kernel void Salsa20( __global T_Block* blocksFlat, 664 | __global T_HugeArray* hugeArraysFlat, 665 | __global T_Block* outputsFlat) 666 | { 667 | __private unsigned int id = get_global_id(0); 668 | 669 | // Copy locally, initialising first for fear of bugs 670 | __private unsigned int _b[ceilDiv(sizeof(T_Block), 4)] = {0}; 671 | __private T_Block* blk = (T_Block*) _b; 672 | copyBlock_halfrolled(__private, blk, __global, (&blocksFlat[id])) 673 | 674 | // Initialise a zero lump 675 | unsigned int _z[ceilDiv(sizeof(T_Lump64), 4)] = {0}; 676 | T_Lump64* zeroLump = (T_Lump64*)_z; 677 | 678 | // Salsa each lump inPlace 679 | for (int j = 0; j < 2*r; j++) 680 | { 681 | Xor_then_Salsa_20_8_InPlace((&blk->lump[j]), zeroLump); 682 | } 683 | 684 | // Copy to output 685 | __global T_Block* output = &outputsFlat[id]; 686 | copyBlock_halfrolled(__global, output, __private, blk) 687 | } 688 | -------------------------------------------------------------------------------- /Library/worker/generic/sCrypt_Bip38fork.cl: -------------------------------------------------------------------------------- 1 | // Improved OpenCL Scrypt Kernel 2 | // Part of BTCRecover fork jeffersonn-1/btcrecover, licensed under the GNU General Public License v2.0 3 | // 2020 Jefferson Nunn and Gaith 4 | 5 | #define iterations 16384 6 | 7 | #define reorder(B) \ 8 | { \ 9 | __private uint4 tmp[4]; \ 10 | tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); \ 11 | tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); \ 12 | tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w); \ 13 | tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); \ 14 | B[0] = tmp[0]; \ 15 | B[1] = tmp[1]; \ 16 | B[2] = tmp[2]; \ 17 | B[3] = tmp[3]; \ 18 | } \ 19 | 20 | #define undo_reorder(B) \ 21 | { \ 22 | __private uint4 tmp[4]; \ 23 | tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); \ 24 | tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); \ 25 | tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); \ 26 | tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); \ 27 | B[0] = tmp[0]; \ 28 | B[1] = tmp[1]; \ 29 | B[2] = tmp[2]; \ 30 | B[3] = tmp[3]; \ 31 | } \ 32 | 33 | #define copy64(dest, idx_dest, src, idx_src) \ 34 | { \ 35 | dest[idx_dest ] = src[idx_src ]; \ 36 | dest[idx_dest + 1] = src[idx_src + 1]; \ 37 | dest[idx_dest + 2] = src[idx_src + 2]; \ 38 | dest[idx_dest + 3] = src[idx_src + 3]; \ 39 | } \ 40 | 41 | typedef struct { 42 | uint4 buf[64]; 43 | } T_Block; 44 | 45 | void salsa(__private const uint4 Bx[4], __private uint4 B[4]); 46 | void BlockMix(__private T_Block* B); 47 | 48 | void salsa(__private const uint4 Bx[4], __private uint4 B[4]) 49 | { 50 | 
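    // Computes B = Salsa20/8(B ^ Bx) in place: 'reorder' appears to gather the 4x4
    // state into its (wrapped) diagonals so each double round can be written with
    // uint4 swizzles; 'undo_reorder' restores row order before the feed-forward add.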
__private uint4 w[4]; 51 | 52 | w[0] = (B[0] ^= Bx[0]); 53 | w[1] = (B[1] ^= Bx[1]); 54 | w[2] = (B[2] ^= Bx[2]); 55 | w[3] = (B[3] ^= Bx[3]); 56 | 57 | reorder(w); 58 | 59 | /* Rounds 1 + 2 */ 60 | w[0] ^= rotate(w[3] +w[2] , 7U); 61 | w[1] ^= rotate(w[0] +w[3] , 9U); 62 | w[2] ^= rotate(w[1] +w[0] ,13U); 63 | w[3] ^= rotate(w[2] +w[1] ,18U); 64 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 65 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 66 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 67 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 68 | 69 | /* Rounds 3 + 4 */ 70 | w[0] ^= rotate(w[3] +w[2] , 7U); 71 | w[1] ^= rotate(w[0] +w[3] , 9U); 72 | w[2] ^= rotate(w[1] +w[0] ,13U); 73 | w[3] ^= rotate(w[2] +w[1] ,18U); 74 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 75 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 76 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 77 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 78 | 79 | /* Rounds 5 + 6 */ 80 | w[0] ^= rotate(w[3] +w[2] , 7U); 81 | w[1] ^= rotate(w[0] +w[3] , 9U); 82 | w[2] ^= rotate(w[1] +w[0] ,13U); 83 | w[3] ^= rotate(w[2] +w[1] ,18U); 84 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 85 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 86 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 87 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 88 | 89 | /* Rounds 7 + 8 */ 90 | w[0] ^= rotate(w[3] +w[2] , 7U); 91 | w[1] ^= rotate(w[0] +w[3] , 9U); 92 | w[2] ^= rotate(w[1] +w[0] ,13U); 93 | w[3] ^= rotate(w[2] +w[1] ,18U); 94 | w[2] ^= rotate(w[3].wxyz+w[0].zwxy, 7U); 95 | w[1] ^= rotate(w[2].wxyz+w[3].zwxy, 9U); 96 | w[0] ^= rotate(w[1].wxyz+w[2].zwxy,13U); 97 | w[3] ^= rotate(w[0].wxyz+w[1].zwxy,18U); 98 | 99 | undo_reorder(w); 100 | 101 | B[0] += w[0]; 102 | B[1] += w[1]; 103 | B[2] += w[2]; 104 | B[3] += w[3]; 105 | } 106 | 107 | void BlockMix(__private T_Block* B) 108 | { 109 | salsa(&B->buf[60], &B->buf[0 ]); 110 | salsa(&B->buf[0 ], &B->buf[4 ]); 111 | salsa(&B->buf[4 ], &B->buf[8 ]); 112 | salsa(&B->buf[8 ], &B->buf[12]); 113 | salsa(&B->buf[12], &B->buf[16]); 114 | salsa(&B->buf[16], &B->buf[20]); 115 | salsa(&B->buf[20], &B->buf[24]); 116 | salsa(&B->buf[24], &B->buf[28]); 117 | salsa(&B->buf[28], &B->buf[32]); 118 | salsa(&B->buf[32], &B->buf[36]); 119 | salsa(&B->buf[36], &B->buf[40]); 120 | salsa(&B->buf[40], &B->buf[44]); 121 | salsa(&B->buf[44], &B->buf[48]); 122 | salsa(&B->buf[48], &B->buf[52]); 123 | salsa(&B->buf[52], &B->buf[56]); 124 | salsa(&B->buf[56], &B->buf[60]); 125 | 126 | __private T_Block Y = *B; 127 | 128 | copy64(B->buf, 0, Y.buf, 0); 129 | copy64(B->buf, 4, Y.buf, 8); 130 | copy64(B->buf, 8, Y.buf, 16); 131 | copy64(B->buf, 12, Y.buf, 24); 132 | copy64(B->buf, 16, Y.buf, 32); 133 | copy64(B->buf, 20, Y.buf, 40); 134 | copy64(B->buf, 24, Y.buf, 48); 135 | copy64(B->buf, 28, Y.buf, 56); 136 | copy64(B->buf, 32, Y.buf, 4); 137 | copy64(B->buf, 36, Y.buf, 12); 138 | copy64(B->buf, 40, Y.buf, 20); 139 | copy64(B->buf, 44, Y.buf, 28); 140 | copy64(B->buf, 48, Y.buf, 36); 141 | copy64(B->buf, 52, Y.buf, 44); 142 | copy64(B->buf, 56, Y.buf, 52); 143 | copy64(B->buf, 60, Y.buf, 60); 144 | } 145 | 146 | __kernel void ROMix(__global T_Block* Xs, 147 | __global T_Block* Vs, 148 | __global T_Block* outputs 149 | ) 150 | { 151 | __private unsigned int id = get_global_id(0); 152 | __private T_Block X = Xs[id]; 153 | __private int i, j, k, v_idx; 154 | 155 | __private int v_idx_offset = id * iterations; 156 | 157 | for (i = 0, v_idx = v_idx_offset; i < iterations; ++i, ++v_idx) 158 | { 159 | Vs[v_idx] = X; 160 | BlockMix(&X); 161 | } 162 | 163 | for (i = 0; i < iterations; ++i) 
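    // Second ROMix loop: j = Integerify(X) mod N. Since iterations (N = 16384) is a
    // power of two, the '& (iterations - 1)' below is an exact substitute for mod.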
164 | { 165 | j = X.buf[60].x & (iterations - 1); 166 | v_idx = v_idx_offset + j; 167 | for (k = 0; k < 64; ++k) 168 | { 169 | X.buf[k] ^= Vs[v_idx].buf[k]; 170 | } 171 | BlockMix(&X); 172 | } 173 | 174 | __global T_Block* output = &outputs[id]; 175 | for (i = 0; i < 64; ++i) 176 | { 177 | output->buf[i] = X.buf[i]; 178 | } 179 | } -------------------------------------------------------------------------------- /Library/worker/generic/sha1.cl: -------------------------------------------------------------------------------- 1 | /* 2 | SHA1 OpenCL Optimized kernel 3 | (c) B. Kerler 2018 4 | MIT License 5 | */ 6 | 7 | /* 8 | (small) Changes: 9 | outbuf and inbuf structs defined using the buffer_structs_template 10 | func_sha1 renamed to hash_main 11 | hash array trimmed to size 5 12 | */ 13 | 14 | #define rotl32(a,n) rotate ((a), (n)) 15 | 16 | #define mod(x,y) ((x)-((x)/(y)*(y))) 17 | 18 | #define F2(x,y,z) ((x) ^ (y) ^ (z)) 19 | #define F1(x,y,z) (bitselect(z,y,x)) 20 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 21 | 22 | #define SHA1M_A 0x67452301u 23 | #define SHA1M_B 0xefcdab89u 24 | #define SHA1M_C 0x98badcfeu 25 | #define SHA1M_D 0x10325476u 26 | #define SHA1M_E 0xc3d2e1f0u 27 | 28 | #define SHA1C00 0x5a827999u 29 | #define SHA1C01 0x6ed9eba1u 30 | #define SHA1C02 0x8f1bbcdcu 31 | #define SHA1C03 0xca62c1d6u 32 | 33 | #define SHA1_STEP(f,a,b,c,d,e,x) \ 34 | { \ 35 | e += K; \ 36 | e += x; \ 37 | e += f (b, c, d); \ 38 | e += rotl32 (a, 5u); \ 39 | b = rotl32 (b, 30u); \ 40 | } 41 | 42 | static void sha1_process2 (const unsigned int *W, unsigned int *digest) 43 | { 44 | unsigned int A = digest[0]; 45 | unsigned int B = digest[1]; 46 | unsigned int C = digest[2]; 47 | unsigned int D = digest[3]; 48 | unsigned int E = digest[4]; 49 | 50 | unsigned int w0_t = W[0]; 51 | unsigned int w1_t = W[1]; 52 | unsigned int w2_t = W[2]; 53 | unsigned int w3_t = W[3]; 54 | unsigned int w4_t = W[4]; 55 | unsigned int w5_t = W[5]; 56 | unsigned int w6_t = W[6]; 57 | unsigned int w7_t = W[7]; 58 | unsigned int w8_t = W[8]; 59 | unsigned int w9_t = W[9]; 60 | unsigned int wa_t = W[10]; 61 | unsigned int wb_t = W[11]; 62 | unsigned int wc_t = W[12]; 63 | unsigned int wd_t = W[13]; 64 | unsigned int we_t = W[14]; 65 | unsigned int wf_t = W[15]; 66 | 67 | #undef K 68 | #define K SHA1C00 69 | 70 | SHA1_STEP (F1, A, B, C, D, E, w0_t); 71 | SHA1_STEP (F1, E, A, B, C, D, w1_t); 72 | SHA1_STEP (F1, D, E, A, B, C, w2_t); 73 | SHA1_STEP (F1, C, D, E, A, B, w3_t); 74 | SHA1_STEP (F1, B, C, D, E, A, w4_t); 75 | SHA1_STEP (F1, A, B, C, D, E, w5_t); 76 | SHA1_STEP (F1, E, A, B, C, D, w6_t); 77 | SHA1_STEP (F1, D, E, A, B, C, w7_t); 78 | SHA1_STEP (F1, C, D, E, A, B, w8_t); 79 | SHA1_STEP (F1, B, C, D, E, A, w9_t); 80 | SHA1_STEP (F1, A, B, C, D, E, wa_t); 81 | SHA1_STEP (F1, E, A, B, C, D, wb_t); 82 | SHA1_STEP (F1, D, E, A, B, C, wc_t); 83 | SHA1_STEP (F1, C, D, E, A, B, wd_t); 84 | SHA1_STEP (F1, B, C, D, E, A, we_t); 85 | SHA1_STEP (F1, A, B, C, D, E, wf_t); 86 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F1, E, A, B, C, D, w0_t); 87 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F1, D, E, A, B, C, w1_t); 88 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F1, C, D, E, A, B, w2_t); 89 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F1, B, C, D, E, A, w3_t); 90 | 91 | #undef K 92 | #define K SHA1C01 93 | 94 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w4_t); 95 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); 
SHA1_STEP (F2, E, A, B, C, D, w5_t); 96 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w6_t); 97 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w7_t); 98 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w8_t); 99 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w9_t); 100 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wa_t); 101 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wb_t); 102 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wc_t); 103 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wd_t); 104 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, A, B, C, D, E, we_t); 105 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wf_t); 106 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w0_t); 107 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w1_t); 108 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w2_t); 109 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w3_t); 110 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w4_t); 111 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w5_t); 112 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w6_t); 113 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w7_t); 114 | 115 | #undef K 116 | #define K SHA1C02 117 | 118 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w8_t); 119 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w9_t); 120 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wa_t); 121 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wb_t); 122 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F0, B, C, D, E, A, wc_t); 123 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F0, A, B, C, D, E, wd_t); 124 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F0, E, A, B, C, D, we_t); 125 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F0, D, E, A, B, C, wf_t); 126 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w0_t); 127 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w1_t); 128 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w2_t); 129 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w3_t); 130 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w4_t); 131 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F0, C, D, E, A, B, w5_t); 132 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F0, B, C, D, E, A, w6_t); 133 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F0, A, B, C, D, E, w7_t); 134 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F0, E, A, B, C, D, w8_t); 135 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F0, D, E, A, B, C, w9_t); 136 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F0, C, D, E, A, B, wa_t); 137 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP 
(F0, B, C, D, E, A, wb_t); 138 | 139 | #undef K 140 | #define K SHA1C03 141 | 142 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wc_t); 143 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wd_t); 144 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, D, E, A, B, C, we_t); 145 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, C, D, E, A, B, wf_t); 146 | w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w0_t); 147 | w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w1_t); 148 | w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w2_t); 149 | w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w3_t); 150 | w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w4_t); 151 | w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (F2, B, C, D, E, A, w5_t); 152 | w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (F2, A, B, C, D, E, w6_t); 153 | w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (F2, E, A, B, C, D, w7_t); 154 | w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (F2, D, E, A, B, C, w8_t); 155 | w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (F2, C, D, E, A, B, w9_t); 156 | wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wa_t); 157 | wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (F2, A, B, C, D, E, wb_t); 158 | wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (F2, E, A, B, C, D, wc_t); 159 | wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (F2, D, E, A, B, C, wd_t); 160 | we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (F2, C, D, E, A, B, we_t); 161 | wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (F2, B, C, D, E, A, wf_t); 162 | 163 | // Macros don't have scope, so this K was being preserved 164 | #undef K 165 | 166 | digest[0] += A; 167 | digest[1] += B; 168 | digest[2] += C; 169 | digest[3] += D; 170 | digest[4] += E; 171 | } 172 | 173 | #define def_hash(funcName, passTag, hashTag) \ 174 | /* The main hashing function */ \ 175 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 176 | { \ 177 | /* pass is only given to SWAP 178 | and hash is just assigned to p, which is only accessed by p[i] = 179 | => both tags irrelevant! 
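       (def_hash is expanded below for all four __global / __private combinations of
        password source and hash destination.)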
*/ \ 180 | \ 181 | int plen=pass_len/4; \ 182 | if (mod(pass_len,4)) plen++; \ 183 | \ 184 | hashTag unsigned int* p = hash; \ 185 | \ 186 | unsigned int W[0x10]={0}; \ 187 | int loops=plen; \ 188 | int curloop=0; \ 189 | unsigned int State[5]={0}; \ 190 | State[0] = 0x67452301; \ 191 | State[1] = 0xefcdab89; \ 192 | State[2] = 0x98badcfe; \ 193 | State[3] = 0x10325476; \ 194 | State[4] = 0xc3d2e1f0; \ 195 | \ 196 | \ 197 | while (loops>0) \ 198 | { \ 199 | W[0x0]=0x0; \ 200 | W[0x1]=0x0; \ 201 | W[0x2]=0x0; \ 202 | W[0x3]=0x0; \ 203 | W[0x4]=0x0; \ 204 | W[0x5]=0x0; \ 205 | W[0x6]=0x0; \ 206 | W[0x7]=0x0; \ 207 | W[0x8]=0x0; \ 208 | W[0x9]=0x0; \ 209 | W[0xA]=0x0; \ 210 | W[0xB]=0x0; \ 211 | W[0xC]=0x0; \ 212 | W[0xD]=0x0; \ 213 | W[0xE]=0x0; \ 214 | W[0xF]=0x0; \ 215 | \ 216 | for (int m=0;loops!=0 && m<16;m++) \ 217 | { \ 218 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 219 | loops--; \ 220 | } \ 221 | \ 222 | if (loops==0 && mod(pass_len,64)!=0) \ 223 | { \ 224 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 225 | int v=mod(pass_len,64); \ 226 | W[v/4]|=SWAP(padding); \ 227 | if ((pass_len&0x3B)!=0x3B) \ 228 | { \ 229 | /* Let's add length */ \ 230 | W[0x0F]=pass_len*8; \ 231 | } \ 232 | } \ 233 | \ 234 | sha1_process2(W,State); \ 235 | curloop++; \ 236 | } \ 237 | \ 238 | if (mod(plen,16)==0) \ 239 | { \ 240 | W[0x0]=0x0; \ 241 | W[0x1]=0x0; \ 242 | W[0x2]=0x0; \ 243 | W[0x3]=0x0; \ 244 | W[0x4]=0x0; \ 245 | W[0x5]=0x0; \ 246 | W[0x6]=0x0; \ 247 | W[0x7]=0x0; \ 248 | W[0x8]=0x0; \ 249 | W[0x9]=0x0; \ 250 | W[0xA]=0x0; \ 251 | W[0xB]=0x0; \ 252 | W[0xC]=0x0; \ 253 | W[0xD]=0x0; \ 254 | W[0xE]=0x0; \ 255 | W[0xF]=0x0; \ 256 | if ((pass_len&0x3B)!=0x3B) \ 257 | { \ 258 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 259 | W[0]|=SWAP(padding); \ 260 | } \ 261 | /* Let's add length */ \ 262 | W[0x0F]=pass_len*8; \ 263 | \ 264 | sha1_process2(W,State); \ 265 | } \ 266 | \ 267 | p[0]=SWAP(State[0]); \ 268 | p[1]=SWAP(State[1]); \ 269 | p[2]=SWAP(State[2]); \ 270 | p[3]=SWAP(State[3]); \ 271 | p[4]=SWAP(State[4]); \ 272 | return; \ 273 | } 274 | 275 | def_hash(hash_global, __global, __global) 276 | def_hash(hash_private, __private, __private) 277 | def_hash(hash_glbl_to_priv, __global, __private) 278 | def_hash(hash_priv_to_glbl, __private, __global) 279 | 280 | #undef mod 281 | 282 | #undef rotl32 283 | #undef F0 284 | #undef F1 285 | #undef F2 286 | 287 | __kernel void hash_main(__global const inbuf * inbuffer, __global outbuf * outbuffer) 288 | { 289 | unsigned int idx = get_global_id(0); 290 | 291 | // unsigned int hash[20/4]={0}; 292 | 293 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 294 | 295 | /* outbuffer[idx].buffer[0]=hash[0]; 296 | outbuffer[idx].buffer[1]=hash[1]; 297 | outbuffer[idx].buffer[2]=hash[2]; 298 | outbuffer[idx].buffer[3]=hash[3]; 299 | outbuffer[idx].buffer[4]=hash[4]; */ 300 | } 301 | -------------------------------------------------------------------------------- /Library/worker/generic/sha256.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Original: 3 | SHA1 OpenCL Optimized kernel 4 | (c) B. Kerler 2018 5 | MIT License 6 | */ 7 | 8 | /* 9 | (small) Changes: 10 | outbuf and inbuf structs defined using the buffer_structs_template 11 | func_sha256 renamed to hash_main 12 | */ 13 | 14 | /* 15 | Modified: hash_main function works for any length inputs. 
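    In particular it adds the 'slidePadding' path below: when pass_len mod 64 >= 56
    the message length field no longer fits in the final data block, so an extra
    padding-only block is processed.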
16 | */ 17 | 18 | #define F1(x,y,z) (bitselect(z,y,x)) 19 | #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 20 | #define mod(x,y) ((x)-((x)/(y)*(y))) 21 | #define shr32(x,n) ((x) >> (n)) 22 | #define rotl32(a,n) rotate ((a), (n)) 23 | 24 | #define S0(x) (rotl32 ((x), 25u) ^ rotl32 ((x), 14u) ^ shr32 ((x), 3u)) 25 | #define S1(x) (rotl32 ((x), 15u) ^ rotl32 ((x), 13u) ^ shr32 ((x), 10u)) 26 | #define S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) 27 | #define S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u)) 28 | 29 | #define SHA256C00 0x428a2f98u 30 | #define SHA256C01 0x71374491u 31 | #define SHA256C02 0xb5c0fbcfu 32 | #define SHA256C03 0xe9b5dba5u 33 | #define SHA256C04 0x3956c25bu 34 | #define SHA256C05 0x59f111f1u 35 | #define SHA256C06 0x923f82a4u 36 | #define SHA256C07 0xab1c5ed5u 37 | #define SHA256C08 0xd807aa98u 38 | #define SHA256C09 0x12835b01u 39 | #define SHA256C0a 0x243185beu 40 | #define SHA256C0b 0x550c7dc3u 41 | #define SHA256C0c 0x72be5d74u 42 | #define SHA256C0d 0x80deb1feu 43 | #define SHA256C0e 0x9bdc06a7u 44 | #define SHA256C0f 0xc19bf174u 45 | #define SHA256C10 0xe49b69c1u 46 | #define SHA256C11 0xefbe4786u 47 | #define SHA256C12 0x0fc19dc6u 48 | #define SHA256C13 0x240ca1ccu 49 | #define SHA256C14 0x2de92c6fu 50 | #define SHA256C15 0x4a7484aau 51 | #define SHA256C16 0x5cb0a9dcu 52 | #define SHA256C17 0x76f988dau 53 | #define SHA256C18 0x983e5152u 54 | #define SHA256C19 0xa831c66du 55 | #define SHA256C1a 0xb00327c8u 56 | #define SHA256C1b 0xbf597fc7u 57 | #define SHA256C1c 0xc6e00bf3u 58 | #define SHA256C1d 0xd5a79147u 59 | #define SHA256C1e 0x06ca6351u 60 | #define SHA256C1f 0x14292967u 61 | #define SHA256C20 0x27b70a85u 62 | #define SHA256C21 0x2e1b2138u 63 | #define SHA256C22 0x4d2c6dfcu 64 | #define SHA256C23 0x53380d13u 65 | #define SHA256C24 0x650a7354u 66 | #define SHA256C25 0x766a0abbu 67 | #define SHA256C26 0x81c2c92eu 68 | #define SHA256C27 0x92722c85u 69 | #define SHA256C28 0xa2bfe8a1u 70 | #define SHA256C29 0xa81a664bu 71 | #define SHA256C2a 0xc24b8b70u 72 | #define SHA256C2b 0xc76c51a3u 73 | #define SHA256C2c 0xd192e819u 74 | #define SHA256C2d 0xd6990624u 75 | #define SHA256C2e 0xf40e3585u 76 | #define SHA256C2f 0x106aa070u 77 | #define SHA256C30 0x19a4c116u 78 | #define SHA256C31 0x1e376c08u 79 | #define SHA256C32 0x2748774cu 80 | #define SHA256C33 0x34b0bcb5u 81 | #define SHA256C34 0x391c0cb3u 82 | #define SHA256C35 0x4ed8aa4au 83 | #define SHA256C36 0x5b9cca4fu 84 | #define SHA256C37 0x682e6ff3u 85 | #define SHA256C38 0x748f82eeu 86 | #define SHA256C39 0x78a5636fu 87 | #define SHA256C3a 0x84c87814u 88 | #define SHA256C3b 0x8cc70208u 89 | #define SHA256C3c 0x90befffau 90 | #define SHA256C3d 0xa4506cebu 91 | #define SHA256C3e 0xbef9a3f7u 92 | #define SHA256C3f 0xc67178f2u 93 | 94 | __constant uint k_sha256[64] = 95 | { 96 | SHA256C00, SHA256C01, SHA256C02, SHA256C03, 97 | SHA256C04, SHA256C05, SHA256C06, SHA256C07, 98 | SHA256C08, SHA256C09, SHA256C0a, SHA256C0b, 99 | SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f, 100 | SHA256C10, SHA256C11, SHA256C12, SHA256C13, 101 | SHA256C14, SHA256C15, SHA256C16, SHA256C17, 102 | SHA256C18, SHA256C19, SHA256C1a, SHA256C1b, 103 | SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f, 104 | SHA256C20, SHA256C21, SHA256C22, SHA256C23, 105 | SHA256C24, SHA256C25, SHA256C26, SHA256C27, 106 | SHA256C28, SHA256C29, SHA256C2a, SHA256C2b, 107 | SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f, 108 | SHA256C30, SHA256C31, SHA256C32, SHA256C33, 109 | SHA256C34, SHA256C35, SHA256C36, SHA256C37, 110 | SHA256C38, 
SHA256C39, SHA256C3a, SHA256C3b, 111 | SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, 112 | }; 113 | 114 | #define SHA256_STEP(F0a,F1a,a,b,c,d,e,f,g,h,x,K) \ 115 | { \ 116 | h += K; \ 117 | h += x; \ 118 | h += S3 (e); \ 119 | h += F1a (e,f,g); \ 120 | d += h; \ 121 | h += S2 (a); \ 122 | h += F0a (a,b,c); \ 123 | } 124 | 125 | #define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) 126 | 127 | static void sha256_process2 (const unsigned int *W, unsigned int *digest) 128 | { 129 | unsigned int a = digest[0]; 130 | unsigned int b = digest[1]; 131 | unsigned int c = digest[2]; 132 | unsigned int d = digest[3]; 133 | unsigned int e = digest[4]; 134 | unsigned int f = digest[5]; 135 | unsigned int g = digest[6]; 136 | unsigned int h = digest[7]; 137 | 138 | unsigned int w0_t = W[0]; 139 | unsigned int w1_t = W[1]; 140 | unsigned int w2_t = W[2]; 141 | unsigned int w3_t = W[3]; 142 | unsigned int w4_t = W[4]; 143 | unsigned int w5_t = W[5]; 144 | unsigned int w6_t = W[6]; 145 | unsigned int w7_t = W[7]; 146 | unsigned int w8_t = W[8]; 147 | unsigned int w9_t = W[9]; 148 | unsigned int wa_t = W[10]; 149 | unsigned int wb_t = W[11]; 150 | unsigned int wc_t = W[12]; 151 | unsigned int wd_t = W[13]; 152 | unsigned int we_t = W[14]; 153 | unsigned int wf_t = W[15]; 154 | 155 | #define ROUND_EXPAND(i) \ 156 | { \ 157 | w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ 158 | w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ 159 | w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ 160 | w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ 161 | w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ 162 | w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ 163 | w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ 164 | w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ 165 | w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ 166 | w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ 167 | wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ 168 | wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ 169 | wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ 170 | wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ 171 | we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ 172 | wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ 173 | } 174 | 175 | #define ROUND_STEP(i) \ 176 | { \ 177 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ 178 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ 179 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ 180 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ 181 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ 182 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ 183 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ 184 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ 185 | SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ 186 | SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ 187 | SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ 188 | SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ 189 | SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ 190 | SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ 191 | SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ 192 | SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ 193 | } 194 | 195 | ROUND_STEP (0); 196 | 197 | 
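    // Rounds 16..63: expand the message schedule 16 words at a time, then run the
    // next 16 compression steps.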
ROUND_EXPAND(); 198 | ROUND_STEP(16); 199 | 200 | ROUND_EXPAND(); 201 | ROUND_STEP(32); 202 | 203 | ROUND_EXPAND(); 204 | ROUND_STEP(48); 205 | 206 | digest[0] += a; 207 | digest[1] += b; 208 | digest[2] += c; 209 | digest[3] += d; 210 | digest[4] += e; 211 | digest[5] += f; 212 | digest[6] += g; 213 | digest[7] += h; 214 | } 215 | 216 | #define def_hash(funcName, passTag, hashTag) \ 217 | /* The main hashing function */ \ 218 | static void funcName(passTag const unsigned int *pass, int pass_len, hashTag unsigned int* hash) \ 219 | { \ 220 | int plen=pass_len/4; \ 221 | if (mod(pass_len,4)) plen++; \ 222 | \ 223 | unsigned int slidePadding=0; \ 224 | if (mod(pass_len,64)>=56) slidePadding=1; \ 225 | \ 226 | hashTag unsigned int* p = hash; \ 227 | \ 228 | unsigned int W[0x10]={0}; \ 229 | int loops=plen; \ 230 | int curloop=0; \ 231 | unsigned int State[8]={0}; \ 232 | State[0] = 0x6a09e667; \ 233 | State[1] = 0xbb67ae85; \ 234 | State[2] = 0x3c6ef372; \ 235 | State[3] = 0xa54ff53a; \ 236 | State[4] = 0x510e527f; \ 237 | State[5] = 0x9b05688c; \ 238 | State[6] = 0x1f83d9ab; \ 239 | State[7] = 0x5be0cd19; \ 240 | \ 241 | while (loops>0) \ 242 | { \ 243 | W[0x0]=0x0; \ 244 | W[0x1]=0x0; \ 245 | W[0x2]=0x0; \ 246 | W[0x3]=0x0; \ 247 | W[0x4]=0x0; \ 248 | W[0x5]=0x0; \ 249 | W[0x6]=0x0; \ 250 | W[0x7]=0x0; \ 251 | W[0x8]=0x0; \ 252 | W[0x9]=0x0; \ 253 | W[0xA]=0x0; \ 254 | W[0xB]=0x0; \ 255 | W[0xC]=0x0; \ 256 | W[0xD]=0x0; \ 257 | W[0xE]=0x0; \ 258 | W[0xF]=0x0; \ 259 | \ 260 | for (int m=0;loops!=0 && m<16;m++) \ 261 | { \ 262 | W[m]^=SWAP(pass[m+(curloop*16)]); \ 263 | loops--; \ 264 | } \ 265 | \ 266 | if (loops==0 && mod(pass_len,64)!=0) \ 267 | { \ 268 | unsigned int padding=0x80<<(((pass_len+4)-((pass_len+4)/4*4))*8); \ 269 | int v=mod(pass_len,64); \ 270 | W[v/4]|=SWAP(padding); \ 271 | if (slidePadding==0) \ 272 | { \ 273 | /* Let's add length */ \ 274 | W[0x0F]=pass_len*8; \ 275 | } \ 276 | } \ 277 | \ 278 | sha256_process2(W,State); \ 279 | curloop++; \ 280 | } \ 281 | \ 282 | if (slidePadding!=0) { \ 283 | W[0x0]=0x0; \ 284 | W[0x1]=0x0; \ 285 | W[0x2]=0x0; \ 286 | W[0x3]=0x0; \ 287 | W[0x4]=0x0; \ 288 | W[0x5]=0x0; \ 289 | W[0x6]=0x0; \ 290 | W[0x7]=0x0; \ 291 | W[0x8]=0x0; \ 292 | W[0x9]=0x0; \ 293 | W[0xA]=0x0; \ 294 | W[0xB]=0x0; \ 295 | W[0xC]=0x0; \ 296 | W[0xD]=0x0; \ 297 | W[0xE]=0x0; \ 298 | W[0x0F]=pass_len*8; \ 299 | \ 300 | sha256_process2(W,State); \ 301 | } else { \ 302 | if (mod(plen,16)==0) \ 303 | { \ 304 | W[0x0]=0x80000000; \ 305 | W[0x1]=0x0; \ 306 | W[0x2]=0x0; \ 307 | W[0x3]=0x0; \ 308 | W[0x4]=0x0; \ 309 | W[0x5]=0x0; \ 310 | W[0x6]=0x0; \ 311 | W[0x7]=0x0; \ 312 | W[0x8]=0x0; \ 313 | W[0x9]=0x0; \ 314 | W[0xA]=0x0; \ 315 | W[0xB]=0x0; \ 316 | W[0xC]=0x0; \ 317 | W[0xD]=0x0; \ 318 | W[0xE]=0x0; \ 319 | W[0x0F]=pass_len*8; \ 320 | \ 321 | sha256_process2(W,State); \ 322 | } \ 323 | } \ 324 | \ 325 | p[0]=SWAP(State[0]); \ 326 | p[1]=SWAP(State[1]); \ 327 | p[2]=SWAP(State[2]); \ 328 | p[3]=SWAP(State[3]); \ 329 | p[4]=SWAP(State[4]); \ 330 | p[5]=SWAP(State[5]); \ 331 | p[6]=SWAP(State[6]); \ 332 | p[7]=SWAP(State[7]); \ 333 | return; \ 334 | } 335 | 336 | def_hash(hash_global, __global, __global) 337 | def_hash(hash_private, __private, __private) 338 | def_hash(hash_glbl_to_priv, __global, __private) 339 | def_hash(hash_priv_to_glbl, __private, __global) 340 | 341 | #undef F0 342 | #undef F1 343 | #undef S0 344 | #undef S1 345 | #undef S2 346 | #undef S3 347 | 348 | #undef mod 349 | #undef shr32 350 | #undef rotl32 351 | 352 | __kernel void hash_main(__global 
const inbuf * inbuffer, __global outbuf * outbuffer) 353 | { 354 | unsigned int idx = get_global_id(0); 355 | // unsigned int hash[32/4]={0}; 356 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 357 | /* outbuffer[idx].buffer[0]=hash[0]; 358 | outbuffer[idx].buffer[1]=hash[1]; 359 | outbuffer[idx].buffer[2]=hash[2]; 360 | outbuffer[idx].buffer[3]=hash[3]; 361 | outbuffer[idx].buffer[4]=hash[4]; 362 | outbuffer[idx].buffer[5]=hash[5]; 363 | outbuffer[idx].buffer[6]=hash[6]; 364 | outbuffer[idx].buffer[7]=hash[7]; */ 365 | } 366 | -------------------------------------------------------------------------------- /Library/worker/generic/sha512.cl: -------------------------------------------------------------------------------- 1 | /* 2 | Original copyright (sha256): 3 | OpenCL Optimized kernel 4 | (c) B. Kerler 2018 5 | MIT License 6 | 7 | Adapted for SHA512 by C.B .. apparently quite a while ago 8 | The moral of the story is always use UL on unsigned longs! 9 | */ 10 | 11 | 12 | 13 | // bitselect is "if c then b else a" for each bit 14 | // so equivalent to (c & b) | ((~c) & a) 15 | #define choose(x,y,z) (bitselect(z,y,x)) 16 | // Cleverly determines majority vote, conditioning on x=z 17 | #define bit_maj(x,y,z) (bitselect (x, y, ((x) ^ (z)))) 18 | 19 | // Hopefully rotate works for long too? 20 | 21 | 22 | 23 | 24 | // ============================================================================== 25 | // ========= S0,S1,s0,s1 ====================================================== 26 | 27 | 28 | #define S0(x) (rotr64(x,28ul) ^ rotr64(x,34ul) ^ rotr64(x,39ul)) 29 | #define S1(x) (rotr64(x,14ul) ^ rotr64(x,18ul) ^ rotr64(x,41ul)) 30 | 31 | #define little_s0(x) (rotr64(x,1ul) ^ rotr64(x,8ul) ^ ((x) >> 7ul)) 32 | #define little_s1(x) (rotr64(x,19ul) ^ rotr64(x,61ul) ^ ((x) >> 6ul)) 33 | 34 | 35 | // ============================================================================== 36 | // ========= MD-pads the input, taken from md5.cl ============================= 37 | // Adapted for unsigned longs 38 | // Note that the padding is still in a distinct unsigned long to the appended length. 39 | 40 | 41 | // 'highBit' macro is (i+1) bytes, all 0 but the last which is 0x80 42 | // where we are thinking Little-endian thoughts. 43 | // Don't forget to call constants longs!! 44 | #define highBit(i) (0x1UL << (8*i + 7)) 45 | #define fBytes(i) (0xFFFFFFFFFFFFFFFFUL >> (8 * (8-i))) 46 | __constant unsigned long padLong[8] = { 47 | highBit(0), highBit(1), highBit(2), highBit(3), 48 | highBit(4), highBit(5), highBit(6), highBit(7) 49 | }; 50 | __constant unsigned long maskLong[8] = { 51 | 0, fBytes(1), fBytes(2), fBytes(3), // strange behaviour for fBytes(0) 52 | fBytes(4), fBytes(5), fBytes(6), fBytes(7) 53 | }; 54 | 55 | #define bs_long hashBlockSize_long64 56 | #define def_md_pad_128(funcName, tag) \ 57 | /* The standard padding, INPLACE, 58 | add a 1 bit, then little-endian original length mod 2^128 (not 64) at the end of a block 59 | RETURN number of blocks */ \ 60 | static int funcName(tag unsigned long *msg, const long msgLen_bytes) \ 61 | { \ 62 | /* Appends the 1 bit to the end, and 0s to the end of the byte */ \ 63 | const unsigned int padLongIndex = ((unsigned int)msgLen_bytes) / 8; \ 64 | const unsigned int overhang = (((unsigned int)msgLen_bytes) - padLongIndex*8); \ 65 | /* Don't assume that there are zeros here! 
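       (The caller's buffer may hold stale data past msgLen_bytes, so the tail long is
        masked down to its valid bytes before the 0x80 pad byte is OR'd in.)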
*/ \ 66 | msg[padLongIndex] &= maskLong[overhang]; \ 67 | msg[padLongIndex] |= padLong[overhang]; \ 68 | \ 69 | /* Previous code was horrible 70 | Now we zero until we reach a multiple of the block size, 71 | Skipping TWO longs to ensure there is room for the length */ \ 72 | msg[padLongIndex + 1] = 0; \ 73 | msg[padLongIndex + 2] = 0; \ 74 | unsigned int i = 0; \ 75 | for (i = padLongIndex + 3; i % bs_long != 0; i++) \ 76 | { \ 77 | msg[i] = 0; \ 78 | } \ 79 | \ 80 | /* Determine the total number of blocks */ \ 81 | int nBlocks = i / bs_long; \ 82 | /* Add the bit length to the end, 128-bit, big endian? (source wikipedia) 83 | Seemingly this does require SWAPing, so perhaps it's little-endian? */ \ 84 | msg[i-2] = 0; /* For clarity */ \ 85 | msg[i-1] = SWAP(msgLen_bytes*8); \ 86 | \ 87 | return nBlocks; \ 88 | }; 89 | 90 | // Define it with the various tags to cheer OpenCL up 91 | def_md_pad_128(md_pad__global, __global) 92 | def_md_pad_128(md_pad__private, __private) 93 | 94 | #undef bs_long 95 | #undef def_md_pad_128 96 | #undef highBit 97 | #undef fBytes 98 | 99 | 100 | 101 | 102 | // ============================================================================== 103 | 104 | __constant unsigned long k_sha256[80] = 105 | { 106 | 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, 0x3956c25bf348b538UL, 107 | 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, 0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 108 | 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL, 0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 109 | 0xc19bf174cf692694UL, 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, 110 | 0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL, 0x983e5152ee66dfabUL, 111 | 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL, 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 112 | 0x06ca6351e003826fUL, 0x142929670a0e6e70UL, 0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 113 | 0x53380d139d95b3dfUL, 0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL, 114 | 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, 0xd192e819d6ef5218UL, 115 | 0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL, 0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 116 | 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL, 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 117 | 0x682e6ff3d6b2b8a3UL, 0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL, 118 | 0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL, 0xca273eceea26619cUL, 119 | 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, 0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 120 | 0x113f9804bef90daeUL, 0x1b710b35131c471bUL, 0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 121 | 0x431d67c49c100d4cUL, 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL 122 | }; 123 | 124 | 125 | #define SHA512_STEP(a,b,c,d,e,f,g,h,x,K) \ 126 | /**/ \ 127 | { \ 128 | h += K + S1(e) + choose(e,f,g) + x; /* h = temp1 */ \ 129 | d += h; \ 130 | h += S0(a) + bit_maj(a,b,c); \ 131 | } 132 | 133 | 134 | static void printAll(unsigned long a, unsigned long b, unsigned long c, unsigned long d, 135 | unsigned long e, unsigned long f, unsigned long g, unsigned long h) 136 | { 137 | printf("a = %lX\n", a); 138 | printf("b = 
%lX\n", b); 139 | printf("c = %lX\n", c); 140 | printf("d = %lX\n", d); 141 | printf("e = %lX\n", e); 142 | printf("f = %lX\n", f); 143 | printf("g = %lX\n", g); 144 | printf("h = %lX\n\n", h); 145 | } 146 | 147 | #define ROUND_STEP(i) \ 148 | /**/ \ 149 | { \ 150 | SHA512_STEP(a, b, c, d, e, f, g, h, W[i + 0], k_sha256[i + 0]); \ 151 | SHA512_STEP(h, a, b, c, d, e, f, g, W[i + 1], k_sha256[i + 1]); \ 152 | SHA512_STEP(g, h, a, b, c, d, e, f, W[i + 2], k_sha256[i + 2]); \ 153 | SHA512_STEP(f, g, h, a, b, c, d, e, W[i + 3], k_sha256[i + 3]); \ 154 | SHA512_STEP(e, f, g, h, a, b, c, d, W[i + 4], k_sha256[i + 4]); \ 155 | SHA512_STEP(d, e, f, g, h, a, b, c, W[i + 5], k_sha256[i + 5]); \ 156 | SHA512_STEP(c, d, e, f, g, h, a, b, W[i + 6], k_sha256[i + 6]); \ 157 | SHA512_STEP(b, c, d, e, f, g, h, a, W[i + 7], k_sha256[i + 7]); \ 158 | SHA512_STEP(a, b, c, d, e, f, g, h, W[i + 8], k_sha256[i + 8]); \ 159 | SHA512_STEP(h, a, b, c, d, e, f, g, W[i + 9], k_sha256[i + 9]); \ 160 | SHA512_STEP(g, h, a, b, c, d, e, f, W[i + 10], k_sha256[i + 10]); \ 161 | SHA512_STEP(f, g, h, a, b, c, d, e, W[i + 11], k_sha256[i + 11]); \ 162 | SHA512_STEP(e, f, g, h, a, b, c, d, W[i + 12], k_sha256[i + 12]); \ 163 | SHA512_STEP(d, e, f, g, h, a, b, c, W[i + 13], k_sha256[i + 13]); \ 164 | SHA512_STEP(c, d, e, f, g, h, a, b, W[i + 14], k_sha256[i + 14]); \ 165 | SHA512_STEP(b, c, d, e, f, g, h, a, W[i + 15], k_sha256[i + 15]); \ 166 | } 167 | 168 | 169 | #define def_hash(funcName, inputTag, hashTag, mdPadFunc, printFromLongFunc) \ 170 | /* The main hashing function */ \ 171 | static void funcName(inputTag unsigned long *input, const unsigned int length, hashTag unsigned long* hash) \ 172 | { \ 173 | /* Do the padding - we weren't previously for some reason */ \ 174 | const unsigned int nBlocks = mdPadFunc(input, (const unsigned long) length); \ 175 | /*if (length == 8){ \ 176 | printf("Padded input: "); \ 177 | printFromLongFunc(input, hashBlockSize_bytes, true); \ 178 | }*/ \ 179 | \ 180 | unsigned long W[0x50]={0}; \ 181 | /* state which is repeatedly processed & added to */ \ 182 | unsigned long State[8]={0}; \ 183 | State[0] = 0x6a09e667f3bcc908UL; \ 184 | State[1] = 0xbb67ae8584caa73bUL; \ 185 | State[2] = 0x3c6ef372fe94f82bUL; \ 186 | State[3] = 0xa54ff53a5f1d36f1UL; \ 187 | State[4] = 0x510e527fade682d1UL; \ 188 | State[5] = 0x9b05688c2b3e6c1fUL; \ 189 | State[6] = 0x1f83d9abfb41bd6bUL; \ 190 | State[7] = 0x5be0cd19137e2179UL; \ 191 | \ 192 | unsigned long a,b,c,d,e,f,g,h; \ 193 | \ 194 | /* loop for each block */ \ 195 | for (int block_i = 0; block_i < nBlocks; block_i++) \ 196 | { \ 197 | /* No need to (re-)initialise W. 
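       (Only W[0..15] are overwritten per block; W[16..79] are recomputed by the
        expansion loop further down.)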
198 | Note that the input pointer is updated */ \ 199 | W[0] = SWAP(input[0]); \ 200 | W[1] = SWAP(input[1]); \ 201 | W[2] = SWAP(input[2]); \ 202 | W[3] = SWAP(input[3]); \ 203 | W[4] = SWAP(input[4]); \ 204 | W[5] = SWAP(input[5]); \ 205 | W[6] = SWAP(input[6]); \ 206 | W[7] = SWAP(input[7]); \ 207 | W[8] = SWAP(input[8]); \ 208 | W[9] = SWAP(input[9]); \ 209 | W[10] = SWAP(input[10]); \ 210 | W[11] = SWAP(input[11]); \ 211 | W[12] = SWAP(input[12]); \ 212 | W[13] = SWAP(input[13]); \ 213 | W[14] = SWAP(input[14]); \ 214 | W[15] = SWAP(input[15]); \ 215 | \ 216 | for (int i = 16; i < 80; i++) \ 217 | { \ 218 | W[i] = W[i-16] + little_s0(W[i-15]) + W[i-7] + little_s1(W[i-2]); \ 219 | } \ 220 | \ 221 | a = State[0]; \ 222 | b = State[1]; \ 223 | c = State[2]; \ 224 | d = State[3]; \ 225 | e = State[4]; \ 226 | f = State[5]; \ 227 | g = State[6]; \ 228 | h = State[7]; \ 229 | \ 230 | /* Note loop is only 5 */ \ 231 | for (int i = 0; i < 80; i += 16) \ 232 | { \ 233 | ROUND_STEP(i) \ 234 | } \ 235 | \ 236 | State[0] += a; \ 237 | State[1] += b; \ 238 | State[2] += c; \ 239 | State[3] += d; \ 240 | State[4] += e; \ 241 | State[5] += f; \ 242 | State[6] += g; \ 243 | State[7] += h; \ 244 | \ 245 | input += hashBlockSize_long64; \ 246 | } \ 247 | \ 248 | hash[0]=SWAP(State[0]); \ 249 | hash[1]=SWAP(State[1]); \ 250 | hash[2]=SWAP(State[2]); \ 251 | hash[3]=SWAP(State[3]); \ 252 | hash[4]=SWAP(State[4]); \ 253 | hash[5]=SWAP(State[5]); \ 254 | hash[6]=SWAP(State[6]); \ 255 | hash[7]=SWAP(State[7]); \ 256 | return; \ 257 | } 258 | 259 | def_hash(hash_global, __global, __global, md_pad__global, printFromLong_glbl_n) 260 | def_hash(hash_private, __private, __private, md_pad__private, printFromLong_n) 261 | def_hash(hash_glbl_to_priv, __global, __private, md_pad__global, printFromLong_glbl_n) 262 | def_hash(hash_priv_to_glbl, __private, __global, md_pad__private, printFromLong_n) 263 | 264 | #undef bit_maj 265 | #undef choose 266 | #undef S0 267 | #undef S1 268 | #undef little_s0 269 | #undef little_s1 270 | 271 | __kernel void hash_main(__global inbuf * inbuffer, __global outbuf * outbuffer) 272 | { 273 | unsigned int idx = get_global_id(0); 274 | hash_global(inbuffer[idx].buffer, inbuffer[idx].length, outbuffer[idx].buffer); 275 | } 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MD5,SHA1,SHA256,HMAC,PBKDF2,SCrypt Bruteforcing tools using OpenCL (GPU, yay!) and Python 2 | (c) B. Kerler and C.B. 2017-2019 3 | 4 | Why 5 | === 6 | - Because bruteforcing PBKDF2/HMAC/SCrypt and hashing MD5/SHA1/SHA256/SHA512 using just CPU sucks. 
7 | - Because Python itself is very slow for bruteforcing 8 | - Because we'd like to bruteforce using Python and not rely on other 9 | tools like Hashcat (sorry Atom :D) and do not want to compile c++ first 10 | 11 | Installation 12 | ============= 13 | - Get python >= 3.7 64-Bit 14 | 15 | Windows: 16 | - Download pyopencl-2018.2.1+cl12-cp37-cp37m-win_amd64.whl from 17 | [Here] (http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl) or use from Installer directory 18 | - Download and install the Win32 OpenCL driver (from Intel) from 19 | [Here] (http://registrationcenter-download.intel.com/akdlm/irc_nas/12512/opencl_runtime_16.1.2_x64_setup.msi) 20 | - Install pyOpenCL using: python -m pip install pyopencl-2018.2.1+cl12-cp37-cp37m-win_amd64.whl 21 | - Install scrypt using: python -m pip install scrypt 22 | 23 | Linux: 24 | ``` 25 | sudo pip3 install numpy pybind11 pycryptodome 26 | sudo apt install libssl-dev libssl 27 | sudo ldconfig 28 | sudo pip3 install scrypt 29 | sudo apt install opencl-dev && sudo pip3 install pyopencl 30 | wget http://registrationcenter-download.intel.com/akdlm/irc_nas/12556/opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz 31 | tar xzvf opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz 32 | cd opencl_runtime_16.1.2_x64_rh_6.4.0.37 33 | ./install_gui.sh 34 | ``` 35 | 36 | Run 37 | === 38 | - To test if Library works correctly, run: 39 | "python test.py" -> to print info 40 | "python test.py 0" -> to run on first platform 41 | - See test.py for example implementation, Library is in Library folder 42 | 43 | Issues 44 | ====== 45 | - Tested with : Intel CPU and GPU, NVIDIA GTX 1080 Ti, AMD 970 (HMAC fails on AMD right now) 46 | 47 | 48 | Published under MIT license 49 | Additional license limitations: No use in commercial products without prior permit. 50 | 51 | Enjoy ! 52 | -------------------------------------------------------------------------------- /examples/bruteforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # (c) 2021 B. Kerler 4 | # MIT License 5 | 6 | import threading 7 | import sys 8 | import hashlib 9 | import argparse 10 | import queue 11 | from time import perf_counter 12 | from binascii import hexlify 13 | from Library import opencl 14 | from Library.passwordutils import passwordutils 15 | 16 | 17 | def verify_set(wordlist, key, salt, hash_val): 18 | for pwd in wordlist: 19 | pw = hashlib.pbkdf2_hmac('SHA256', password=pwd, salt=salt, iterations=10000, dklen=32) 20 | if hash_val in hashlib.sha1(pw).hexdigest()[:8]: 21 | print(f'[+] correct password: {pwd}', flush=True) 22 | return pwd 23 | return b"" 24 | 25 | 26 | def setup_args(): 27 | parser = argparse.ArgumentParser(description='PW Bruteforce-Tool V1.0 (c) B. 
Kerler') 28 | parser.add_argument("-p", "--platform", required=False, help='OpenCL platform id.') 29 | parser.add_argument("-b", "--batch_size", required=False, help='Define batch_size/workgroupsize if necessary.') 30 | parser.add_argument("-m", "--minlen", required=False, help='Define PW minimum length.') 31 | parser.add_argument("-x", "--maxlen", required=False, help='Define PW maximum length.') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | class brute: 37 | def __init__(self): 38 | self.totalthreads = None 39 | self.stop = False 40 | self.passwords=queue.Queue() 41 | self.flag = None 42 | self.key = None 43 | self.salt = None 44 | self.hash_val = None 45 | self.computeunits = None 46 | self.accel = None 47 | self.totalthreads = None 48 | self.iterations = None 49 | self.args = setup_args() 50 | if self.args.batch_size is not None: 51 | self.totalthreads = self.args.batchsize 52 | if self.args.minlen is not None: 53 | self.minlen = self.args.minlen 54 | else: 55 | self.minlen = 8 56 | 57 | if self.args.maxlen is not None: 58 | self.maxlen = self.args.maxlen 59 | else: 60 | self.maxlen = 16 61 | 62 | self.debug = 0 63 | if self.args.platform is not None: 64 | self.platform = self.args.platform 65 | else: 66 | self.platform = 0 67 | 68 | self.opencl_algo = opencl.opencl_algos(self.platform, self.debug, write_combined_file=False, 69 | inv_memory_density=1) 70 | 71 | def verifypws(self): 72 | pwcount = 0 73 | start_time = perf_counter() 74 | while not self.passwords.empty(): 75 | pwlist = [] 76 | for i in range(0, self.totalthreads): 77 | if not self.stop: 78 | pw = self.passwords.get() 79 | pwlist.append(pw) 80 | pwcount += 1 81 | else: 82 | while not self.passwords.empty(): 83 | pw = self.passwords.get() 84 | pwlist.append(pw) 85 | pwcount += 1 86 | break 87 | 88 | """ 89 | Implement your algo here 90 | """ 91 | results = self.opencl_algo.cl_pbkdf2(self.ctx_pbkdf2, pwlist, self.salt, self.iterations, 32) 92 | digests = [] 93 | for result in results: 94 | digests.append(result) 95 | """ 96 | End of implementation 97 | """ 98 | 99 | if len(pwlist) > 0: 100 | elapsed_time = perf_counter() - start_time 101 | calcedpw = self.totalthreads / elapsed_time 102 | print(f"Current try : {pwlist[0].decode('utf-8')}, {calcedpw} PWs/s, " + 103 | f"{self.totalthreads} PWs/Thread, {pwcount} total PWs.") 104 | start_time = perf_counter() 105 | 106 | """ 107 | Implement your verification here 108 | """ 109 | for number, sha in enumerate(digests): 110 | if self.hash_val == sha: 111 | print(f'[+] found password: {pwlist[number]}') 112 | return pwlist[number] 113 | return None 114 | 115 | def init_gcpu(self,salt,hash_val,iterations): 116 | self.salt = salt 117 | self.hash_val = hash_val 118 | self.iterations = iterations 119 | # init opencl instance 120 | self.ctx_pbkdf2=self.opencl_algo.cl_pbkdf2_init("sha256",len(self.salt),32) 121 | 122 | if self.totalthreads is None: 123 | self.computeunits = self.opencl_algo.opencl_ctx.computeunits 124 | self.accel = max(self.computeunits // 4 * 4 // 4, 1) 125 | self.totalthreads = self.opencl_algo.opencl_ctx.workgroupsize * self.accel 126 | print(f"Using Thread size of {self.totalthreads}") 127 | 128 | def stopthread(self): 129 | self.stop = True 130 | 131 | def run(self): 132 | sys.stdin = sys.stdin.detach() 133 | self.threadLock = threading.Lock() 134 | thread1 = passwordutils(self.stopthread, self.threadLock, self.passwords, self.totalthreads, self.minlen, self.maxlen) 135 | #thread2 = passwordutils(self.passwords, self.totalthreads, self.minlen, 
self.maxlen) 136 | #thread3 = passwordutils(self.passwords, self.totalthreads, self.minlen, self.maxlen) 137 | #thread4 = passwordutils(,self.passwords, self.totalthreads, self.minlen, self.maxlen) 138 | thread1.start() 139 | #thread2.start() 140 | #thread3.start() 141 | #thread4.start() 142 | # We wait here for first passwords to arrive 143 | while self.passwords.empty(): 144 | pass 145 | start_time = perf_counter() 146 | res = self.verifypws() 147 | thread1.join() 148 | #thread2.join() 149 | #thread3.join() 150 | #thread4.join() 151 | elapsed_time = perf_counter() - start_time 152 | print(f"Total time : %f" % elapsed_time) 153 | 154 | if res == -1 or res is None: 155 | print("No password found") 156 | exit(0) 157 | 158 | 159 | if __name__ == '__main__': 160 | tb = brute() 161 | salt=b"\x12\x34\x56\x78" 162 | iterations=10000 163 | hash_val=hashlib.pbkdf2_hmac("SHA256",b"testtest",salt,iterations,32) 164 | tb.init_gcpu(salt,hash_val,iterations) 165 | tb.run() 166 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pybind11 3 | pycryptodome 4 | scrypt 5 | pyopencl -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # (c) B. Kerler 2018-2021 4 | # MIT License 5 | import sys 6 | import hashlib 7 | import hmac 8 | import scrypt 9 | import functools, operator 10 | from Library import opencl 11 | from Library.opencl_information import opencl_information 12 | from binascii import unhexlify, hexlify 13 | from collections import deque 14 | from hashlib import pbkdf2_hmac 15 | 16 | 17 | # ===================================== Test funcs ============================================= 18 | 19 | def test(hashClass, passwordlist, clresult): 20 | # Generate the correct results using hashlib 21 | correct_res = [] 22 | for pwd in passwordlist: 23 | h = hashClass() 24 | h.update(pwd) 25 | correct_res.append(h.digest()) 26 | 27 | # Determine success and print 28 | correct = [r == c for r, c in zip(clresult, correct_res)] 29 | succ = (len(passwordlist) == len(clresult)) and functools.reduce(operator.and_, correct, True) 30 | if succ: 31 | print("Ok m8!") 32 | else: 33 | print("Failed !") 34 | print(clresult[0]) 35 | print(correct_res[0]) 36 | 37 | 38 | def sha256_test(opencl_algo, passwordlist): 39 | print("Testing sha256 ..") 40 | ctx = opencl_algo.cl_sha256_init() 41 | clresult = opencl_algo.cl_sha256(ctx, passwordlist) 42 | test(hashlib.sha256, passwordlist, clresult) 43 | 44 | 45 | def sha512_test(opencl_algo, passwordlist): 46 | print("Testing sha512 ..") 47 | ctx = opencl_algo.cl_sha512_init() 48 | clresult = opencl_algo.cl_sha512(ctx, passwordlist) 49 | test(hashlib.sha512, passwordlist, clresult) 50 | 51 | 52 | def md5_test(opencl_algo, passwordlist): 53 | print("Testing md5 ..") 54 | ctx = opencl_algo.cl_md5_init() 55 | clresult = opencl_algo.cl_md5(ctx, passwordlist) 56 | test(hashlib.md5, passwordlist, clresult) 57 | 58 | 59 | def sha1_test(opencl_algo, passwordlist): 60 | print("Testing sha1 ..") 61 | ctx = opencl_algo.cl_sha1_init() 62 | clresult = opencl_algo.cl_sha1(ctx, passwordlist) 63 | test(hashlib.sha1, passwordlist, clresult) 64 | 65 | 66 | def hmac_test(passwordlist, salt, hashClass, clResult): 67 | correct_res = [] 68 | for pwd in passwordlist: 69 | 
correct_res.append(hmac.new(pwd, salt, hashClass).digest()) 70 | 71 | # Determine success and print 72 | correct = [r == c for r, c in zip(clResult, correct_res)] 73 | succ = (len(passwordlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 74 | if succ: 75 | print("Ok m9!") 76 | else: 77 | print("Failed !") 78 | print(clResult[0]) 79 | print(correct_res[0]) 80 | 81 | 82 | def md5_hmac_test(opencl_algo, passwordlist, salt): 83 | print("Testing hmac using md5.cl") 84 | ctx = opencl_algo.cl_md5_init("pbkdf2.cl") 85 | clResult = opencl_algo.cl_md5_hmac(ctx, passwordlist, salt) 86 | hmac_test(passwordlist, salt, hashlib.md5, clResult) 87 | 88 | 89 | def sha256_hmac_test(opencl_algo, passwordlist, salt): 90 | print("Testing hmac using sha256.cl") 91 | ctx = opencl_algo.cl_sha256_init("pbkdf2.cl") 92 | clResult = opencl_algo.cl_sha256_hmac(ctx, passwordlist, salt) 93 | hmac_test(passwordlist, salt, hashlib.sha256, clResult) 94 | 95 | 96 | def sha512_hmac_test(opencl_algo, passwordlist, salt): 97 | print("Testing hmac using sha512.cl") 98 | ctx = opencl_algo.cl_sha512_init("pbkdf2.cl") 99 | clResult = opencl_algo.cl_sha512_hmac(ctx, passwordlist, salt) 100 | hmac_test(passwordlist, salt, hashlib.sha512, clResult) 101 | 102 | 103 | def sha1_hmac_test(opencl_algo, passwordlist, salt): 104 | print("Testing hmac using sha1.cl") 105 | ctx = opencl_algo.cl_sha1_init("pbkdf2.cl") 106 | clResult = opencl_algo.cl_sha1_hmac(ctx, passwordlist, salt) 107 | hmac_test(passwordlist, salt, hashlib.sha1, clResult) 108 | 109 | 110 | def pbkdf2_test(passwordlist, salt, hashName, iters, dklen, clResult): 111 | correct_res = [] 112 | for pwd in passwordlist: 113 | correct_res.append(hashlib.pbkdf2_hmac(hashName, pwd, salt, iters, dklen)) 114 | 115 | # Determine success and print 116 | correct = [r == c for r, c in zip(clResult, correct_res)] 117 | succ = (len(passwordlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 118 | if succ: 119 | print("Ok m10!") 120 | else: 121 | print("Failed !") 122 | for i in range(len(passwordlist)): 123 | if clResult[i] == correct_res[i]: 124 | print("#{} succeeded".format(i)) 125 | else: 126 | print(i) 127 | print(clResult[i]) 128 | print(correct_res[i]) 129 | 130 | def pbkdf2_saltlist_test(password, saltlist, hashName, iters, dklen, clResult): 131 | correct_res = [] 132 | for salt in saltlist: 133 | correct_res.append(hashlib.pbkdf2_hmac(hashName, password, salt, iters, dklen)) 134 | 135 | # Determine success and print 136 | correct = [r == c for r, c in zip(clResult, correct_res)] 137 | succ = (len(saltlist) == len(clResult)) and functools.reduce(operator.and_, correct, True) 138 | if succ: 139 | print("Ok m10!") 140 | else: 141 | print("Failed !") 142 | for i in range(len(saltlist)): 143 | if clResult[i] == correct_res[i]: 144 | print("#{} Succeeded".format(i)) 145 | else: 146 | print("#{} Failed".format(i)) 147 | print("clResult: ", clResult[i]) 148 | print("Hashlib: ", correct_res[i]) 149 | 150 | def pbkdf2_hmac_md5_test(opencl_algo, passwordlist, salt, iters, dklen): 151 | print("Testing pbkdf2-hmac using md5.cl") 152 | ctx = opencl_algo.cl_pbkdf2_init("md5", len(salt), dklen) 153 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 154 | pbkdf2_test(passwordlist, salt, "md5", iters, dklen, clResult) 155 | 156 | 157 | def pbkdf2_hmac_sha1_test(opencl_algo, passwordlist, salt, iters, dklen): 158 | print("Testing pbkdf2-hmac using sha1.cl") 159 | ctx = opencl_algo.cl_pbkdf2_init("sha1", len(salt), dklen) 160 | 
clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 161 | pbkdf2_test(passwordlist, salt, "sha1", iters, dklen, clResult) 162 | 163 | 164 | def pbkdf2_hmac_sha256_test(opencl_algo, passwordlist, salt, iters, dklen): 165 | print("Testing pbkdf2-hmac using sha256.cl") 166 | ctx = opencl_algo.cl_pbkdf2_init("sha256", len(salt), dklen) 167 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 168 | pbkdf2_test(passwordlist, salt, "sha256", iters, dklen, clResult) 169 | 170 | def pbkdf2_hmac_sha256_speedtest(opencl_algo, passwordlist, salt, iters, dklen): 171 | print("Testing pbkdf2-hmac using sha256.cl") 172 | ctx = opencl_algo.cl_pbkdf2_init("sha256", len(salt), dklen) 173 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 174 | 175 | 176 | def pbkdf2_hmac_sha512_test(opencl_algo, passwordlist, salt, iters, dklen): 177 | print("Testing pbkdf2-hmac using sha512.cl") 178 | ctx = opencl_algo.cl_pbkdf2_init("sha512", len(salt), dklen) 179 | clResult = opencl_algo.cl_pbkdf2(ctx, passwordlist, salt, iters, dklen) 180 | pbkdf2_test(passwordlist, salt, "sha512", iters, dklen, clResult) 181 | 182 | def pbkdf2_hmac_saltlist_md5_test(opencl_algo, password, saltlist, iters, dklen): 183 | print("Testing pbkdf2-hmac using md5.cl") 184 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("md5",len(password),dklen) 185 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 186 | pbkdf2_saltlist_test(password, saltlist, "md5", iters, dklen, clResult) 187 | 188 | def pbkdf2_hmac_saltlist_sha1_test(opencl_algo, password, saltlist, iters, dklen): 189 | print("Testing pbkdf2-hmac using sha1.cl") 190 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha1", len(password), dklen) 191 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 192 | pbkdf2_saltlist_test(password, saltlist, "sha1", iters, dklen, clResult) 193 | 194 | def pbkdf2_hmac_saltlist_sha256_test(opencl_algo, password, saltlist, iters, dklen): 195 | print("Testing pbkdf2-hmac using sha256.cl") 196 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha256", len(password), dklen) 197 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 198 | pbkdf2_saltlist_test(password, saltlist, "sha256", iters, dklen, clResult) 199 | 200 | def pbkdf2_hmac_saltlist_sha512_test(opencl_algo, password, saltlist, iters, dklen): 201 | print("Testing pbkdf2-hmac using sha512.cl") 202 | ctx=opencl_algo.cl_pbkdf2_saltlist_init("sha512", len(password), dklen) 203 | clResult = opencl_algo.cl_pbkdf2_saltlist(ctx,password, saltlist, iters, dklen) 204 | pbkdf2_saltlist_test(password, saltlist, "sha512", iters, dklen, clResult) 205 | 206 | 207 | def scrypt_test(scrypt_opencl_algos, passwords, N_value=15, r_value=3, p_value=1, desired_key_length=32, 208 | hex_salt=unhexlify("DEADBEEFDEADBEEFDEADBEEFDEADBEEF")): 209 | print("Testing scrypt") 210 | correct_res = [] 211 | for pwd in passwords: 212 | v = scrypt.hash(pwd, hex_salt, 1 << N_value, 1 << r_value, 1 << p_value, desired_key_length) 213 | correct_res.append(v) 214 | ctx = scrypt_opencl_algos.cl_scrypt_init(N_value) 215 | clResult = scrypt_opencl_algos.cl_scrypt(ctx, passwords, N_value, r_value, p_value, desired_key_length, hex_salt) 216 | 217 | # Determine success and print 218 | correct = [r == c for r, c in zip(clResult, correct_res)] 219 | succ = (len(passwords) == len(clResult)) and functools.reduce(operator.and_, correct, True) 220 | if succ: 221 | print("Ok m11!") 222 | else: 223 | print("Failed !") 
224 | for i in range(len(passwords)): 225 | if clResult[i] == correct_res[i]: 226 | print("#{} succeeded".format(i)) 227 | else: 228 | print(i) 229 | print(clResult[i]) 230 | print(correct_res[i]) 231 | 232 | 233 | def test_iterations(passwordlist, hashClass, iters, clResult): 234 | hashlib_passwords = [] 235 | for password in passwordlist: 236 | for i in range(iters): 237 | password = hashClass(password).digest() 238 | hashlib_passwords.append(password) 239 | 240 | if clResult == hashlib_passwords: 241 | print("Ok m12!") 242 | else: 243 | print("Failed !") 244 | for i in range(len(passwordlist)): 245 | if clResult[i] == hashlib_passwords[i]: 246 | print("#{} succeeded".format(i)) 247 | else: 248 | print(i) 249 | print(clResult[i]) 250 | print(hashlib_passwords[i]) 251 | 252 | 253 | def hash_iterations_md5_test(opencl_algo, passwordlist, iters): 254 | print("Testing md5 " + str(iters) + " rounds") 255 | ctx = opencl_algo.cl_hash_iterations_init("md5") 256 | 257 | for i in range(len(passwordlist)): 258 | passwordlist[i] = hashlib.md5(passwordlist[i]).digest() 259 | 260 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 4) 261 | 262 | test_iterations(passwordlist, hashlib.md5, iters, clresult) 263 | 264 | 265 | def hash_iterations_sha1_test(opencl_algo, passwordlist, iters): 266 | print("Testing sha1 " + str(iters) + " rounds") 267 | ctx = opencl_algo.cl_hash_iterations_init("sha1") 268 | 269 | for i in range(len(passwordlist)): 270 | passwordlist[i] = hashlib.sha1(passwordlist[i]).digest() 271 | 272 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 273 | 274 | test_iterations(passwordlist, hashlib.sha1, iters, clresult) 275 | 276 | 277 | def hash_iterations_sha256_test(opencl_algo, passwordlist, iters): 278 | print("Testing sha256 " + str(iters) + " rounds") 279 | ctx = opencl_algo.cl_hash_iterations_init("sha256") 280 | 281 | for i in range(len(passwordlist)): 282 | passwordlist[i] = hashlib.sha256(passwordlist[i]).digest() 283 | 284 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 285 | 286 | test_iterations(passwordlist, hashlib.sha256, iters, clresult) 287 | 288 | 289 | def hash_iterations_sha512_test(opencl_algo, passwordlist, iters): 290 | print("Testing sha512 " + str(iters) + " rounds") 291 | ctx = opencl_algo.cl_hash_iterations_init("sha512") 292 | 293 | for i in range(len(passwordlist)): 294 | passwordlist[i] = hashlib.sha512(passwordlist[i]).digest() 295 | 296 | clresult = opencl_algo.cl_hash_iterations(ctx, passwordlist, iters, 8) 297 | 298 | test_iterations(passwordlist, hashlib.sha512, iters, clresult) 299 | 300 | 301 | # =========================================================================================== 302 | 303 | def main(argv): 304 | if len(argv) < 2: 305 | print("Implementation tests") 306 | print("-----------------------------------------------------------------") 307 | info = opencl_information() 308 | info.printplatforms() 309 | print("\nPlease run as: python test.py [platform number]") 310 | return 311 | 312 | # Input values to be hashed 313 | passwordlist = [b'password', b'hmm', b'trolololl', b'madness'] 314 | salts = [b"salty123", b"salty12",b"\xd1\x0c\x00\xd2\xfe\x64\x02\x98",b"\x12\x34\x56\x78"] 315 | 316 | platform = int(argv[1]) 317 | debug = 0 318 | write_combined_file = False 319 | opencl_algos = opencl.opencl_algos(platform, debug, write_combined_file, inv_memory_density=1) 320 | # Call the tests 321 | 322 | for salt in salts: 323 | print("Using salt: %s" % salt) 324 | 
md5_test(opencl_algos, passwordlist) 325 | md5_hmac_test(opencl_algos, passwordlist, salt) 326 | pbkdf2_hmac_md5_test(opencl_algos, passwordlist, salt, 1000, 32) 327 | pbkdf2_hmac_md5_test(opencl_algos, passwordlist, salt, 1000, 50) 328 | hash_iterations_md5_test(opencl_algos, passwordlist, 10000) 329 | 330 | sha1_test(opencl_algos, passwordlist) 331 | sha1_hmac_test(opencl_algos, passwordlist, salt) 332 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, 16*b"\x00", 1000, 32) 333 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, salt, 1000, 32) 334 | pbkdf2_hmac_sha1_test(opencl_algos, passwordlist, salt, 1000, 64) 335 | hash_iterations_sha1_test(opencl_algos, passwordlist, 10000) 336 | 337 | sha256_test(opencl_algos, passwordlist) 338 | sha256_hmac_test(opencl_algos, passwordlist, salt) 339 | pbkdf2_hmac_sha256_test(opencl_algos, passwordlist, salt, 10000, 32) 340 | pbkdf2_hmac_sha256_test(opencl_algos, passwordlist, salt, 10000, 50) 341 | hash_iterations_sha256_test(opencl_algos, passwordlist, 10000) 342 | 343 | sha512_test(opencl_algos, passwordlist) 344 | sha512_hmac_test(opencl_algos, passwordlist, salt) 345 | pbkdf2_hmac_sha512_test(opencl_algos, passwordlist, salt, 1000, 32) 346 | pbkdf2_hmac_sha512_test(opencl_algos, passwordlist, salt, 1000, 50) 347 | hash_iterations_sha512_test(opencl_algos, passwordlist, 10000) 348 | 349 | scrypt_test(opencl_algos, passwordlist, 15, 3, 1, 0x20, salt) 350 | 351 | print("Testing PBKDF2 with SaltList") 352 | pbkdf2_hmac_saltlist_md5_test(opencl_algos, passwordlist[0], salts, 1000, 50) 353 | pbkdf2_hmac_saltlist_sha1_test(opencl_algos, passwordlist[0], salts, 1000, 50) 354 | pbkdf2_hmac_saltlist_sha256_test(opencl_algos, passwordlist[0], salts, 1 << 16, 32) 355 | pbkdf2_hmac_saltlist_sha512_test(opencl_algos, passwordlist[0], salts, 1000, 50) 356 | 357 | """ 358 | from time import perf_counter 359 | start=perf_counter() 360 | for i in range(200000): 361 | passwordlist.append(b"test%04d" % i) 362 | pbkdf2_hmac_sha256_speedtest(opencl_algos,passwordlist,salts[0],1000,50) 363 | end=perf_counter() 364 | print("Time: %f" % (end-start)) 365 | """ 366 | print("Tests have finished.") 367 | 368 | 369 | if __name__ == '__main__': 370 | main(sys.argv) 371 | --------------------------------------------------------------------------------