├── LICENSE ├── README.md ├── analysis ├── README.md ├── aesmilp.py ├── examples │ ├── aeslike.yaml │ └── haraka.yaml └── models │ ├── __init__.py │ ├── aeslike.py │ ├── haraka.py │ └── milpconstraints.py ├── code ├── c │ ├── aesni_optimized │ │ ├── Makefile │ │ ├── haraka.c │ │ ├── haraka.h │ │ ├── main.c │ │ └── timing.h │ ├── aesni_ref │ │ ├── Makefile │ │ ├── haraka.c │ │ ├── helpers.c │ │ └── helpers.h │ └── neon │ │ ├── haraka.c │ │ └── haraka.h └── python │ └── ref.py └── supercop └── crypto_sign ├── measure.c ├── measure.c~ ├── sphincs256haraka └── aesni │ ├── api.h │ ├── consts.c │ ├── haraka.c │ ├── haraka.h │ ├── hash.c │ ├── hash.h │ ├── horst.c │ ├── horst.h │ ├── horst.log │ ├── implementors │ ├── params.h │ ├── permute.c │ ├── permute.h │ ├── prg.c │ ├── prg.h │ ├── settings.h │ ├── sign.c │ ├── wots.c │ ├── wots.h │ ├── zerobytes.c │ └── zerobytes.h └── try.c /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 kste 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Haraka v2 2 | 3 | Haraka v2 is a secure and efficient short-input (256 or 512 bits) hash function, designed 4 | to be very fast on modern platforms which support AES-NI. One of the main applications 5 | for such a design is the use in hash-based signature schemes like XMSS and SPHINCS. 6 | For more information see our [paper](https://eprint.iacr.org/2016/098). 7 | 8 | This repository provides various implementations in [code/](https://github.com/kste/haraka/tree/master/code). 9 | In [code/c/aesni_optimized](https://github.com/kste/haraka/tree/master/code/c/aesni_optimized), one can find 10 | an implementation processing 4 or 8 blocks in parallel. 11 | 12 | 13 | ## Performance 14 | 15 | The performance is measured in cycles per byte (cpb) processed. The following numbers 16 | correspond to Intel Skylake using the [optimized implementation](https://github.com/kste/haraka/tree/master/code/c/aesni_optimized). 17 | 18 | Variant | 1x | 4x | 8x 19 | ------- | ------- | ------- | ------- 20 | Haraka256 | 0.72 cpb | 0.63 cpb | 0.63 cpb 21 | Haraka512 | 1.02 cpb | 0.72 cpb | 0.72 cpb 22 | 23 | ## SPHINCS 24 | [SPHINCS](https://sphincs.cr.yp.to/) is a post-quantum secure hash-based digital signature scheme. The performance 25 | of SPHINCS strongly correlates with the performance of the underlying hash function and can be significantly 26 | improved by using an optimized construction. 
27 | 28 | A SPHINCS implementation instantiated with Haraka can be found in [supercop/crypto_sign/](https://github.com/kste/haraka/tree/master/supercop/crypto_sign/sphincs256haraka/aesni), which can also be used for benchmarks with 29 | [Supercop](https://bench.cr.yp.to/supercop.html). 30 | 31 | This optimized implementation has the following performance figures on Intel Skylake: 32 | 33 | Operation | Cycles 34 | ------------ | ------------- 35 | KeyGeneration | 1.340.338 36 | Signing | 20.782.894 37 | Verify | 415.586 38 | 39 | 40 | ## Reference 41 | 42 | Haraka v2 - Efficient Short-Input Hashing for Post-Quantum Applications 43 | 44 | Stefan Kölbl and Martin M. Lauridsen and Florian Mendel and Christian Rechberger 45 | https://eprint.iacr.org/2016/098 46 | -------------------------------------------------------------------------------- /analysis/README.md: -------------------------------------------------------------------------------- 1 | # Haraka - Analysis 2 | 3 | This folder contains python scripts to construct the mixed integer linear 4 | programming (MILP) model used in the security analysis of Haraka. 5 | 6 | ## Examples 7 | Count the number of active S-boxes for AES-like designs. Parameters like the number of rounds or state dimensions 8 | can be specified in the *.yaml file. 9 | ``` 10 | python3 aesmilp.py --sbox --config examples/aeslike.yaml 11 | ``` 12 | 13 | Finding the optimal truncated differential attack for Haraka. 14 | ``` 15 | python3 aesmilp.py --truncated --config examples/haraka.yaml 16 | ``` 17 | 18 | For more details on this see our paper. 19 | -------------------------------------------------------------------------------- /analysis/aesmilp.py: -------------------------------------------------------------------------------- 1 | """ 2 | A tool to find the minimum number of active S-boxes for AES-like ciphers and 3 | Haraka-like designs. It can also be used to find the optimal truncated 4 | differential attack for Haraka.
5 | 6 | The gurobi python interface is required to run this code http://www.gurobi.com/ 7 | """ 8 | 9 | from argparse import ArgumentParser, RawTextHelpFormatter 10 | from models import aeslike, haraka 11 | from gurobipy import * 12 | 13 | import yaml 14 | 15 | # Console logging for gurobi (1 = on, 0 = off), can be overridden with --verb 16 | setParam("LogToConsole", 1) 17 | 18 | def activesboxharaka(): 19 | config = {"rounds": 1, 20 | "wordsize": 8, 21 | "branchnumber": 5, 22 | "statedimension": 4, 23 | "aesstates": 4, 24 | "aesrounds": 2, 25 | "collisiononly": False, 26 | "mixlayer": "mix", 27 | "securitymodel": "sbox"} 28 | 29 | print("Rounds", "S-boxes", sep="\t") 30 | for num_rounds in range(1, 8): 31 | print(num_rounds, end='') 32 | for aes_rounds in range(1, 6): 33 | config["rounds"] = num_rounds 34 | config["aesrounds"] = aes_rounds 35 | solved_model = solvemodel(haraka.buildmodel(config)) 36 | print(" ", round(solved_model.ObjVal), end="") 37 | print("") 38 | 39 | def findminactiveincreasing(): 40 | """ 41 | Example for finding minimum active S-box for increasing number of 42 | rounds. 43 | """ 44 | config = {"rounds": 1, 45 | "wordsize": 8, 46 | "branchnumber": 5, 47 | "statedimension": 4} 48 | 49 | print("Rounds", "S-boxes", sep="\t") 50 | for num_rounds in range(1, 11): 51 | config["rounds"] = num_rounds 52 | solved_model = solvemodel(aeslike.buildmodel(config)) 53 | print(num_rounds, round(solved_model.ObjVal), sep="\t") 54 | return 55 | 56 | def findminactivesbox(config): 57 | """ 58 | Example which finds the minimum number of active S-boxes for AES like 59 | ciphers, with the parameters given in the config file.
60 | """ 61 | if config["name"] == "aeslike": 62 | model = aeslike.buildmodel(config) 63 | solved_model = solvemodel(model) 64 | aeslike.printmodel(solved_model, config) 65 | elif config["name"] == "haraka": 66 | model = haraka.buildmodel(config) 67 | solved_model = solvemodel(model) 68 | haraka.printmodel(solved_model, config) 69 | return 70 | 71 | def harakatruncated(config): 72 | """ 73 | Find best attack in our truncated model for Haraka. 74 | """ 75 | num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1 76 | 77 | # Iterate over all possible starting states for the attack 78 | best_attack = 999999 79 | best_round = -1 80 | best_model = 0 81 | 82 | for rnd in range(num_states - 1): 83 | if haraka.isAESround(rnd, config["aesrounds"]): 84 | config["attackerstart"] = rnd 85 | model = haraka.buildmodel(config) 86 | attack_costs = round(solvemodel(model).objVal) 87 | print("Subround {} - Best Attack: {}".format(rnd, attack_costs)) 88 | if attack_costs < best_attack: 89 | best_attack = attack_costs 90 | best_round = rnd 91 | best_model = model 92 | 93 | print("Found best attack in round {} with costs {}".format(best_round, 94 | best_attack)) 95 | haraka.printmodel(best_model, config) 96 | return 97 | 98 | 99 | def solvemodel(gurobi_model): 100 | """ 101 | Solve model and return. 
102 | """ 103 | try: 104 | gurobi_model.update() 105 | gurobi_model.write('haraka.lp') 106 | gurobi_model.optimize() 107 | except GurobiError: 108 | print("Error when solving!") 109 | print(GurobiError) 110 | return gurobi_model 111 | 112 | def main(): 113 | """ 114 | Load a config file and parse it 115 | """ 116 | parser = ArgumentParser(description="todo", 117 | formatter_class=RawTextHelpFormatter) 118 | parser.add_argument('--config', nargs=1, help="Use a yaml input file to" 119 | "read the parameters") 120 | parser.add_argument('--sbox', action="store_true", 121 | help="Count the number of active S-boxes.") 122 | parser.add_argument('--truncated', action="store_true", 123 | help="Use the truncated model for security analysis.") 124 | parser.add_argument('--verb', nargs=1, 125 | help="Set verbosity of the Gurobi solver.") 126 | args = parser.parse_args() 127 | 128 | params = {} 129 | 130 | #activesboxharaka() 131 | 132 | if args.verb: 133 | setParam("LogToConsole", int(args.verb[0])) 134 | 135 | if args.config: 136 | with open(args.config[0], 'r') as config: 137 | params = yaml.load(config) 138 | 139 | if args.sbox: 140 | findminactivesbox(params) 141 | 142 | if args.truncated: 143 | harakatruncated(params) 144 | 145 | 146 | if __name__ == '__main__': 147 | main() 148 | -------------------------------------------------------------------------------- /analysis/examples/aeslike.yaml: -------------------------------------------------------------------------------- 1 | # Config file for AES like primitive 2 | --- 3 | name: aeslike 4 | rounds: 4 5 | wordsize: 8 6 | branchnumber: 9 7 | statedimension: 8 8 | ... 
-------------------------------------------------------------------------------- /analysis/examples/haraka.yaml: -------------------------------------------------------------------------------- 1 | # Config file for Haraka-512/256 2 | --- 3 | name: haraka 4 | rounds: 5 5 | statedimension: 4 6 | branchnumber: 5 7 | wordsize: 8 8 | aesstates: 4 9 | aesrounds: 2 10 | collisiononly: Yes 11 | securitymodel: "sbox" # Count active S-boxes "sbox", 12 | # Truncated model "truncated" 13 | attackerpower: 2 # Rounds the attacker can control in each direction 14 | mixlayer: "mix" # Blend "blend" 15 | # Mix "mix" 16 | ... -------------------------------------------------------------------------------- /analysis/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kste/haraka/74d7f4e0a2c74f844939e1654b2f6741a437c507/analysis/models/__init__.py -------------------------------------------------------------------------------- /analysis/models/aeslike.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script constructs a MILP model for AES-like primitives, which can aid in 3 | finding optimal parameter sets against differential attacks by counting the 4 | minimum number of active S-boxes in a differential trail. 5 | 6 | It uses the Gurobi Solver to solve the MILP instance, hence you need 7 | a Gurobi license (free for Academic use). 
8 | """ 9 | 10 | from gurobipy import * 11 | from models.milpconstraints import addAESrndconstraints 12 | 13 | 14 | def buildmodel(config): 15 | """ 16 | Constructs the model for the Gurobi Solver 17 | """ 18 | 19 | model = Model("aeslike") 20 | 21 | # Parameters 22 | state_dim = config["statedimension"] 23 | num_rounds = config["rounds"] 24 | branch_number = config["branchnumber"] 25 | 26 | # Initialize all variables 27 | var_x = [] # state 28 | var_d = [] # dummy variable for MixColumns 29 | 30 | state_words = state_dim * state_dim 31 | 32 | for byte in range((num_rounds + 1) * state_words): 33 | var_x.append(model.addVar(vtype=GRB.BINARY, name="x[{}]".format(byte))) 34 | for col in range(num_rounds * state_dim): 35 | var_d.append(model.addVar(name="dummy[{}]".format(col))) 36 | 37 | activesboxes = model.addVar(name="Active S-boxes") 38 | 39 | model.update() 40 | 41 | # Constraints 42 | 43 | # Optimize number of active S-boxes 44 | model.setObjective(activesboxes, GRB.MINIMIZE) 45 | 46 | # Count Active S-boxes 47 | model.addConstr(quicksum(var_x[i] for i in range(num_rounds * state_words)) 48 | - activesboxes == 0, "Count Active S-boxes") 49 | 50 | # Add constraints from AES round function 51 | model = addAESrndconstraints(model, state_dim, var_x, var_d, 52 | branch_number, num_rounds) 53 | 54 | # No Zero Characteristic 55 | model.addConstr(quicksum(var_x[i] for i in range((num_rounds + 1) * 56 | state_words)) >= 1, "Avoid trivial solutions") 57 | 58 | return model 59 | 60 | def printmodel(model, config): 61 | """ 62 | Print the solution and the corresponding differential trail. 
63 | """ 64 | state_dim = config["statedimension"] 65 | num_rounds = config["rounds"] 66 | 67 | print("Rounds:", num_rounds) 68 | print("State dimension:", state_dim) 69 | print("Branch number:", config["branchnumber"]) 70 | print("Minimum number of active S-boxes: {}".format(model.objVal)) 71 | 72 | print("Best differential trail:") 73 | 74 | # Print differential trail 75 | # Print Header 76 | header = "" 77 | for rnd in range(num_rounds + 1): 78 | header += str(rnd) + " " * (2 * state_dim + 1 - len(str(rnd))) 79 | 80 | print(header) 81 | 82 | # Print State 83 | for row in range(state_dim): 84 | for rnd in range(num_rounds + 1): 85 | for col in range(state_dim): 86 | cur_index = row + col * state_dim + rnd * state_dim * state_dim 87 | if model.getVarByName("x[{}]".format(cur_index)).x > 0.0: 88 | print("\033[91mx\033[0m", end=" ") 89 | else: 90 | print(".", end=" ") 91 | print(" ", end="") 92 | print("") 93 | return model.objVal 94 | -------------------------------------------------------------------------------- /analysis/models/haraka.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script constructs a MILP model for Haraka-like designs to 3 | count the number of active S-boxes and determine the security 4 | level against truncated differential attacks. 5 | 6 | It uses the Gurobi Solver to solve the MILP instance, hence you need 7 | a Gurobi license (free for Academic use). 
8 | """ 9 | 10 | from gurobipy import * 11 | from models.milpconstraints import addAESrndconstraints 12 | 13 | 14 | def buildmodel(config): 15 | """ 16 | Constructs the model for the Gurobi Solver 17 | """ 18 | 19 | model = Model("haraka") 20 | 21 | # Parameters 22 | rounds = config["rounds"] 23 | state_dim = config["statedimension"] 24 | branch_number = config["branchnumber"] 25 | aes_rounds = config["aesrounds"] 26 | aes_states = config["aesstates"] 27 | 28 | num_states = ((aes_rounds + 1) * rounds) + 1 29 | words_state = state_dim * state_dim 30 | 31 | # Initialize all variables 32 | var_x = [[] for _ in range(aes_states)] 33 | var_d = [[] for _ in range(aes_states)] 34 | var_mccosts = [[] for _ in range(aes_states)] 35 | var_mcactive = [[] for _ in range(aes_states)] 36 | 37 | for aes_state in range(aes_states): 38 | for word in range(num_states * words_state): 39 | var_x[aes_state].append( 40 | model.addVar(vtype=GRB.BINARY, 41 | name="x[{}][{}]".format(aes_state, word)) 42 | ) 43 | for col in range(num_states * state_dim): 44 | var_d[aes_state].append( 45 | model.addVar(name="dummy[{}][{}]".format(aes_state, col)) 46 | ) 47 | var_mccosts[aes_state].append( 48 | model.addVar(name="MCCosts[{}][{}]".format(aes_state, col)) 49 | ) 50 | var_mcactive[aes_state].append( 51 | model.addVar(vtype=GRB.BINARY, 52 | name="MCActive[{}][{}]".format(aes_state, col)) 53 | ) 54 | 55 | activesboxes = model.addVar(name="Active S-boxes") 56 | costs = model.addVar() 57 | 58 | 59 | model.update() 60 | 61 | # Objective to minimize attack costs 62 | model.setObjective(costs, GRB.MINIMIZE) 63 | 64 | if config["securitymodel"] == "sbox": 65 | # print("Finding minimum number of active S-boxes...") 66 | # Count number of active S-boxes 67 | model = addactivesboxconstraints(model, config, var_x, activesboxes) 68 | model.setObjective(activesboxes, GRB.MINIMIZE) 69 | elif config["securitymodel"] == "truncated": 70 | model = addtruncatedconstraints(model, config, var_x, var_mccosts, 71 | 
var_mcactive, costs) 72 | 73 | 74 | if config["collisiononly"]: 75 | if aes_states == 4: 76 | # If we have 4 states truncated to 256-bit 77 | model = addcolltruncoutput512(model, config, var_x) 78 | else: 79 | model = addcollisionconstraints(model, config, var_x) 80 | 81 | 82 | for rnd in range(rounds): 83 | # Add AES round constraints 84 | for aes_state in range(aes_states): 85 | model = addAESrndconstraints(model, state_dim, 86 | var_x[aes_state][words_state * (aes_rounds + 1) * rnd:], 87 | var_d[aes_state][state_dim * (aes_rounds + 1) * rnd:], 88 | branch_number, aes_rounds) 89 | 90 | # Add MIX round constraints 91 | if config["mixlayer"] == "mix" and aes_states == 4: 92 | model = addmixconstraints512(model, config, var_x, rnd) 93 | elif config["mixlayer"] == "mix" and aes_states == 2: 94 | model = addmixconstraints256(model, config, var_x, rnd) 95 | 96 | 97 | # No all Zero 98 | model.addConstr(quicksum(var_x[aes_state][i] 99 | for aes_state in range(aes_states) 100 | for i in range((aes_rounds * rounds + 1) * 101 | state_dim * state_dim)) >= 1, 102 | "notrivialsolution") 103 | 104 | return model 105 | 106 | 107 | def filterAESround(rounds, aes_rounds): 108 | """ 109 | Filters the list for AES rounds. 110 | """ 111 | return filter(lambda x: isAESround(x, config["aesrounds"]), rounds) 112 | 113 | def isAESround(rnd, aes_rounds): 114 | """ 115 | Return True if rnd is an AES round. 116 | """ 117 | return rnd == 0 or (((rnd + 1) % (aes_rounds + 1)) != 0) 118 | 119 | def printmodel(model, config): 120 | """ 121 | Print the solution and the corresponding differential trail. 
122 | """ 123 | if config["securitymodel"] == "truncated": 124 | print("MixColumns Costs: {}".format(round(model.getVarByName("MixColumnsCosts").x))) 125 | print("MixColumns Costs (no dof): {}".format(round(model.getVarByName("MixColumnsCostsNoDof").x))) 126 | print("Collision Costs: {}".format(round(model.getVarByName("CollisionCosts").x))) 127 | print("Reducable Costs: {}".format(round(model.getVarByName("ReducableCosts").x))) 128 | print("Degrees of Freedom: {}".format(round(model.getVarByName("DegreesOfFreedom").x))) 129 | 130 | print("Obj: {}".format(round(model.objVal))) 131 | 132 | print("Best differential trail:") 133 | 134 | state_dim = config["statedimension"] 135 | num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1 136 | 137 | for rnd in range(num_states): 138 | for row in range(state_dim): 139 | for aes_state in range(config["aesstates"]): 140 | for col in range(state_dim): 141 | cur_index = row + col * state_dim + rnd * state_dim * state_dim 142 | if model.getVarByName("x[{}][{}]".format( 143 | aes_state, cur_index)).x > 0.5: 144 | print("\033[91mx\033[0m", end = " ") 145 | else: 146 | print(".", end = " ") 147 | print(" ", end = "") 148 | print("") 149 | if rnd != num_states - 1: 150 | if isAESround(rnd, config["aesrounds"]): 151 | print("AES") 152 | else: 153 | print("MIX") 154 | return 155 | 156 | def addtruncatedconstraints(model, config, var_x, var_mccosts, var_mcactive, 157 | costs): 158 | """ 159 | Adds constraints for the truncated security model. 
160 | """ 161 | 162 | costs_mc = model.addVar(name="MixColumnsCosts") 163 | costs_mc_nodof = model.addVar(name="MixColumnsCostsNoDof") 164 | costs_collision = model.addVar(name="CollisionCosts") 165 | costs_reducable = model.addVar(name="ReducableCosts") 166 | degoffree = model.addVar(name="DegreesOfFreedom") 167 | 168 | model.update() 169 | 170 | num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1 171 | state_dim = config["statedimension"] 172 | 173 | # Define costs 174 | 175 | # Attacker can control input == output difference with d.o.f. 176 | if config["attackerstart"] - config["attackerpower"] <= 0 and \ 177 | config["attackerstart"] + config["attackerpower"] >= (num_states - 2): 178 | model.addConstr(costs_reducable >= costs_mc + costs_collision - 179 | degoffree, "Attack costs after reducing d.o.f.") 180 | model.addConstr(costs >= costs_reducable + costs_mc_nodof, 181 | "Total attack costs") 182 | else: 183 | model.addConstr(costs_reducable >= costs_mc - degoffree, 184 | "Attack costs after reducing d.o.f.") 185 | model.addConstr(costs >= costs_reducable + costs_mc_nodof + 186 | costs_collision, "Total attack costs") 187 | 188 | # Count number of d.o.f. 
189 | # Collision resistance 190 | # model.addConstr(degoffree <= state_dim * state_dim * config["wordsize"] * 191 | # config["aesstates"]) 192 | 193 | # Second-preimage reistance 194 | # Allow only to choose differences in this setting 195 | start_indices = [config["attackerstart"] * state_dim * state_dim + 196 | x for x in range(state_dim*state_dim)] 197 | model.addConstr(degoffree <= quicksum(var_x[aes_state][i] 198 | for aes_state in range(config["aesstates"]) 199 | for i in start_indices) * config["wordsize"]) 200 | 201 | # Find rounds which are non-linear 202 | non_linear_rounds = [x for x in range(num_states - 1) if isAESround(x, 203 | config["aesrounds"])] 204 | 205 | start_index = non_linear_rounds.index(config["attackerstart"]) 206 | 207 | # Count conditions on MixColumns 208 | for aes_state in range(config["aesstates"]): 209 | for fwd_rnd in non_linear_rounds[start_index:]: 210 | for col in range(state_dim): 211 | indices = [] 212 | for row in range(state_dim): 213 | indices.append((fwd_rnd + 1) * state_dim * state_dim + 214 | col*state_dim + row) 215 | model = addMCcostsfromindices(model, config, var_x, var_mccosts, 216 | var_mcactive, aes_state, fwd_rnd, 217 | col, indices) 218 | 219 | 220 | for bck_rnd in non_linear_rounds[:start_index]: 221 | for col in range(state_dim): 222 | indices = [] 223 | for row in range(state_dim): 224 | tmp_index = ((state_dim * col + row * (state_dim + 1)) % 225 | (state_dim * state_dim)) 226 | indices.append(bck_rnd * state_dim * state_dim + tmp_index) 227 | model = addMCcostsfromindices(model, config, var_x, var_mccosts, 228 | var_mcactive, aes_state, bck_rnd, 229 | col, indices) 230 | 231 | # Find costs for controlled and uncontrolled rounds 232 | assert config["attackerstart"] in non_linear_rounds 233 | match_index = non_linear_rounds.index(config["attackerstart"]) 234 | 235 | dof_interval_from = max(match_index - config["attackerpower"], 0) 236 | dof_interval_to = min(match_index + config["attackerpower"], num_states) 
237 | 238 | active_rounds_dof = non_linear_rounds[dof_interval_from:dof_interval_to] 239 | active_rounds_nodof = list(set(non_linear_rounds) - set(active_rounds_dof)) 240 | 241 | mc_indices = [] 242 | mc_indices_nodof = [] 243 | 244 | for i in range(state_dim): 245 | for itrnd in active_rounds_dof: 246 | mc_indices.append(state_dim*itrnd + i) 247 | for itrnd in active_rounds_nodof: 248 | mc_indices_nodof.append(state_dim*itrnd + i) 249 | 250 | model.addConstr(quicksum(var_mccosts[j][i] for j in range(config["aesstates"]) 251 | for i in mc_indices) - costs_mc == 0, "MixColumns Costs Reducable") 252 | model.addConstr(quicksum(var_mccosts[j][i] for j in range(config["aesstates"]) 253 | for i in mc_indices_nodof) - costs_mc_nodof == 0, "MixColumns Costs") 254 | 255 | return model 256 | 257 | def addMCcostsfromindices(model, config, var_x, var_mccosts, var_mcactive, 258 | aes_state, rnd, col, indices): 259 | """ 260 | Add the MixColumns costs given the indices 261 | """ 262 | state_column = [var_x[aes_state][i] for i in indices] 263 | column_idx = rnd * config["statedimension"] + col 264 | # Mark as active MixColumns 265 | model.addConstr(quicksum(state_column) <= config["statedimension"] * 266 | var_mcactive[aes_state][column_idx], 267 | "MixColumns is active") 268 | # Costs for MixColumn transition 269 | model.addConstr((config["statedimension"] - quicksum(state_column)) * 270 | config["wordsize"] * 271 | var_mcactive[aes_state][column_idx] == 272 | var_mccosts[aes_state][column_idx], 273 | "MixColumns costs") 274 | return model 275 | 276 | def addcolltruncoutput512(model, config, var_x): 277 | """ 278 | Add constrains that the trail must lead to a collision after truncation. 
279 | """ 280 | assert(config["aesstates"] == 4) 281 | 282 | # haraka Truncation 283 | num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1 284 | state_dim = config["statedimension"] 285 | 286 | hashoutput = [] 287 | for aes_state in [0, 3]: 288 | for word in range(2 * state_dim, state_dim * state_dim): 289 | hashoutput.append(var_x[aes_state][word]) 290 | model.addConstr(var_x[aes_state][word] == 291 | var_x[aes_state][word + (num_states - 1) * 292 | state_dim * state_dim], "collision") 293 | for aes_state in [1, 2]: 294 | for word in range(2 * state_dim): 295 | hashoutput.append(var_x[aes_state][word]) 296 | model.addConstr(var_x[aes_state][word] == 297 | var_x[aes_state][word + (num_states - 1) * 298 | state_dim * state_dim], "collision") 299 | 300 | if config["securitymodel"] == "truncated": 301 | costs_collision = model.getVarByName("CollisionCosts") 302 | model.addConstr(costs_collision - quicksum(hashoutput) * 303 | config["wordsize"] == 0, "inputdiff = outputdiff") 304 | 305 | return model 306 | 307 | def addcollisionconstraints(model, config, var_x): 308 | """ 309 | Add constraints that the trail must lead to a collision. 310 | """ 311 | num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1 312 | state_dim = config["statedimension"] 313 | 314 | for aes_state in range(config["aesstates"]): 315 | for word in range(state_dim * state_dim): 316 | model.addConstr(var_x[aes_state][word] == 317 | var_x[aes_state][word + (num_states - 1) * 318 | state_dim * state_dim], "collision") 319 | 320 | if config["securitymodel"] == "truncated": 321 | costs_collision = model.getVarByName("CollisionCosts") 322 | model.addConstr(costs_collision - 323 | quicksum(var_x[i][j] for i in range(config["aesstates"]) 324 | for j in range(state_dim * state_dim)) * 325 | config["wordsize"] == 0, "inputdiff = outputdiff") 326 | 327 | return model 328 | 329 | def addmixconstraints512(model, config, var_x, current_round): 330 | """ 331 | Adds the mix layer. 
Note that this layer is only defined if there 332 | are exactly four AES states. 333 | """ 334 | assert(config["aesstates"] == 4) 335 | 336 | # Columnwise permutation 337 | permutation = [3, 11, 7, 15, 338 | 8, 0, 12, 4, 339 | 9, 1, 13, 5, 340 | 2, 10, 6, 14] 341 | 342 | state_dim = config["statedimension"] 343 | words_state = state_dim * state_dim 344 | start_index = words_state * (config["aesrounds"] + current_round * 345 | config["aesrounds"] + current_round) 346 | 347 | next_index = 0 348 | 349 | for idx, col in enumerate(permutation): 350 | old_col_start = start_index + (col % state_dim) * state_dim 351 | new_col_start = start_index + (idx % state_dim) * state_dim + words_state 352 | for word in range(state_dim): 353 | model.addConstr(var_x[col // 4][old_col_start + word] == 354 | var_x[idx // 4][new_col_start + word], "mix") 355 | 356 | return model 357 | 358 | def addmixconstraints256(model, config, var_x, current_round): 359 | """ 360 | Adds the mix layer. Note that this layer is only defined if there 361 | are exactly two AES states. 362 | """ 363 | assert(config["aesstates"] == 2) 364 | 365 | # Columnwise permutation 366 | permutation = [0, 4, 1, 5, 367 | 2, 6, 3, 7] 368 | 369 | state_dim = config["statedimension"] 370 | words_state = state_dim * state_dim 371 | start_index = words_state * (config["aesrounds"] + current_round * 372 | config["aesrounds"] + current_round) 373 | 374 | next_index = 0 375 | 376 | for idx, col in enumerate(permutation): 377 | old_col_start = start_index + (col % state_dim) * state_dim 378 | new_col_start = start_index + (idx % state_dim) * state_dim + words_state 379 | for word in range(state_dim): 380 | model.addConstr(var_x[col // 4][old_col_start + word] == 381 | var_x[idx // 4][new_col_start + word], "mix") 382 | 383 | return model 384 | 385 | def addactivesboxconstraints(model, config, var_x, activesboxes): 386 | """ 387 | Adds constraints for counting the number of active S-boxes. 
388 | """ 389 | sbox_indices = [] 390 | num_states = (config["aesrounds"] + 1) * config["rounds"] 391 | state_size = config["statedimension"] * config["statedimension"] 392 | for rnd in filter(lambda x: isAESround(x, config["aesrounds"]), 393 | range(0, num_states)): 394 | words_state = config["statedimension"] * config["statedimension"] 395 | rnd_offset = rnd * words_state 396 | sbox_indices += [rnd_offset + word for word in range(words_state)] 397 | 398 | sboxes = [] 399 | 400 | if config["aesstates"] == 4: 401 | # Remove S-boxes which are truncated 402 | trunc_indices = [0, 1, 5, 6, 10, 11, 12, 15] 403 | trunc_indices_2 = [2, 3, 4, 7, 8, 9, 13, 14] 404 | 405 | tmp_sbox_indices = [i for i in sbox_indices] 406 | for idx in trunc_indices: 407 | tmp_sbox_indices.remove(idx + (num_states - 2) * state_size) 408 | for idx in tmp_sbox_indices: 409 | sboxes.append(var_x[0][idx]) 410 | sboxes.append(var_x[2][idx]) 411 | 412 | tmp_sbox_indices = [i for i in sbox_indices] 413 | for idx in trunc_indices_2: 414 | tmp_sbox_indices.remove(idx + (num_states - 2) * state_size) 415 | for idx in tmp_sbox_indices: 416 | sboxes.append(var_x[1][idx]) 417 | sboxes.append(var_x[3][idx]) 418 | else: 419 | sboxes = [var_x[aes_state][i] for aes_state in range(config["aesstates"]) 420 | for i in sbox_indices] 421 | 422 | model.addConstr(quicksum(sboxes) - activesboxes == 0, 423 | "Count Active S-boxes") 424 | 425 | return model 426 | -------------------------------------------------------------------------------- /analysis/models/milpconstraints.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constraints for AES-like round functions. 3 | """ 4 | 5 | from gurobipy import * 6 | 7 | def addAESrndconstraints(gurobi_model, state_dim, var_x, var_d, branch_number, 8 | rounds): 9 | """ 10 | Adds constraints from MixColumns/ShiftRows for given branch number. 
11 | """ 12 | state = [[x*state_dim + y for x in range(state_dim)] 13 | for y in range(state_dim)] 14 | 15 | next_index = state_dim * state_dim 16 | dummy = 0 17 | 18 | var_mcintmp = [] 19 | var_mcouttmp = [] 20 | for col in range(rounds * state_dim): 21 | var_mcintmp.append(gurobi_model.addVar(vtype=GRB.BINARY, 22 | name="isMCactiveIn[{}]".format(col))) 23 | var_mcouttmp.append(gurobi_model.addVar(vtype=GRB.BINARY, 24 | name="isMCactiveOut[{}]".format(col))) 25 | gurobi_model.update() 26 | 27 | for rnd in range(rounds): 28 | #Shiftrows 29 | tmp = [0 for x in range(state_dim)] 30 | for i in range(1, state_dim): 31 | for j in range(state_dim): 32 | tmp[j] = state[i][(j + i) % state_dim] 33 | for j in range(state_dim): 34 | state[i][j] = tmp[j] 35 | #MixColumns 36 | for j in range(state_dim): 37 | tmp_before = [] 38 | tmp_after = [] 39 | for i in range(state_dim): 40 | tmp_before.append(state[i][j]) 41 | for i in range(state_dim - 1): 42 | tmp_after.append(next_index + i) 43 | tmp_after.append(next_index + (state_dim - 1)) 44 | #Limit for branch number 45 | gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before + 46 | tmp_after) - (branch_number) * 47 | var_d[dummy] >= 0, "MC{}{}".format(rnd, j)) 48 | 49 | #Force both sides to be either zero or non-zero 50 | gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before) >= 51 | var_mcintmp[rnd*state_dim + j], "MCactivein") 52 | gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_after) >= 53 | var_mcouttmp[rnd*state_dim + j], "MCactiveout") 54 | gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before + tmp_after) <= 55 | var_mcintmp[rnd*state_dim + j] * 56 | var_mcouttmp[rnd*state_dim + j] * 2 * 57 | state_dim, "MCValid{}{}".format(rnd, j)) 58 | 59 | for i in range(state_dim): 60 | gurobi_model.addConstr(var_d[dummy] - var_x[state[i][j]] >= 0, 61 | "MCt{}{}{}".format(rnd, j, i)) 62 | for i in range(state_dim): 63 | state[i][j] = next_index 64 | next_index += 1 65 | gurobi_model.addConstr(var_d[dummy] - 
var_x[state[i][j]] >= 0, 66 | "MCt{}{}{}".format(rnd, j, i)) 67 | dummy += 1 68 | return gurobi_model 69 | -------------------------------------------------------------------------------- /code/c/aesni_optimized/Makefile: -------------------------------------------------------------------------------- 1 | C=gcc-6 2 | RM=rm -f 3 | CFLAGS=-g -O3 -march=native -funroll-all-loops -fomit-frame-pointer 4 | SRCS=main.c haraka.c 5 | OBJS=$(subst .c,.o,$(SRCS)) 6 | OUTFILE="haraka" 7 | 8 | all: haraka 9 | 10 | haraka: $(OBJS) 11 | $(C) -o $(OUTFILE) $(OBJS) $(LDLIBS) 12 | 13 | depend: .depend 14 | 15 | .depend: $(SRCS) 16 | rm -f ./.depend 17 | $(C) $(CFLAGS) -MM $^>>./.depend; 18 | 19 | clean: 20 | $(RM) $(OBJS) 21 | 22 | dist-clean: clean 23 | $(RM) *~ .depend 24 | 25 | include .depend 26 | -------------------------------------------------------------------------------- /code/c/aesni_optimized/haraka.c: -------------------------------------------------------------------------------- 1 | #include "haraka.h" 2 | #include 3 | 4 | void load_constants() { 5 | rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d); 6 | rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717); 7 | rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114); 8 | rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79); 9 | rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044); 10 | rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b); 11 | rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b); 12 | rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b); 13 | rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee); 14 | rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33); 15 | rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800); 16 | rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a); 17 | rc[12] = 
_mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4); 18 | rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee); 19 | rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6); 20 | rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec); 21 | rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173); 22 | rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b); 23 | rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6); 24 | rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a); 25 | rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4); 26 | rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d); 27 | rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1); 28 | rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d); 29 | rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e); 30 | rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899); 31 | rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c); 32 | rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9); 33 | rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d); 34 | rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1); 35 | rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9); 36 | rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350); 37 | rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39); 38 | rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442); 39 | rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6); 40 | rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde); 41 | rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978); 42 | rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235); 43 | rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf); 44 | rc[39] = 
_mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1); 45 | } 46 | 47 | void test_implementations() { 48 | unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char)); 49 | unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char)); 50 | unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char)); 51 | unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b, 52 | 0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c, 53 | 0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b, 54 | 0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c}; 55 | 56 | unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98, 57 | 0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62, 58 | 0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34, 59 | 0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa}; 60 | 61 | 62 | 63 | int i; 64 | 65 | // Input for testvector 66 | for(i = 0; i < 512; i++) { 67 | in[i] = i % 64; 68 | } 69 | 70 | load_constants(); 71 | haraka512_8x(out512, in); 72 | 73 | // Verify output 74 | for(i = 0; i < 32; i++) { 75 | if (out512[i % 32] != testvector512[i]) { 76 | printf("Error: testvector incorrect.\n"); 77 | return; 78 | } 79 | } 80 | 81 | free(in); 82 | free(out256); 83 | free(out512); 84 | } 85 | 86 | void haraka256(unsigned char *out, const unsigned char *in) { 87 | __m128i s[2], tmp; 88 | 89 | s[0] = LOAD(in); 90 | s[1] = LOAD(in + 16); 91 | 92 | AES2(s[0], s[1], 0); 93 | MIX2(s[0], s[1]); 94 | 95 | AES2(s[0], s[1], 4); 96 | MIX2(s[0], s[1]); 97 | 98 | AES2(s[0], s[1], 8); 99 | MIX2(s[0], s[1]); 100 | 101 | AES2(s[0], s[1], 12); 102 | MIX2(s[0], s[1]); 103 | 104 | AES2(s[0], s[1], 16); 105 | MIX2(s[0], s[1]); 106 | 107 | s[0] = _mm_xor_si128(s[0], LOAD(in)); 108 | s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); 109 | 110 | STORE(out, s[0]); 111 | STORE(out + 16, s[1]); 112 | } 113 | 114 | void haraka256_4x(unsigned char *out, const unsigned char *in) { 115 | __m128i s[4][2], tmp; 116 | 117 | s[0][0] = 
LOAD(in); 118 | s[0][1] = LOAD(in + 16); 119 | s[1][0] = LOAD(in + 32); 120 | s[1][1] = LOAD(in + 48); 121 | s[2][0] = LOAD(in + 64); 122 | s[2][1] = LOAD(in + 80); 123 | s[3][0] = LOAD(in + 96); 124 | s[3][1] = LOAD(in + 112); 125 | 126 | // Round 1 127 | AES2_4x(s[0], s[1], s[2], s[3], 0); 128 | 129 | MIX2(s[0][0], s[0][1]); 130 | MIX2(s[1][0], s[1][1]); 131 | MIX2(s[2][0], s[2][1]); 132 | MIX2(s[3][0], s[3][1]); 133 | 134 | // Round 2 135 | AES2_4x(s[0], s[1], s[2], s[3], 4); 136 | 137 | MIX2(s[0][0], s[0][1]); 138 | MIX2(s[1][0], s[1][1]); 139 | MIX2(s[2][0], s[2][1]); 140 | MIX2(s[3][0], s[3][1]); 141 | 142 | // Round 3 143 | AES2_4x(s[0], s[1], s[2], s[3], 8); 144 | 145 | MIX2(s[0][0], s[0][1]); 146 | MIX2(s[1][0], s[1][1]); 147 | MIX2(s[2][0], s[2][1]); 148 | MIX2(s[3][0], s[3][1]); 149 | 150 | // Round 4 151 | AES2_4x(s[0], s[1], s[2], s[3], 12); 152 | 153 | MIX2(s[0][0], s[0][1]); 154 | MIX2(s[1][0], s[1][1]); 155 | MIX2(s[2][0], s[2][1]); 156 | MIX2(s[3][0], s[3][1]); 157 | 158 | // Round 5 159 | AES2_4x(s[0], s[1], s[2], s[3], 16); 160 | 161 | MIX2(s[0][0], s[0][1]); 162 | MIX2(s[1][0], s[1][1]); 163 | MIX2(s[2][0], s[2][1]); 164 | MIX2(s[3][0], s[3][1]); 165 | 166 | // Feed Forward 167 | s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 168 | s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 169 | s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32)); 170 | s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48)); 171 | s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64)); 172 | s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80)); 173 | s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96)); 174 | s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112)); 175 | 176 | STORE(out, s[0][0]); 177 | STORE(out + 16, s[0][1]); 178 | STORE(out + 32, s[1][0]); 179 | STORE(out + 48, s[1][1]); 180 | STORE(out + 64, s[2][0]); 181 | STORE(out + 80, s[2][1]); 182 | STORE(out + 96, s[3][0]); 183 | STORE(out + 112, s[3][1]); 184 | } 185 | 186 | void haraka256_8x(unsigned char *out, const unsigned char *in) { 187 | 
// This is faster on Skylake, the code below is faster on Haswell. 188 | haraka256_4x(out, in); 189 | haraka256_4x(out + 128, in + 128); 190 | return; 191 | // __m128i s[8][2], tmp; 192 | // 193 | // int i; 194 | // 195 | // s[0][0] = LOAD(in); 196 | // s[0][1] = LOAD(in + 16); 197 | // s[1][0] = LOAD(in + 32); 198 | // s[1][1] = LOAD(in + 48); 199 | // s[2][0] = LOAD(in + 64); 200 | // s[2][1] = LOAD(in + 80); 201 | // s[3][0] = LOAD(in + 96); 202 | // s[3][1] = LOAD(in + 112); 203 | // s[4][0] = LOAD(in + 128); 204 | // s[4][1] = LOAD(in + 144); 205 | // s[5][0] = LOAD(in + 160); 206 | // s[5][1] = LOAD(in + 176); 207 | // s[6][0] = LOAD(in + 192); 208 | // s[6][1] = LOAD(in + 208); 209 | // s[7][0] = LOAD(in + 224); 210 | // s[7][1] = LOAD(in + 240); 211 | // 212 | // // Round 1 213 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0); 214 | // 215 | // MIX2(s[0][0], s[0][1]); 216 | // MIX2(s[1][0], s[1][1]); 217 | // MIX2(s[2][0], s[2][1]); 218 | // MIX2(s[3][0], s[3][1]); 219 | // MIX2(s[4][0], s[4][1]); 220 | // MIX2(s[5][0], s[5][1]); 221 | // MIX2(s[6][0], s[6][1]); 222 | // MIX2(s[7][0], s[7][1]); 223 | // 224 | // 225 | // // Round 2 226 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4); 227 | // 228 | // MIX2(s[0][0], s[0][1]); 229 | // MIX2(s[1][0], s[1][1]); 230 | // MIX2(s[2][0], s[2][1]); 231 | // MIX2(s[3][0], s[3][1]); 232 | // MIX2(s[4][0], s[4][1]); 233 | // MIX2(s[5][0], s[5][1]); 234 | // MIX2(s[6][0], s[6][1]); 235 | // MIX2(s[7][0], s[7][1]); 236 | // 237 | // // Round 3 238 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8); 239 | // 240 | // MIX2(s[0][0], s[0][1]); 241 | // MIX2(s[1][0], s[1][1]); 242 | // MIX2(s[2][0], s[2][1]); 243 | // MIX2(s[3][0], s[3][1]); 244 | // MIX2(s[4][0], s[4][1]); 245 | // MIX2(s[5][0], s[5][1]); 246 | // MIX2(s[6][0], s[6][1]); 247 | // MIX2(s[7][0], s[7][1]); 248 | // 249 | // // Round 4 250 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 12); 251 | // 252 | 
// MIX2(s[0][0], s[0][1]); 253 | // MIX2(s[1][0], s[1][1]); 254 | // MIX2(s[2][0], s[2][1]); 255 | // MIX2(s[3][0], s[3][1]); 256 | // MIX2(s[4][0], s[4][1]); 257 | // MIX2(s[5][0], s[5][1]); 258 | // MIX2(s[6][0], s[6][1]); 259 | // MIX2(s[7][0], s[7][1]); 260 | // 261 | // // Round 5 262 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16); 263 | // 264 | // MIX2(s[0][0], s[0][1]); 265 | // MIX2(s[1][0], s[1][1]); 266 | // MIX2(s[2][0], s[2][1]); 267 | // MIX2(s[3][0], s[3][1]); 268 | // MIX2(s[4][0], s[4][1]); 269 | // MIX2(s[5][0], s[5][1]); 270 | // MIX2(s[6][0], s[6][1]); 271 | // MIX2(s[7][0], s[7][1]); 272 | // 273 | // // Feed Forward 274 | // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 275 | // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 276 | // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32)); 277 | // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48)); 278 | // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64)); 279 | // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80)); 280 | // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96)); 281 | // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112)); 282 | // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 128)); 283 | // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 144)); 284 | // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 160)); 285 | // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 176)); 286 | // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 192)); 287 | // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 208)); 288 | // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 224)); 289 | // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 240)); 290 | // 291 | // STORE(out, s[0][0]); 292 | // STORE(out + 16, s[0][1]); 293 | // STORE(out + 32, s[1][0]); 294 | // STORE(out + 48, s[1][1]); 295 | // STORE(out + 64, s[2][0]); 296 | // STORE(out + 80, s[2][1]); 297 | // STORE(out + 96, s[3][0]); 298 | // STORE(out + 112, s[3][1]); 299 | // STORE(out + 128, s[4][0]); 300 | // STORE(out + 144, s[4][1]); 301 | // STORE(out + 160, s[5][0]); 302 | // 
STORE(out + 176, s[5][1]); 303 | // STORE(out + 192, s[6][0]); 304 | // STORE(out + 208, s[6][1]); 305 | // STORE(out + 224, s[7][0]); 306 | // STORE(out + 240, s[7][1]); 307 | } 308 | 309 | void haraka512(unsigned char *out, const unsigned char *in) { 310 | u128 s[4], tmp; 311 | 312 | s[0] = LOAD(in); 313 | s[1] = LOAD(in + 16); 314 | s[2] = LOAD(in + 32); 315 | s[3] = LOAD(in + 48); 316 | 317 | AES4(s[0], s[1], s[2], s[3], 0); 318 | MIX4(s[0], s[1], s[2], s[3]); 319 | 320 | AES4(s[0], s[1], s[2], s[3], 8); 321 | MIX4(s[0], s[1], s[2], s[3]); 322 | 323 | AES4(s[0], s[1], s[2], s[3], 16); 324 | MIX4(s[0], s[1], s[2], s[3]); 325 | 326 | AES4(s[0], s[1], s[2], s[3], 24); 327 | MIX4(s[0], s[1], s[2], s[3]); 328 | 329 | AES4(s[0], s[1], s[2], s[3], 32); 330 | MIX4(s[0], s[1], s[2], s[3]); 331 | 332 | s[0] = _mm_xor_si128(s[0], LOAD(in)); 333 | s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); 334 | s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); 335 | s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); 336 | 337 | TRUNCSTORE(out, s[0], s[1], s[2], s[3]); 338 | } 339 | 340 | void haraka512_4x(unsigned char *out, const unsigned char *in) { 341 | u128 s[4][4], tmp; 342 | 343 | s[0][0] = LOAD(in); 344 | s[0][1] = LOAD(in + 16); 345 | s[0][2] = LOAD(in + 32); 346 | s[0][3] = LOAD(in + 48); 347 | s[1][0] = LOAD(in + 64); 348 | s[1][1] = LOAD(in + 80); 349 | s[1][2] = LOAD(in + 96); 350 | s[1][3] = LOAD(in + 112); 351 | s[2][0] = LOAD(in + 128); 352 | s[2][1] = LOAD(in + 144); 353 | s[2][2] = LOAD(in + 160); 354 | s[2][3] = LOAD(in + 176); 355 | s[3][0] = LOAD(in + 192); 356 | s[3][1] = LOAD(in + 208); 357 | s[3][2] = LOAD(in + 224); 358 | s[3][3] = LOAD(in + 240); 359 | 360 | AES4_4x(s[0], s[1], s[2], s[3], 0); 361 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 362 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 363 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 364 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 365 | 366 | AES4_4x(s[0], s[1], s[2], s[3], 8); 367 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 
368 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 369 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 370 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 371 | 372 | AES4_4x(s[0], s[1], s[2], s[3], 16); 373 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 374 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 375 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 376 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 377 | 378 | AES4_4x(s[0], s[1], s[2], s[3], 24); 379 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 380 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 381 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 382 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 383 | 384 | AES4_4x(s[0], s[1], s[2], s[3], 32); 385 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 386 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 387 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 388 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 389 | 390 | 391 | s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 392 | s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 393 | s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32)); 394 | s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48)); 395 | s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64)); 396 | s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80)); 397 | s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96)); 398 | s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112)); 399 | s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128)); 400 | s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144)); 401 | s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160)); 402 | s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176)); 403 | s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192)); 404 | s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208)); 405 | s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224)); 406 | s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240)); 407 | 408 | TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 409 | TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]); 410 | TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]); 411 | TRUNCSTORE(out + 96, s[3][0], 
s[3][1], s[3][2], s[3][3]); 412 | } 413 | 414 | void haraka512_8x(unsigned char *out, const unsigned char *in) { 415 | // This is faster on Skylake, the code below is faster on Haswell. 416 | haraka512_4x(out, in); 417 | haraka512_4x(out + 128, in + 256); 418 | 419 | // u128 s[8][4], tmp; 420 | // 421 | // s[0][0] = LOAD(in); 422 | // s[0][1] = LOAD(in + 16); 423 | // s[0][2] = LOAD(in + 32); 424 | // s[0][3] = LOAD(in + 48); 425 | // s[1][0] = LOAD(in + 64); 426 | // s[1][1] = LOAD(in + 80); 427 | // s[1][2] = LOAD(in + 96); 428 | // s[1][3] = LOAD(in + 112); 429 | // s[2][0] = LOAD(in + 128); 430 | // s[2][1] = LOAD(in + 144); 431 | // s[2][2] = LOAD(in + 160); 432 | // s[2][3] = LOAD(in + 176); 433 | // s[3][0] = LOAD(in + 192); 434 | // s[3][1] = LOAD(in + 208); 435 | // s[3][2] = LOAD(in + 224); 436 | // s[3][3] = LOAD(in + 240); 437 | // s[4][0] = LOAD(in + 256); 438 | // s[4][1] = LOAD(in + 272); 439 | // s[4][2] = LOAD(in + 288); 440 | // s[4][3] = LOAD(in + 304); 441 | // s[5][0] = LOAD(in + 320); 442 | // s[5][1] = LOAD(in + 336); 443 | // s[5][2] = LOAD(in + 352); 444 | // s[5][3] = LOAD(in + 368); 445 | // s[6][0] = LOAD(in + 384); 446 | // s[6][1] = LOAD(in + 400); 447 | // s[6][2] = LOAD(in + 416); 448 | // s[6][3] = LOAD(in + 432); 449 | // s[7][0] = LOAD(in + 448); 450 | // s[7][1] = LOAD(in + 464); 451 | // s[7][2] = LOAD(in + 480); 452 | // s[7][3] = LOAD(in + 496); 453 | // 454 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0); 455 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 456 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 457 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 458 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 459 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 460 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 461 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 462 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 463 | // 464 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8); 465 | // MIX4(s[0][0], 
s[0][1], s[0][2], s[0][3]); 466 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 467 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 468 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 469 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 470 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 471 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 472 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 473 | // 474 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16); 475 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 476 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 477 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 478 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 479 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 480 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 481 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 482 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 483 | // 484 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 24); 485 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 486 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 487 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 488 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 489 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 490 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 491 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 492 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 493 | // 494 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 32); 495 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 496 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 497 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 498 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 499 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 500 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 501 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 502 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 503 | // 504 | // 505 | // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 506 | // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 507 | // s[0][2] = 
_mm_xor_si128(s[0][2], LOAD(in + 32)); 508 | // s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48)); 509 | // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64)); 510 | // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80)); 511 | // s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96)); 512 | // s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112)); 513 | // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128)); 514 | // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144)); 515 | // s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160)); 516 | // s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176)); 517 | // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192)); 518 | // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208)); 519 | // s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224)); 520 | // s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240)); 521 | // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 256)); 522 | // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 272)); 523 | // s[4][2] = _mm_xor_si128(s[4][2], LOAD(in + 288)); 524 | // s[4][3] = _mm_xor_si128(s[4][3], LOAD(in + 304)); 525 | // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 320)); 526 | // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 336)); 527 | // s[5][2] = _mm_xor_si128(s[5][2], LOAD(in + 352)); 528 | // s[5][3] = _mm_xor_si128(s[5][3], LOAD(in + 368)); 529 | // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 384)); 530 | // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 400)); 531 | // s[6][2] = _mm_xor_si128(s[6][2], LOAD(in + 416)); 532 | // s[6][3] = _mm_xor_si128(s[6][3], LOAD(in + 432)); 533 | // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 448)); 534 | // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 464)); 535 | // s[7][2] = _mm_xor_si128(s[7][2], LOAD(in + 480)); 536 | // s[7][3] = _mm_xor_si128(s[7][3], LOAD(in + 496)); 537 | // 538 | // TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 539 | // TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]); 540 | // TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]); 541 | // TRUNCSTORE(out + 96, s[3][0], s[3][1], 
s[3][2], s[3][3]); 542 | // TRUNCSTORE(out + 128, s[4][0], s[4][1], s[4][2], s[4][3]); 543 | // TRUNCSTORE(out + 160, s[5][0], s[5][1], s[5][2], s[5][3]); 544 | // TRUNCSTORE(out + 192, s[6][0], s[6][1], s[6][2], s[6][3]); 545 | // TRUNCSTORE(out + 224, s[7][0], s[7][1], s[7][2], s[7][3]); 546 | } 547 | -------------------------------------------------------------------------------- /code/c/aesni_optimized/haraka.h: -------------------------------------------------------------------------------- 1 | /* 2 | Optimized Implementations for Haraka256 and Haraka512 3 | */ 4 | #ifndef HARAKA_H_ 5 | #define HARAKA_H_ 6 | 7 | #include "immintrin.h" 8 | 9 | #define NUMROUNDS 5 10 | 11 | #define u64 unsigned long 12 | #define u128 __m128i 13 | 14 | u128 rc[40]; 15 | 16 | #define LOAD(src) _mm_load_si128((u128 *)(src)) 17 | #define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src) 18 | 19 | #define AES2(s0, s1, rci) \ 20 | s0 = _mm_aesenc_si128(s0, rc[rci]); \ 21 | s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ 22 | s0 = _mm_aesenc_si128(s0, rc[rci + 2]); \ 23 | s1 = _mm_aesenc_si128(s1, rc[rci + 3]); 24 | 25 | #define AES2_4x(s0, s1, s2, s3, rci) \ 26 | AES2(s0[0], s0[1], rci); \ 27 | AES2(s1[0], s1[1], rci); \ 28 | AES2(s2[0], s2[1], rci); \ 29 | AES2(s3[0], s3[1], rci); 30 | 31 | #define AES2_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \ 32 | AES2_4x(s0, s1, s2, s3, rci); \ 33 | AES2_4x(s4, s5, s6, s7, rci); 34 | 35 | #define AES4(s0, s1, s2, s3, rci) \ 36 | s0 = _mm_aesenc_si128(s0, rc[rci]); \ 37 | s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ 38 | s2 = _mm_aesenc_si128(s2, rc[rci + 2]); \ 39 | s3 = _mm_aesenc_si128(s3, rc[rci + 3]); \ 40 | s0 = _mm_aesenc_si128(s0, rc[rci + 4]); \ 41 | s1 = _mm_aesenc_si128(s1, rc[rci + 5]); \ 42 | s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \ 43 | s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \ 44 | 45 | #define AES4_4x(s0, s1, s2, s3, rci) \ 46 | AES4(s0[0], s0[1], s0[2], s0[3], rci); \ 47 | AES4(s1[0], s1[1], s1[2], s1[3], rci); \ 48 | AES4(s2[0], s2[1], 
s2[2], s2[3], rci); \ 49 | AES4(s3[0], s3[1], s3[2], s3[3], rci); 50 | 51 | #define AES4_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \ 52 | AES4_4x(s0, s1, s2, s3, rci); \ 53 | AES4_4x(s4, s5, s6, s7, rci); 54 | 55 | #define MIX2(s0, s1) \ 56 | tmp = _mm_unpacklo_epi32(s0, s1); \ 57 | s1 = _mm_unpackhi_epi32(s0, s1); \ 58 | s0 = tmp; 59 | 60 | #define MIX4(s0, s1, s2, s3) \ 61 | tmp = _mm_unpacklo_epi32(s0, s1); \ 62 | s0 = _mm_unpackhi_epi32(s0, s1); \ 63 | s1 = _mm_unpacklo_epi32(s2, s3); \ 64 | s2 = _mm_unpackhi_epi32(s2, s3); \ 65 | s3 = _mm_unpacklo_epi32(s0, s2); \ 66 | s0 = _mm_unpackhi_epi32(s0, s2); \ 67 | s2 = _mm_unpackhi_epi32(s1, tmp); \ 68 | s1 = _mm_unpacklo_epi32(s1, tmp); 69 | 70 | #define TRUNCSTORE(out, s0, s1, s2, s3) \ 71 | *(u64*)(out) = (u64*)(s0)[1]; \ 72 | *(u64*)(out + 8) = (u64*)(s1)[1]; \ 73 | *(u64*)(out + 16) = (u64*)(s2)[0]; \ 74 | *(u64*)(out + 24) = (u64*)(s3)[0]; 75 | 76 | void load_constants(); 77 | void test_implementations(); 78 | 79 | void haraka256(unsigned char *out, const unsigned char *in); 80 | void haraka256_4x(unsigned char *out, const unsigned char *in); 81 | void haraka256_8x(unsigned char *out, const unsigned char *in); 82 | 83 | void haraka512(unsigned char *out, const unsigned char *in); 84 | void haraka512_4x(unsigned char *out, const unsigned char *in); 85 | void haraka512_8x(unsigned char *out, const unsigned char *in); 86 | 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /code/c/aesni_optimized/main.c: -------------------------------------------------------------------------------- 1 | /* 2 | Timing code for optimized implementation of Haraka. 3 | */ 4 | 5 | #include "stdio.h" 6 | #include "stdlib.h" 7 | #include 8 | #include "haraka.h" 9 | #include "timing.h" 10 | 11 | typedef void (*hash_function)(unsigned char*, const unsigned char*); 12 | 13 | // Measures how many cycles func requires to process a random input. 
14 | double timeit(hash_function func, int inlen, int outlen) { 15 | unsigned char *in, *out; 16 | unsigned long long timer = 0; 17 | double timings[NUM_TIMINGS]; 18 | 19 | int i, j; 20 | 21 | srand(0); 22 | in = malloc(inlen); 23 | out = malloc(outlen); 24 | 25 | load_constants(); 26 | 27 | for (i = -100; i < NUM_TIMINGS; i++) { 28 | //Get random input 29 | for (j = 0; j < inlen; j++) { 30 | in[j] = rand() & 0xff; 31 | } 32 | 33 | timer = startTimer(); 34 | for(j = 0; j < ITERATIONS; j++) { 35 | func(out, in); 36 | } 37 | timer = endTimer() - timer; 38 | 39 | if (i >= 0 && i < NUM_TIMINGS) { 40 | timings[i] = ((double)timer) / inlen / ITERATIONS; 41 | } 42 | } 43 | 44 | //Get Median 45 | qsort(timings, NUM_TIMINGS, sizeof(double), compareDouble); 46 | 47 | free(out); 48 | free(in); 49 | return timings[NUM_TIMINGS / 2]; 50 | } 51 | 52 | int main() { 53 | test_implementations(); 54 | printf("Haraka-256 1x: %f cycles per byte\n", timeit(haraka256, 32, 32)); 55 | printf("Haraka-256 4x: %f cycles per byte\n", timeit(haraka256_4x, 4*32, 4*32)); 56 | printf("Haraka-256 8x: %f cycles per byte\n", timeit(haraka256_8x, 8*32, 8*32)); 57 | 58 | printf("Haraka-512 1x: %f cycles per byte\n", timeit(haraka512, 64, 32)); 59 | printf("Haraka-512 4x: %f cycles per byte\n", timeit(haraka512_4x, 4*64, 4*32)); 60 | printf("Haraka-512 8x: %f cycles per byte\n", timeit(haraka512_8x, 8*64, 8*32)); 61 | } 62 | -------------------------------------------------------------------------------- /code/c/aesni_optimized/timing.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMING_H 2 | #define TIMING_H 3 | 4 | #define NUM_TIMINGS 10000 5 | #define ITERATIONS 1000 6 | 7 | int compareDouble(const void *x, const void *y) 8 | { 9 | double xx = *(double*)x, yy = *(double*)y; 10 | if (xx < yy) return -1; 11 | if (xx > yy) return 1; 12 | return 0; 13 | } 14 | 15 | unsigned long long int startTimer(void) 16 | { 17 | unsigned a, d; 18 | 19 | __asm__ 
volatile("CPUID\n\t" 20 | "RDTSC\n\t" 21 | "mov %%edx, %0\n\t" 22 | "mov %%eax, %1\n\t": "=r" (d), 23 | "=r" (a):: "%rax", "%rbx", "%rcx", "%rdx"); 24 | 25 | return ((unsigned long long)a) | (((unsigned long long)d) << 32);; 26 | } 27 | 28 | unsigned long long int endTimer(void) 29 | { 30 | unsigned a, d; 31 | 32 | __asm__ volatile("RDTSCP\n\t" 33 | "mov %%edx, %0\n\t" 34 | "mov %%eax,%1\n\t" 35 | "CPUID\n\t": "=r" (d), "=r" (a):: 36 | "%rax", "%rbx", "%rcx", "%rdx"); 37 | 38 | return ((unsigned long long)a) | (((unsigned long long)d) << 32);; 39 | } 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /code/c/aesni_ref/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-std=c99 -O3 -Wno-format -march=native -funroll-loops -fomit-frame-pointer 2 | 3 | ifdef MPAR 4 | CFLAGS += -DMPAR=$(MPAR) 5 | endif 6 | 7 | all: 8 | $(CC) $(CFLAGS) helpers.c haraka.c -o haraka 9 | 10 | clean: 11 | rm haraka -------------------------------------------------------------------------------- /code/c/aesni_ref/haraka.c: -------------------------------------------------------------------------------- 1 | #include "wmmintrin.h" 2 | #include "emmintrin.h" 3 | #include "smmintrin.h" 4 | #include "helpers.h" 5 | #include 6 | 7 | #define ROUNDS (5) 8 | #define AES_PER_ROUND (2) 9 | 10 | int haraka512256(unsigned char *hash, const unsigned char *msg) { 11 | // stuff we need 12 | int i, j; 13 | __m128i s[4], tmp, rc[40]; 14 | __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0); 15 | 16 | 17 | // define round constants 18 | rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d); 19 | rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717); 20 | rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114); 21 | rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79); 22 | rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044); 23 | rc[5] = 
_mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b); 24 | rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b); 25 | rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b); 26 | rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee); 27 | rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33); 28 | rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800); 29 | rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a); 30 | rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4); 31 | rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee); 32 | rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6); 33 | rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec); 34 | rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173); 35 | rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b); 36 | rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6); 37 | rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a); 38 | rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4); 39 | rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d); 40 | rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1); 41 | rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d); 42 | rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e); 43 | rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899); 44 | rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c); 45 | rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9); 46 | rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d); 47 | rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1); 48 | rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9); 49 | rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350); 50 | rc[32] = 
_mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39); 51 | rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442); 52 | rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6); 53 | rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde); 54 | rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978); 55 | rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235); 56 | rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf); 57 | rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1); 58 | 59 | // initialize state to msg 60 | s[0] = _mm_load_si128(&((__m128i*)msg)[0]); 61 | s[1] = _mm_load_si128(&((__m128i*)msg)[1]); 62 | s[2] = _mm_load_si128(&((__m128i*)msg)[2]); 63 | s[3] = _mm_load_si128(&((__m128i*)msg)[3]); 64 | 65 | printf("= input state =\n"); 66 | printstate512(s); 67 | 68 | for (i = 0; i < ROUNDS; ++i) { 69 | // aes round(s) 70 | for (j = 0; j < AES_PER_ROUND; ++j) { 71 | s[0] = _mm_aesenc_si128(s[0], rc[4*AES_PER_ROUND*i + 4*j]); 72 | s[1] = _mm_aesenc_si128(s[1], rc[4*AES_PER_ROUND*i + 4*j + 1]); 73 | s[2] = _mm_aesenc_si128(s[2], rc[4*AES_PER_ROUND*i + 4*j + 2]); 74 | s[3] = _mm_aesenc_si128(s[3], rc[4*AES_PER_ROUND*i + 4*j + 3]); 75 | } 76 | 77 | printf("= round %d : after aes layer =\n", i); 78 | printstate512(s); 79 | 80 | // mixing 81 | tmp = _mm_unpacklo_epi32(s[0], s[1]); 82 | s[0] = _mm_unpackhi_epi32(s[0], s[1]); 83 | s[1] = _mm_unpacklo_epi32(s[2], s[3]); 84 | s[2] = _mm_unpackhi_epi32(s[2], s[3]); 85 | s[3] = _mm_unpacklo_epi32(s[0], s[2]); 86 | s[0] = _mm_unpackhi_epi32(s[0], s[2]); 87 | s[2] = _mm_unpackhi_epi32(s[1], tmp); 88 | s[1] = _mm_unpacklo_epi32(s[1], tmp); 89 | 90 | printf("= round %d : after mix layer =\n", i); 91 | printstate512(s); 92 | } 93 | 94 | printf("= output from permutation =\n"); 95 | printstate512(s); 96 | 97 | // xor message to get DM effect 98 | s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); 99 | s[1] = 
_mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); 100 | s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2])); 101 | s[3] = _mm_xor_si128(s[3], _mm_load_si128(&((__m128i*)msg)[3])); 102 | 103 | printf("= after feed-forward =\n"); 104 | printstate512(s); 105 | 106 | // truncate and store result 107 | _mm_maskmoveu_si128(s[0], MSB64, (hash-8)); 108 | _mm_maskmoveu_si128(s[1], MSB64, (hash+0)); 109 | _mm_storel_epi64((__m128i*)(hash + 16), s[2]); 110 | _mm_storel_epi64((__m128i*)(hash + 24), s[3]); 111 | } 112 | 113 | int haraka256256(unsigned char *hash, const unsigned char *msg) { 114 | // stuff we need 115 | int i, j; 116 | __m128i s[2], tmp, rc[20]; 117 | 118 | // define round constants 119 | rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d); 120 | rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717); 121 | rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114); 122 | rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79); 123 | rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044); 124 | rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b); 125 | rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b); 126 | rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b); 127 | rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee); 128 | rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33); 129 | rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800); 130 | rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a); 131 | rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4); 132 | rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee); 133 | rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6); 134 | rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec); 135 | rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173); 136 | rc[17] = 
_mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b); 137 | rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6); 138 | rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a); 139 | 140 | // initialize state to msg 141 | s[0] = _mm_load_si128(&((__m128i*)msg)[0]); 142 | s[1] = _mm_load_si128(&((__m128i*)msg)[1]); 143 | 144 | printf("= input state =\n"); 145 | printstate256(s); 146 | 147 | for (i = 0; i < ROUNDS; ++i) { 148 | // aes round(s) 149 | for (j = 0; j < AES_PER_ROUND; ++j) { 150 | s[0] = _mm_aesenc_si128(s[0], rc[2*AES_PER_ROUND*i + 2*j]); 151 | s[1] = _mm_aesenc_si128(s[1], rc[2*AES_PER_ROUND*i + 2*j + 1]); 152 | } 153 | 154 | printf("= round %d : after aes layer =\n", i); 155 | printstate256(s); 156 | 157 | // mixing 158 | tmp = _mm_unpacklo_epi32(s[0], s[1]); 159 | s[1] = _mm_unpackhi_epi32(s[0], s[1]); 160 | s[0] = tmp; 161 | 162 | printf("= round %d : after mix layer =\n", i); 163 | printstate256(s); 164 | } 165 | 166 | printf("= output from permutation =\n"); 167 | printstate256(s); 168 | 169 | // xor message to get DM effect 170 | s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); 171 | s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); 172 | 173 | printf("= after feed-forward =\n"); 174 | printstate256(s); 175 | 176 | // store result 177 | _mm_storeu_si128((__m128i*)hash, s[0]); 178 | _mm_storeu_si128((__m128i*)(hash + 16), s[1]); 179 | } 180 | 181 | int main() { 182 | // allocate memory for input and digest 183 | unsigned char *msg = (unsigned char *)calloc(64, sizeof(unsigned char)); 184 | unsigned char *digest = (unsigned char *)calloc(32, sizeof(unsigned char)); 185 | int i; 186 | 187 | // set some input bytes 188 | for (i = 0; i < 64; ++i) 189 | msg[i] = i; 190 | 191 | // print input 192 | printf("= input bytes =\n"); 193 | printbytes(msg, 64); printf("\n"); 194 | 195 | // run Haraka-512/256 196 | haraka512256(digest, msg); 197 | 198 | // print output 199 | printf("= haraka-512/256 
// Print `len` bytes starting at `m` as two-digit lowercase hex values
// separated by single spaces and terminated by a newline.
//
//   m    buffer to dump; may be NULL only when there is nothing to print
//   len  number of bytes to print
//
// For an empty dump (len <= 0 or m == NULL) only the newline is written.
// The original read m[len-1] unconditionally, i.e. m[-1] for len == 0.
void printbytes(unsigned char *m, int len) {
  int i;

  if (!m || len <= 0) {
    printf("\n");
    return;
  }

  // all but the last byte are followed by a separator
  for (i = 0; i < len-1; ++i)
    printf("%02x ", m[i]);
  printf("%02x\n", m[len-1]);
}
// Self-test for the 8-way functions: feeds eight identical blocks through
// haraka_f_8x and haraka_h_8x and compares every output lane against the
// expected 32-byte digest. Prints an error for the first mismatching byte and
// is silent on success.
//
// Fixes over the previous version:
//  - all eight 32-byte input blocks are initialised (only the first 32 bytes
//    were set although haraka_f_8x reads 256 bytes);
//  - every byte of every lane is compared (out[i % 32] only ever looked at
//    lane 0, and the haraka_h check stopped after 32 bytes);
//  - the buffers are released on the failure paths as well;
//  - allocation failure is reported instead of dereferencing NULL.
void haraka_testvectors() {
  unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
  unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  // expected digest of the 32-byte input 00 01 .. 1f
  static const unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
                                                  0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
                                                  0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
                                                  0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};

  // expected digest of the 64-byte input 00 01 .. 3f
  static const unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
                                                  0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
                                                  0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
                                                  0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};

  int i;

  if (!in || !out256 || !out512) {
    printf("Error: out of memory in haraka_testvectors.\n");
    goto cleanup;
  }

  // Input for testvector: eight copies of the 32-byte block 00 .. 1f
  for(i = 0; i < 8*32; i++) {
    in[i] = i % 32;
  }

  haraka_f_8x(out256, in);

  // Verify output: each of the eight 32-byte lanes must equal the vector
  for(i = 0; i < 8*32; i++) {
    if (out256[i] != testvector256[i % 32]) {
      printf("Error: testvector incorrect for haraka_f at position %i.\n", i);
      goto cleanup;
    }
  }

  // Input for testvector: eight copies of the 64-byte block 00 .. 3f
  for(i = 0; i < 8*64; i++) {
    in[i] = i % 64;
  }

  haraka_h_8x(out512, in);

  // Verify output: each of the eight 32-byte lanes must equal the vector
  for(i = 0; i < 8*32; i++) {
    if (out512[i] != testvector512[i % 32]) {
      printf("Error: testvector incorrect for haraka_h at position %i.\n", i);
      goto cleanup;
    }
  }

cleanup:
  // free(NULL) is a no-op, so this is safe after a failed allocation too
  free(in);
  free(out256);
  free(out512);
}
// Eight-way variant of haraka_f: treats `in` as eight consecutive 32-byte
// blocks and writes eight consecutive 32-byte results to `out`. The lanes are
// independent; lane k reads in[32k .. 32k+31] and writes out[32k .. 32k+31].
void haraka_f_8x(unsigned char *out, const unsigned char *in) {
  u128 s[8][2], tmp, s_save[8][2];  // tmp is the scratch register MIX2 expects
  int lane, round;

  // Load every lane and keep a copy for the final feed-forward.
  for (lane = 0; lane < 8; ++lane) {
    s[lane][0] = LOAD(in + 32 * lane);
    s[lane][1] = LOAD(in + 32 * lane + 16);
    s_save[lane][0] = s[lane][0];
    s_save[lane][1] = s[lane][1];
  }

  // Five rounds; round r uses rc256[4r .. 4r+3] for its two AES layers.
  for (round = 0; round < 5; ++round) {
    for (lane = 0; lane < 8; ++lane) {
      AES2(s[lane][0], s[lane][1], 4 * round);
    }

    // After the last AES layer the two trailing constants are added
    // explicitly, before the final mix.
    if (round == 4) {
      for (lane = 0; lane < 8; ++lane) {
        s[lane][0] = XOR(s[lane][0], rc256[20]);
        s[lane][1] = XOR(s[lane][1], rc256[21]);
      }
    }

    for (lane = 0; lane < 8; ++lane) {
      MIX2(s[lane][0], s[lane][1]);
    }
  }

  // Feed-forward the saved input and write the result.
  for (lane = 0; lane < 8; ++lane) {
    s[lane][0] = XOR(s[lane][0], s_save[lane][0]);
    s[lane][1] = XOR(s[lane][1], s_save[lane][1]);
    STORE(out + 32 * lane, s[lane][0]);
    STORE(out + 32 * lane + 16, s[lane][1]);
  }
}
s_save[0][1] = s[0][1]; 379 | s_save[0][2] = s[0][2]; 380 | s_save[0][3] = s[0][3]; 381 | s_save[1][0] = s[1][0]; 382 | s_save[1][1] = s[1][1]; 383 | s_save[1][2] = s[1][2]; 384 | s_save[1][3] = s[1][3]; 385 | s_save[2][0] = s[2][0]; 386 | s_save[2][1] = s[2][1]; 387 | s_save[2][2] = s[2][2]; 388 | s_save[2][3] = s[2][3]; 389 | s_save[3][0] = s[3][0]; 390 | s_save[3][1] = s[3][1]; 391 | s_save[3][2] = s[3][2]; 392 | s_save[3][3] = s[3][3]; 393 | 394 | AES4_4x(s[0], s[1], s[2], s[3], 0); 395 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 396 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 397 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 398 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 399 | 400 | AES4_4x(s[0], s[1], s[2], s[3], 8); 401 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 402 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 403 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 404 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 405 | 406 | AES4_4x(s[0], s[1], s[2], s[3], 16); 407 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 408 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 409 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 410 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 411 | 412 | AES4_4x(s[0], s[1], s[2], s[3], 24); 413 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 414 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 415 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 416 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 417 | 418 | AES4_4x(s[0], s[1], s[2], s[3], 32); 419 | s[0][0] = XOR(s[0][0], rc512[40]); 420 | s[0][1] = XOR(s[0][1], rc512[41]); 421 | s[0][2] = XOR(s[0][2], rc512[42]); 422 | s[0][3] = XOR(s[0][3], rc512[43]); 423 | s[1][0] = XOR(s[1][0], rc512[40]); 424 | s[1][1] = XOR(s[1][1], rc512[41]); 425 | s[1][2] = XOR(s[1][2], rc512[42]); 426 | s[1][3] = XOR(s[1][3], rc512[43]); 427 | s[2][0] = XOR(s[2][0], rc512[40]); 428 | s[2][1] = XOR(s[2][1], rc512[41]); 429 | s[2][2] = XOR(s[2][2], rc512[42]); 430 | s[2][3] = XOR(s[2][3], rc512[43]); 431 | s[3][0] = XOR(s[3][0], rc512[40]); 432 | 
s[3][1] = XOR(s[3][1], rc512[41]); 433 | s[3][2] = XOR(s[3][2], rc512[42]); 434 | s[3][3] = XOR(s[3][3], rc512[43]); 435 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 436 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 437 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 438 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 439 | 440 | s[0][0] = XOR(s[0][0], s_save[0][0]); 441 | s[0][1] = XOR(s[0][1], s_save[0][1]); 442 | s[0][2] = XOR(s[0][2], s_save[0][2]); 443 | s[0][3] = XOR(s[0][3], s_save[0][3]); 444 | s[1][0] = XOR(s[1][0], s_save[1][0]); 445 | s[1][1] = XOR(s[1][1], s_save[1][1]); 446 | s[1][2] = XOR(s[1][2], s_save[1][2]); 447 | s[1][3] = XOR(s[1][3], s_save[1][3]); 448 | s[2][0] = XOR(s[2][0], s_save[2][0]); 449 | s[2][1] = XOR(s[2][1], s_save[2][1]); 450 | s[2][2] = XOR(s[2][2], s_save[2][2]); 451 | s[2][3] = XOR(s[2][3], s_save[2][3]); 452 | s[3][0] = XOR(s[3][0], s_save[3][0]); 453 | s[3][1] = XOR(s[3][1], s_save[3][1]); 454 | s[3][2] = XOR(s[3][2], s_save[3][2]); 455 | s[3][3] = XOR(s[3][3], s_save[3][3]); 456 | 457 | TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 458 | TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]); 459 | TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]); 460 | TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]); 461 | } 462 | 463 | void haraka_h_8x(unsigned char *out, const unsigned char *in) { 464 | haraka_h_4x(out, in); 465 | haraka_h_4x(out + 128, in + 256); 466 | } 467 | -------------------------------------------------------------------------------- /code/c/neon/haraka.h: -------------------------------------------------------------------------------- 1 | /* 2 | Implementation of Haraka256 and Haraka512 for ARM. 3 | */ 4 | #ifndef HARAKA_H_ 5 | #define HARAKA_H_ 6 | 7 | #include 8 | 9 | #define u64 unsigned long 10 | #define u128 uint8x16_t 11 | 12 | 13 | // Note that the round constants differ from the x86 implementation due to the 14 | // different order in which the key is added with the ARM AES instruction set. 
15 | 16 | static const uint8x16_t rc256[22] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 17 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 18 | {0x9d,0x7b,0x81,0x75,0xf0,0xfe,0xc5,0xb2,0xa,0xc0,0x20,0xe6,0x4c,0x70,0x84,0x6}, 19 | {0x17,0xf7,0x8,0x2f,0xa4,0x6b,0xf,0x64,0x6b,0xa0,0xf3,0x88,0xe1,0xb4,0x66,0x8b}, 20 | {0x14,0x91,0x2,0x9f,0x79,0x4f,0x5b,0xfd,0x60,0x9d,0x2,0xcf,0xaf,0xbc,0xf3,0xbb}, 21 | {0x98,0x84,0xf2,0x53,0x8,0x4f,0x7b,0x2e,0x2d,0xde,0x2,0x34,0xe6,0xea,0xd6,0xe}, 22 | {0x44,0x70,0x39,0xbe,0x1c,0xcd,0xee,0x79,0x8b,0x44,0x72,0x48,0xcb,0xb0,0xcf,0xcb}, 23 | {0x7b,0x5,0x8a,0x2b,0xed,0x35,0x53,0x8d,0xb7,0x32,0x90,0x6e,0xee,0xcd,0xea,0x7e}, 24 | {0x1b,0xef,0x4f,0xda,0x3b,0xb,0xc7,0x1f,0x61,0x27,0x41,0xe2,0xe2,0xfd,0x5f,0x67}, 25 | {0xd0,0x7c,0x2e,0x5e,0x7,0xcc,0xca,0xaf,0x43,0x8f,0xc2,0x67,0xb0,0xd9,0x24,0x29}, 26 | {0xee,0x65,0xd4,0xb9,0xca,0x8f,0xdb,0xec,0xe9,0x7f,0x86,0xe6,0xf1,0x63,0x4d,0xab}, 27 | {0x33,0x7e,0x3,0xad,0x4f,0x40,0x2a,0x5b,0x64,0xcd,0xb7,0xd4,0x84,0xbf,0x30,0x1c}, 28 | {0x0,0x98,0xf6,0x8d,0x8a,0x2d,0x9d,0x5c,0x2e,0x8b,0x2,0x69,0xc8,0x9e,0xaa,0x4a}, 29 | {0xbf,0x23,0x17,0x94,0x72,0x55,0x6f,0xde,0xb9,0xb,0xcc,0xb2,0xa6,0x78,0x4,0xfa}, 30 | {0xd4,0x9f,0x12,0x29,0x2e,0x4f,0xfa,0xe,0x12,0x2a,0x77,0x6b,0x2b,0x9f,0xb4,0xdf}, 31 | {0xee,0x12,0x6a,0xbb,0xae,0x11,0xd6,0x32,0x36,0xa2,0x49,0xf4,0x44,0x3,0xa1,0x1e}, 32 | {0xa6,0xec,0xa8,0x9c,0xec,0x93,0xe5,0x27,0xc9,0x0,0x96,0x5f,0xe3,0xc7,0xa2,0x78}, 33 | {0x84,0x0,0x5,0x4b,0x4f,0x9c,0x19,0x9d,0x88,0x49,0x4,0xaf,0xd8,0x5e,0x2,0x21}, 34 | {0x73,0x1,0xd4,0x82,0xcd,0x2e,0x28,0xb9,0xb7,0xc9,0x59,0xa7,0xf8,0xaa,0x3a,0xbf}, 35 | {0x6b,0x7d,0x30,0x10,0xd9,0xef,0xf2,0x37,0x17,0xb0,0x86,0x61,0xd,0x70,0x60,0x62}, 36 | {0xc6,0x9a,0xfc,0xf6,0x53,0x91,0xc2,0x81,0x43,0x4,0x30,0x21,0xc2,0x45,0xca,0x5a}, 37 | {0x3a,0x94,0xd1,0x36,0xe8,0x92,0xaf,0x2c,0xbb,0x68,0x6b,0x22,0x3c,0x97,0x23,0x92}}; 38 | 39 | 40 | static const uint8x16_t rc512[44] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 41 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 42 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 43 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 44 | {0x9d,0x7b,0x81,0x75,0xf0,0xfe,0xc5,0xb2,0xa,0xc0,0x20,0xe6,0x4c,0x70,0x84,0x6}, 45 | {0x17,0xf7,0x8,0x2f,0xa4,0x6b,0xf,0x64,0x6b,0xa0,0xf3,0x88,0xe1,0xb4,0x66,0x8b}, 46 | {0x14,0x91,0x2,0x9f,0x60,0x9d,0x2,0xcf,0x98,0x84,0xf2,0x53,0x2d,0xde,0x2,0x34}, 47 | {0x79,0x4f,0x5b,0xfd,0xaf,0xbc,0xf3,0xbb,0x8,0x4f,0x7b,0x2e,0xe6,0xea,0xd6,0xe}, 48 | {0xcb,0xb0,0xcf,0xcb,0x43,0x8f,0xc2,0x67,0xee,0xcd,0xea,0x7e,0xb0,0xd9,0x24,0x29}, 49 | {0x1b,0xef,0x4f,0xda,0x44,0x70,0x39,0xbe,0x3b,0xb,0xc7,0x1f,0x7b,0x5,0x8a,0x2b}, 50 | {0x61,0x27,0x41,0xe2,0x1c,0xcd,0xee,0x79,0xe2,0xfd,0x5f,0x67,0xed,0x35,0x53,0x8d}, 51 | {0x8b,0x44,0x72,0x48,0xd0,0x7c,0x2e,0x5e,0xb7,0x32,0x90,0x6e,0x7,0xcc,0xca,0xaf}, 52 | {0xee,0x65,0xd4,0xb9,0xca,0x8f,0xdb,0xec,0xe9,0x7f,0x86,0xe6,0xf1,0x63,0x4d,0xab}, 53 | {0x33,0x7e,0x3,0xad,0x4f,0x40,0x2a,0x5b,0x64,0xcd,0xb7,0xd4,0x84,0xbf,0x30,0x1c}, 54 | {0x0,0x98,0xf6,0x8d,0x2e,0x8b,0x2,0x69,0xbf,0x23,0x17,0x94,0xb9,0xb,0xcc,0xb2}, 55 | {0x8a,0x2d,0x9d,0x5c,0xc8,0x9e,0xaa,0x4a,0x72,0x55,0x6f,0xde,0xa6,0x78,0x4,0xfa}, 56 | {0x2b,0x9f,0xb4,0xdf,0x88,0x49,0x4,0xaf,0x44,0x3,0xa1,0x1e,0xd8,0x5e,0x2,0x21}, 57 | {0xa6,0xec,0xa8,0x9c,0xd4,0x9f,0x12,0x29,0xec,0x93,0xe5,0x27,0xee,0x12,0x6a,0xbb}, 58 | {0xc9,0x0,0x96,0x5f,0x2e,0x4f,0xfa,0xe,0xe3,0xc7,0xa2,0x78,0xae,0x11,0xd6,0x32}, 59 | {0x12,0x2a,0x77,0x6b,0x84,0x0,0x5,0x4b,0x36,0xa2,0x49,0xf4,0x4f,0x9c,0x19,0x9d}, 60 | {0x73,0x1,0xd4,0x82,0xcd,0x2e,0x28,0xb9,0xb7,0xc9,0x59,0xa7,0xf8,0xaa,0x3a,0xbf}, 61 | {0x6b,0x7d,0x30,0x10,0xd9,0xef,0xf2,0x37,0x17,0xb0,0x86,0x61,0xd,0x70,0x60,0x62}, 62 | {0xc6,0x9a,0xfc,0xf6,0x53,0x91,0xc2,0x81,0x43,0x4,0x30,0x21,0xc2,0x45,0xca,0x5a}, 63 | {0x3a,0x94,0xd1,0x36,0xe8,0x92,0xaf,0x2c,0xbb,0x68,0x6b,0x22,0x3c,0x97,0x23,0x92}, 64 | {0x38,0x92,0xbf,0xd3,0x68,0x62,0x60,0xbb,0xe5,0x3c,0x86,0xdb,0xdc,0xd3,0x4b,0x73}, 65 | {0xb1,0x12,0x22,0xcb,0xb4,0x71,0x10,0xe5,0x7d,0xf7,0x2b,0xc7,0x8d,0x12,0xe1,0x24}, 66 | 
{0xe3,0x8d,0xe4,0x83,0x58,0xb9,0xba,0x6c,0x4e,0x1a,0xb9,0x2d,0xdd,0xfd,0x3d,0x93}, 67 | {0xeb,0x86,0x58,0x22,0x9c,0xa0,0xeb,0xff,0x77,0xc6,0xf0,0xae,0x9c,0xd1,0xe4,0xe2}, 68 | {0x4e,0x92,0xb3,0x2c,0xc4,0x15,0x14,0x4b,0x43,0x1b,0x30,0x61,0xc3,0x47,0xbb,0x43}, 69 | {0x99,0x68,0xeb,0x16,0xdd,0x31,0xb2,0x3,0xf6,0xef,0x7,0xe7,0xa8,0x75,0xa7,0xdb}, 70 | {0x2c,0x47,0xca,0x7e,0x2,0x23,0x5e,0x8e,0x77,0x59,0x75,0x3c,0x4b,0x61,0xf3,0x6d}, 71 | {0xf9,0x17,0x86,0xb8,0xb9,0xe5,0x1b,0x6d,0x77,0x7d,0xde,0xd6,0x17,0x5a,0xa7,0xcd}, 72 | {0xf0,0x43,0x6b,0xec,0x75,0xc,0xee,0x2c,0x50,0x69,0x1e,0xcb,0xa1,0xa5,0xb1,0xf0}, 73 | {0xd9,0xd0,0xe,0x60,0x5d,0xee,0x46,0xa9,0x50,0xa3,0xa4,0x63,0xc1,0x27,0xf3,0x3b}, 74 | {0x53,0x3,0xed,0xe4,0x9d,0x6,0x6c,0x9d,0xbc,0xba,0xbb,0x80,0x59,0x11,0x53,0xa2}, 75 | {0xaa,0xe9,0xa8,0x6b,0x9c,0x61,0xda,0x0,0x2b,0x33,0x57,0xf9,0xab,0xc,0xe9,0x96}, 76 | {0x39,0xca,0x8d,0x93,0x30,0xde,0xd,0xab,0x88,0x29,0x96,0x5e,0x2,0xb1,0x3d,0xae}, 77 | {0x42,0xb4,0x75,0x2e,0xa8,0xf3,0x14,0x88,0xb,0xa4,0x54,0xd5,0x38,0x8f,0xbb,0x17}, 78 | {0xf6,0x16,0xa,0x36,0x79,0xb7,0xb6,0xae,0xd7,0x7f,0x42,0x5f,0x5b,0x8a,0xbb,0x34}, 79 | {0xde,0xaf,0xba,0xff,0x18,0x59,0xce,0x43,0x38,0x54,0xe5,0xcb,0x41,0x52,0xf6,0x26}, 80 | {0x78,0xc9,0x9e,0x83,0xf7,0x9c,0xca,0xa2,0x6a,0x2,0xf3,0xb9,0x54,0x9a,0xe9,0x4c}, 81 | {0x35,0x12,0x90,0x22,0x28,0x6e,0xc0,0x40,0xbe,0xf7,0xdf,0x1b,0x1a,0xa5,0x51,0xae}, 82 | {0xcf,0x59,0xa6,0x48,0xf,0xbc,0x73,0xc1,0x2b,0xd2,0x7e,0xba,0x3c,0x61,0xc1,0xa0}, 83 | {0xa1,0x9d,0xc5,0xe9,0xfd,0xbd,0xd6,0x4a,0x88,0x82,0x28,0x2,0x3,0xcc,0x6a,0x75}}; 84 | 85 | #define XOR(a, b) veorq_u8(a, b) 86 | #define LOAD(src) vld1q_u8(src) 87 | #define STORE(dest,src) vst1q_u8(dest,src) 88 | #define ZIP2(a, b) (u128) vzip2q_u32((uint32x4_t)a, (uint32x4_t)b) 89 | #define ZIP1(a, b) (u128) vzip1q_u32((uint32x4_t)a, (uint32x4_t)b) 90 | 91 | #define AES2(s0, s1, rci) \ 92 | s0 = vaesmcq_u8(vaeseq_u8(s0, rc256[rci])); \ 93 | s1 = vaesmcq_u8(vaeseq_u8(s1, rc256[rci + 1])); \ 94 | s0 = 
// Write the 32-byte truncated result to `out`: bytes 8..15 of s0, bytes 8..15
// of s1, bytes 0..7 of s2 and bytes 0..7 of s3, in that order.
//
// Uses NEON byte stores instead of `*(u64*)(out) = ...`:
//  - `out` is a plain byte pointer with no alignment guarantee, and the
//    integer store through a cast pointer was an unaligned, type-punned write;
//  - the old form depended on `unsigned long` being 64 bits wide, which does
//    not hold on every ABI.
// Wrapped in do/while(0) so the macro behaves as a single statement.
#define TRUNCSTORE(out, s0, s1, s2, s3) \
  do { \
    vst1_u8((out), vget_high_u8(s0)); \
    vst1_u8((out) + 8, vget_high_u8(s1)); \
    vst1_u8((out) + 16, vget_low_u8(s2)); \
    vst1_u8((out) + 24, vget_low_u8(s3)); \
  } while (0)
"""Pure-Python reference implementation of Haraka-512/256 and Haraka-256/256.

Running the module hashes the bytes 0..63 with both variants and prints every
intermediate state.  The code runs unchanged on Python 2 and Python 3.
"""
import itertools
import copy

MPAR = 1
ROUNDS = 5
AES_ROUNDS = 2

# AES S-box
S = [[0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76],
     [0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0],
     [0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15],
     [0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75],
     [0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84],
     [0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf],
     [0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8],
     [0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2],
     [0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73],
     [0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb],
     [0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79],
     [0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08],
     [0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a],
     [0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e],
     [0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf],
     [0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16]]

# 128-bit round constants, one per AES round and state block.
RC = [0x0684704ce620c00ab2c5fef075817b9d, 0x8b66b4e188f3a06b640f6ba42f08f717,
      0x3402de2d53f28498cf029d609f029114, 0x0ed6eae62e7b4f08bbf3bcaffd5b4f79,
      0xcbcfb0cb4872448b79eecd1cbe397044, 0x7eeacdee6e9032b78d5335ed2b8a057b,
      0x67c28f435e2e7cd0e2412761da4fef1b, 0x2924d9b0afcacc07675ffde21fc70b3b,
      0xab4d63f1e6867fe9ecdb8fcab9d465ee, 0x1c30bf84d4b7cd645b2a404fad037e33,
      0xb2cc0bb9941723bf69028b2e8df69800, 0xfa0478a6de6f55724aaa9ec85c9d2d8a,
      0xdfb49f2b6b772a120efa4f2e29129fd4, 0x1ea10344f449a23632d611aebb6a12ee,
      0xaf0449884b0500845f9600c99ca8eca6, 0x21025ed89d199c4f78a2c7e327e593ec,
      0xbf3aaaf8a759c9b7b9282ecd82d40173, 0x6260700d6186b01737f2efd910307d6b,
      0x5aca45c22130044381c29153f6fc9ac6, 0x9223973c226b68bb2caf92e836d1943a,
      0xd3bf9238225886eb6cbab958e51071b4, 0xdb863ce5aef0c677933dfddd24e1128d,
      0xbb606268ffeba09c83e48de3cb2212b1, 0x734bd3dce2e4d19c2db91a4ec72bf77d,
      0x43bb47c361301b434b1415c42cb3924e, 0xdba775a8e707eff603b231dd16eb6899,
      0x6df3614b3c7559778e5e23027eca472c, 0xcda75a17d6de7d776d1be5b9b88617f9,
      0xec6b43f06ba8e9aa9d6c069da946ee5d, 0xcb1e6950f957332ba25311593bf327c1,
      0x2cee0c7500da619ce4ed0353600ed0d9, 0xf0b1a5a196e90cab80bbbabc63a4a350,
      0xae3db1025e962988ab0dde30938dca39, 0x17bb8f38d554a40b8814f3a82e75b442,
      0x34bb8a5b5f427fd7aeb6b779360a16f6, 0x26f65241cbe5543843ce5918ffbaafde,
      0x4ce99a54b9f3026aa2ca9cf7839ec978, 0xae51a51a1bdff7be40c06e2822901235,
      0xa0c1613cba7ed22bc173bc0f48a659cf, 0x756acc03022882884ad6bdfde9c59da1]


# get padded hex for single byte
def hexbyte(x):
    """Return *x* as lowercase hex, zero-padded to at least two digits."""
    return "%02x" % x


# print list of bytes in hex
def ps(s):
    """Return the byte list *s* as space-separated two-digit hex values."""
    return " ".join(hexbyte(x) for x in s)


# print state
def printstate(s):
    """Print the state *s* as four rows of hex bytes followed by a blank line.

    *s* is a list of 16-byte blocks.  Four blocks are printed when
    ``len(s) == 4``; otherwise only the first two are.  Row ``i`` shows
    bytes ``i, i+4, i+8, i+12`` of each block, left to right.
    """
    blocks = s if len(s) == 4 else s[:2]
    for i in range(4):
        q = [block[i + 4 * col] for block in blocks for col in range(4)]
        print(" ".join(hexbyte(x) for x in q))
    print("")


# multiply by 2 over GF(2^8)
def xtime(x):
    """Multiply the byte *x* by 2 in GF(2^8), reducing with 0x1b on overflow."""
    if x >> 7:
        return ((x << 1) ^ 0x1b) & 0xff
    return (x << 1) & 0xff


# xor two lists element-wise
def xor(x, y):
    """Return the element-wise XOR of the first 16 entries of *x* and *y*."""
    return [x[i] ^ y[i] for i in range(16)]


# apply a single S-box
def sbox(x):
    """Look up byte *x* in the AES S-box (high nibble = row, low = column)."""
    return S[(x >> 4)][x & 0xF]


# AES SubBytes
def subbytes(s):
    """Apply the S-box to every byte of *s*."""
    return [sbox(x) for x in s]


# AES ShiftRows
def shiftrows(s):
    """Apply AES ShiftRows to a 16-byte block stored column by column."""
    return [s[0], s[5], s[10], s[15],
            s[4], s[9], s[14], s[3],
            s[8], s[13], s[2], s[7],
            s[12], s[1], s[6], s[11]]


# AES MixColumns
def mixcolumns(s):
    """Apply AES MixColumns to each of the four 4-byte columns of *s*."""
    out = []
    for c in range(4):
        a0, a1, a2, a3 = s[4*c], s[4*c+1], s[4*c+2], s[4*c+3]
        out.extend([
            xtime(a0) ^ xtime(a1) ^ a1 ^ a2 ^ a3,
            a0 ^ xtime(a1) ^ xtime(a2) ^ a2 ^ a3,
            a0 ^ a1 ^ xtime(a2) ^ xtime(a3) ^ a3,
            xtime(a0) ^ a0 ^ a1 ^ a2 ^ xtime(a3),
        ])
    return out


# AES single regular round
def aesenc(s, rk):
    """Run one AES round on block *s* with round key *rk*.

    The key bytes are XORed in reversed order (``rk[::-1]``).
    """
    s = subbytes(s)
    s = shiftrows(s)
    s = mixcolumns(s)
    s = xor(s, rk[::-1])
    return s


# consider 4 consecutive entries as 32-bit values and shift each of them to the left
def shift32(x):
    """Shift each big-endian 32-bit word of the 16-byte list *x* left by one.

    Bits shifted out of a word are discarded.  The hash functions in this
    file do not call this helper; it is kept for callers that may import it.
    """
    # make list of 32-bit elements
    w = [((x[i] << 24) ^ (x[i+1] << 16) ^ (x[i+2] << 8) ^ x[i+3]) << 1
         for i in [0, 4, 8, 12]]
    return list(itertools.chain(*[[(q >> 24) & 0xFF, (q >> 16) & 0xFF,
                                   (q >> 8) & 0xFF, (q >> 0) & 0xFF] for q in w]))


# linear mixing for Haraka-512/256
def mix512(s):
    """Permute the sixteen 4-byte columns of a four-block state."""
    return [s[0][12:16] + s[2][12:16] + s[1][12:16] + s[3][12:16],
            s[2][0:4] + s[0][0:4] + s[3][0:4] + s[1][0:4],
            s[2][4:8] + s[0][4:8] + s[3][4:8] + s[1][4:8],
            s[0][8:12] + s[2][8:12] + s[1][8:12] + s[3][8:12]]


# linear mixing for Haraka-256/256
def mix256(s):
    """Interleave the 4-byte columns of a two-block state."""
    return [s[0][0:4] + s[1][0:4] + s[0][4:8] + s[1][4:8],
            s[0][8:12] + s[1][8:12] + s[0][12:16] + s[1][12:16]]


# convert RC to 16 words state
def convRC(rc):
    """Return the 128-bit integer *rc* as 16 bytes, most significant first.

    Done arithmetically rather than by slicing ``hex(rc)``: the textual form
    carries a trailing ``L`` only for Python 2 longs, so stripping the last
    character drops a real digit on Python 3.
    """
    return [(rc >> (8 * (15 - i))) & 0xFF for i in range(16)]


# Haraka-512/256
def haraka512256(msg):
    """Hash the first 64 bytes of *msg* and return the 32-byte digest as a list.

    Every intermediate state is printed to stdout.
    """
    # obtain state from msg input
    s = [msg[i:i+16] for i in [0, 16, 32, 48]]

    print("= input state =")
    printstate(s)

    # apply round functions
    for t in range(ROUNDS):
        # AES_ROUNDS AES rounds per block, each with its own round constant
        for r in range(AES_ROUNDS):
            s = [aesenc(s[i], convRC(RC[4*t*AES_ROUNDS + 4*r + i])) for i in range(4)]

        print("= round %d : after aes layer =" % t)
        printstate(s)

        # now apply mixing
        s = mix512(s)

        print("= round %d : after mix layer =" % t)
        printstate(s)

    print("= output from permutation =")
    printstate(s)

    # apply feed-forward
    s = [xor(s[i], msg[16*i:16*(i+1)]) for i in range(4)]

    print("= after feed-forward =")
    printstate(s)

    # truncation
    return s[0][8:] + s[1][8:] + s[2][0:8] + s[3][0:8]


# Haraka-256/256
def haraka256256(msg):
    """Hash the first 32 bytes of *msg* and return the 32-byte digest as a list.

    Every intermediate state is printed to stdout.
    """
    # obtain state from msg input
    s = [msg[i:i+16] for i in [0, 16]]

    print("= input state =")
    printstate(s)

    # apply round functions
    for t in range(ROUNDS):
        # AES_ROUNDS AES rounds per block, each with its own round constant
        for r in range(AES_ROUNDS):
            s = [aesenc(s[i], convRC(RC[2*t*AES_ROUNDS + 2*r + i])) for i in range(2)]

        print("= round %d : after aes layer =" % t)
        printstate(s)

        # now apply mixing
        s = mix256(s)

        print("= round %d : after mix layer =" % t)
        printstate(s)

    print("= output from permutation =")
    printstate(s)

    # apply feed-forward
    s = [xor(s[i], msg[16*i:16*(i+1)]) for i in range(2)]

    print("= after feed-forward =")
    printstate(s)

    # no truncation for the 256-bit variant: the whole state is the digest
    return list(itertools.chain(*s))


# set some message bytes
m = [i for i in range(64)]

# print input
print("= input bytes =")
print(ps(m) + "\n")

# call Haraka-512/256
digest = haraka512256(m)

# print digest
print("= haraka-512/256 output bytes =")
print(ps(digest) + "\n")

# call Haraka-256/256
digest = haraka256256(m)

# print digest
print("= haraka-256/256 output bytes =")
print(ps(digest) + "\n")
extern void allocate(void); 14 | extern void measure(void); 15 | 16 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION; 17 | const char *implementationversion = crypto_sign_VERSION; 18 | const char *sizenames[] = { "outputbytes", "publickeybytes", "secretkeybytes", 0 }; 19 | const long long sizes[] = { crypto_sign_BYTES, crypto_sign_PUBLICKEYBYTES, crypto_sign_SECRETKEYBYTES }; 20 | 21 | #define MAXTEST_BYTES 100000 22 | 23 | static unsigned char *pk; 24 | static unsigned char *sk; 25 | static unsigned char *m; unsigned long long mlen; 26 | static unsigned char *sm; unsigned long long smlen; 27 | static unsigned char *t; unsigned long long tlen; 28 | 29 | void preallocate(void) 30 | { 31 | #ifdef RAND_R_PRNG_NOT_SEEDED 32 | RAND_status(); 33 | #endif 34 | } 35 | 36 | void allocate(void) 37 | { 38 | pk = alignedcalloc(crypto_sign_PUBLICKEYBYTES); 39 | sk = alignedcalloc(crypto_sign_SECRETKEYBYTES); 40 | m = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES); 41 | sm = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES); 42 | t = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES); 43 | } 44 | 45 | #define TIMINGS 20 46 | static long long cycles[TIMINGS + 1]; 47 | static long long bytes[TIMINGS + 1]; 48 | 49 | void measure(void) 50 | { 51 | int i; 52 | int loop; 53 | 54 | for (loop = 0;loop < LOOPS;++loop) { 55 | for (i = 0;i <= TIMINGS;++i) { 56 | cycles[i] = cpucycles(); 57 | crypto_sign_keypair(pk,sk); 58 | } 59 | for (i = 0;i < TIMINGS;++i) 60 | cycles[i] = cycles[i + 1] - cycles[i]; 61 | printentry(-1,"keypair_cycles",cycles,TIMINGS); 62 | 63 | for (mlen = 0;mlen <= MAXTEST_BYTES;mlen += 1 + mlen / 4) { 64 | randombytes(m,mlen); 65 | 66 | for (i = 0;i <= TIMINGS;++i) { 67 | cycles[i] = cpucycles(); 68 | bytes[i] = crypto_sign(sm,&smlen,m,mlen,sk); 69 | if (bytes[i] == 0) 70 | bytes[i] = smlen; 71 | } 72 | for (i = 0;i < TIMINGS;++i) 73 | cycles[i] = cycles[i + 1] - cycles[i]; 74 | printentry(mlen,"cycles",cycles,TIMINGS); 75 | 
printentry(mlen,"bytes",bytes,TIMINGS); 76 | 77 | for (i = 0;i <= TIMINGS;++i) { 78 | cycles[i] = cpucycles(); 79 | bytes[i] = crypto_sign_open(t,&tlen,sm,smlen,pk); 80 | if (bytes[i] == 0) bytes[i] = tlen; 81 | } 82 | for (i = 0;i < TIMINGS;++i) 83 | cycles[i] = cycles[i + 1] - cycles[i]; 84 | printentry(mlen,"open_cycles",cycles,TIMINGS); 85 | printentry(mlen,"open_bytes",bytes,TIMINGS); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /supercop/crypto_sign/measure.c~: -------------------------------------------------------------------------------- 1 | #include 2 | #include "randombytes.h" 3 | #include "cpucycles.h" 4 | #include "crypto_sign.h" 5 | 6 | extern void printentry(long long,const char *,long long *,long long); 7 | extern unsigned char *alignedcalloc(unsigned long long); 8 | extern const char *primitiveimplementation; 9 | extern const char *implementationversion; 10 | extern const char *sizenames[]; 11 | extern const long long sizes[]; 12 | extern void allocate(void); 13 | extern void measure(void); 14 | 15 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION; 16 | const char *implementationversion = crypto_sign_VERSION; 17 | const char *sizenames[] = { "outputbytes", "publickeybytes", "secretkeybytes", 0 }; 18 | const long long sizes[] = { crypto_sign_BYTES, crypto_sign_PUBLICKEYBYTES, crypto_sign_SECRETKEYBYTES }; 19 | 20 | #define MAXTEST_BYTES 100000 21 | 22 | static unsigned char *pk; 23 | static unsigned char *sk; 24 | static unsigned char *m; unsigned long long mlen; 25 | static unsigned char *sm; unsigned long long smlen; 26 | static unsigned char *t; unsigned long long tlen; 27 | 28 | void preallocate(void) 29 | { 30 | #ifdef RAND_R_PRNG_NOT_SEEDED 31 | RAND_status(); 32 | #endif 33 | } 34 | 35 | void allocate(void) 36 | { 37 | pk = alignedcalloc(crypto_sign_PUBLICKEYBYTES); 38 | sk = alignedcalloc(crypto_sign_SECRETKEYBYTES); 39 | m = alignedcalloc(MAXTEST_BYTES + 
crypto_sign_BYTES); 40 | sm = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES); 41 | t = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES); 42 | } 43 | 44 | #define TIMINGS 1 45 | static long long cycles[TIMINGS + 1]; 46 | static long long bytes[TIMINGS + 1]; 47 | 48 | void measure(void) 49 | { 50 | int i; 51 | int loop; 52 | 53 | for (loop = 0;loop < LOOPS;++loop) { 54 | for (i = 0;i <= TIMINGS;++i) { 55 | cycles[i] = cpucycles(); 56 | crypto_sign_keypair(pk,sk); 57 | } 58 | for (i = 0;i < TIMINGS;++i) 59 | cycles[i] = cycles[i + 1] - cycles[i]; 60 | printentry(-1,"keypair_cycles",cycles,TIMINGS); 61 | 62 | for (mlen = 0;mlen <= MAXTEST_BYTES;mlen += 1 + mlen / 4) { 63 | randombytes(m,mlen); 64 | 65 | for (i = 0;i <= TIMINGS;++i) { 66 | cycles[i] = cpucycles(); 67 | bytes[i] = crypto_sign(sm,&smlen,m,mlen,sk); 68 | if (bytes[i] == 0) 69 | bytes[i] = smlen; 70 | } 71 | for (i = 0;i < TIMINGS;++i) 72 | cycles[i] = cycles[i + 1] - cycles[i]; 73 | printentry(mlen,"cycles",cycles,TIMINGS); 74 | printentry(mlen,"bytes",bytes,TIMINGS); 75 | 76 | for (i = 0;i <= TIMINGS;++i) { 77 | cycles[i] = cpucycles(); 78 | bytes[i] = crypto_sign_open(t,&tlen,sm,smlen,pk); 79 | if (bytes[i] == 0) bytes[i] = tlen; 80 | } 81 | for (i = 0;i < TIMINGS;++i) 82 | cycles[i] = cycles[i + 1] - cycles[i]; 83 | printentry(mlen,"open_cycles",cycles,TIMINGS); 84 | printentry(mlen,"open_bytes",bytes,TIMINGS); 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/api.h: -------------------------------------------------------------------------------- 1 | #include "params.h" 2 | 3 | #define CRYPTO_SECRETKEYBYTES (SEED_BYTES + CRYPTO_PUBLICKEYBYTES-HASH_BYTES + SK_RAND_SEED_BYTES) 4 | #define CRYPTO_PUBLICKEYBYTES ((N_MASKS+1)*HASH_BYTES) 5 | #define CRYPTO_BYTES (MESSAGE_HASH_SEED_BYTES + (TOTALTREE_HEIGHT+7)/8 + HORST_SIGBYTES + (TOTALTREE_HEIGHT/SUBTREE_HEIGHT)*WOTS_SIGBYTES + 
TOTALTREE_HEIGHT*HASH_BYTES) 6 | #define CRYPTO_DETERMINISTIC 1 7 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/consts.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | uint32_t hashc8x[64] = { 4 | 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 5 | 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 6 | 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 7 | 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 8 | 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 9 | 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 10 | 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 11 | 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461}; 12 | 13 | unsigned char _rotate8[32] = {3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14, 14 | 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}; 15 | 16 | unsigned char _rotate16[32] = {2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13, 17 | 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}; 18 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/haraka.c: -------------------------------------------------------------------------------- 1 | #include "haraka.h" 2 | #include 3 | 4 | void load_constants() { 5 | rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d); 6 | rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717); 7 | rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114); 8 | rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79); 9 | rc[4] = 
_mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044); 10 | rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b); 11 | rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b); 12 | rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b); 13 | rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee); 14 | rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33); 15 | rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800); 16 | rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a); 17 | rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4); 18 | rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee); 19 | rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6); 20 | rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec); 21 | rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173); 22 | rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b); 23 | rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6); 24 | rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a); 25 | rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4); 26 | rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d); 27 | rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1); 28 | rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d); 29 | rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e); 30 | rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899); 31 | rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c); 32 | rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9); 33 | rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d); 34 | rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1); 35 | rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9); 36 | rc[31] = 
/*
 * Self-test: hash the known-answer input and compare against the expected
 * digests.  Prints an error message on failure and prints nothing on success.
 *
 * All eight lanes passed to haraka512_8x() carry the same 64-byte input
 * (bytes 0..63), so every 32-byte output lane must equal testvector512.
 * testvector256 is the digest of bytes 0..31, which is the first 32 bytes
 * of the same buffer.
 *
 * The buffers are released on every path, including allocation failure and
 * digest mismatch.
 */
void test_implementations() {
  unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
  unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  static const unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
                                                  0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
                                                  0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
                                                  0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};

  static const unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
                                                  0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
                                                  0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
                                                  0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
  int i;

  if (in == NULL || out256 == NULL || out512 == NULL) {
    printf("Error: allocation failed.\n");
    goto cleanup;
  }

  // Input for testvector: eight identical 64-byte blocks 0..63
  for(i = 0; i < 64*8; i++) {
    in[i] = i % 64;
  }

  // Called explicitly: CONSTANTSLOADED is #defined earlier in this file, so
  // the guarded load_constants() calls inside the hash functions are
  // removed by the preprocessor.
  load_constants();

  // Haraka-512, all eight lanes
  haraka512_8x(out512, in);
  for(i = 0; i < 32*8; i++) {
    if (out512[i] != testvector512[i % 32]) {
      printf("Error: testvector incorrect.\n");
      goto cleanup;
    }
  }

  // Haraka-256, single block (bytes 0..31 of the input buffer)
  haraka256(out256, in);
  for(i = 0; i < 32; i++) {
    if (out256[i] != testvector256[i]) {
      printf("Error: testvector incorrect.\n");
      goto cleanup;
    }
  }

cleanup:
  // free(NULL) is a no-op, so this is safe after a failed allocation
  free(in);
  free(out256);
  free(out512);
}
89 | load_constants(); 90 | #endif 91 | __m128i s[2], tmp; 92 | 93 | s[0] = LOAD(in); 94 | s[1] = LOAD(in + 16); 95 | 96 | AES2(s[0], s[1], 0); 97 | MIX2(s[0], s[1]); 98 | 99 | AES2(s[0], s[1], 4); 100 | MIX2(s[0], s[1]); 101 | 102 | AES2(s[0], s[1], 8); 103 | MIX2(s[0], s[1]); 104 | 105 | AES2(s[0], s[1], 12); 106 | MIX2(s[0], s[1]); 107 | 108 | AES2(s[0], s[1], 16); 109 | MIX2(s[0], s[1]); 110 | 111 | s[0] = _mm_xor_si128(s[0], LOAD(in)); 112 | s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); 113 | 114 | STORE(out, s[0]); 115 | STORE(out + 16, s[1]); 116 | } 117 | 118 | void haraka256_4x(unsigned char *out, const unsigned char *in) { 119 | #ifndef CONSTANTSLOADED 120 | load_constants(); 121 | #endif 122 | __m128i s[4][2], tmp; 123 | 124 | s[0][0] = LOAD(in); 125 | s[0][1] = LOAD(in + 16); 126 | s[1][0] = LOAD(in + 32); 127 | s[1][1] = LOAD(in + 48); 128 | s[2][0] = LOAD(in + 64); 129 | s[2][1] = LOAD(in + 80); 130 | s[3][0] = LOAD(in + 96); 131 | s[3][1] = LOAD(in + 112); 132 | 133 | // Round 1 134 | AES2_4x(s[0], s[1], s[2], s[3], 0); 135 | 136 | MIX2(s[0][0], s[0][1]); 137 | MIX2(s[1][0], s[1][1]); 138 | MIX2(s[2][0], s[2][1]); 139 | MIX2(s[3][0], s[3][1]); 140 | 141 | // Round 2 142 | AES2_4x(s[0], s[1], s[2], s[3], 4); 143 | 144 | MIX2(s[0][0], s[0][1]); 145 | MIX2(s[1][0], s[1][1]); 146 | MIX2(s[2][0], s[2][1]); 147 | MIX2(s[3][0], s[3][1]); 148 | 149 | // Round 3 150 | AES2_4x(s[0], s[1], s[2], s[3], 8); 151 | 152 | MIX2(s[0][0], s[0][1]); 153 | MIX2(s[1][0], s[1][1]); 154 | MIX2(s[2][0], s[2][1]); 155 | MIX2(s[3][0], s[3][1]); 156 | 157 | // Round 4 158 | AES2_4x(s[0], s[1], s[2], s[3], 12); 159 | 160 | MIX2(s[0][0], s[0][1]); 161 | MIX2(s[1][0], s[1][1]); 162 | MIX2(s[2][0], s[2][1]); 163 | MIX2(s[3][0], s[3][1]); 164 | 165 | // Round 5 166 | AES2_4x(s[0], s[1], s[2], s[3], 16); 167 | 168 | MIX2(s[0][0], s[0][1]); 169 | MIX2(s[1][0], s[1][1]); 170 | MIX2(s[2][0], s[2][1]); 171 | MIX2(s[3][0], s[3][1]); 172 | 173 | // Feed Forward 174 | s[0][0] = 
_mm_xor_si128(s[0][0], LOAD(in)); 175 | s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 176 | s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32)); 177 | s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48)); 178 | s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64)); 179 | s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80)); 180 | s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96)); 181 | s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112)); 182 | 183 | STORE(out, s[0][0]); 184 | STORE(out + 16, s[0][1]); 185 | STORE(out + 32, s[1][0]); 186 | STORE(out + 48, s[1][1]); 187 | STORE(out + 64, s[2][0]); 188 | STORE(out + 80, s[2][1]); 189 | STORE(out + 96, s[3][0]); 190 | STORE(out + 112, s[3][1]); 191 | } 192 | 193 | void haraka256_8x(unsigned char *out, const unsigned char *in) { 194 | #ifndef CONSTANTSLOADED 195 | load_constants(); 196 | #endif 197 | haraka256_4x(out, in); 198 | haraka256_4x(out + 128, in + 128); 199 | 200 | // __m128i s[8][2], tmp; 201 | // 202 | // int i; 203 | // 204 | // s[0][0] = LOAD(in); 205 | // s[0][1] = LOAD(in + 16); 206 | // s[1][0] = LOAD(in + 32); 207 | // s[1][1] = LOAD(in + 48); 208 | // s[2][0] = LOAD(in + 64); 209 | // s[2][1] = LOAD(in + 80); 210 | // s[3][0] = LOAD(in + 96); 211 | // s[3][1] = LOAD(in + 112); 212 | // s[4][0] = LOAD(in + 128); 213 | // s[4][1] = LOAD(in + 144); 214 | // s[5][0] = LOAD(in + 160); 215 | // s[5][1] = LOAD(in + 176); 216 | // s[6][0] = LOAD(in + 192); 217 | // s[6][1] = LOAD(in + 208); 218 | // s[7][0] = LOAD(in + 224); 219 | // s[7][1] = LOAD(in + 240); 220 | // 221 | // // Round 1 222 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0); 223 | // 224 | // MIX2(s[0][0], s[0][1]); 225 | // MIX2(s[1][0], s[1][1]); 226 | // MIX2(s[2][0], s[2][1]); 227 | // MIX2(s[3][0], s[3][1]); 228 | // MIX2(s[4][0], s[4][1]); 229 | // MIX2(s[5][0], s[5][1]); 230 | // MIX2(s[6][0], s[6][1]); 231 | // MIX2(s[7][0], s[7][1]); 232 | // 233 | // 234 | // // Round 2 235 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4); 
236 | // 237 | // MIX2(s[0][0], s[0][1]); 238 | // MIX2(s[1][0], s[1][1]); 239 | // MIX2(s[2][0], s[2][1]); 240 | // MIX2(s[3][0], s[3][1]); 241 | // MIX2(s[4][0], s[4][1]); 242 | // MIX2(s[5][0], s[5][1]); 243 | // MIX2(s[6][0], s[6][1]); 244 | // MIX2(s[7][0], s[7][1]); 245 | // 246 | // // Round 3 247 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8); 248 | // 249 | // MIX2(s[0][0], s[0][1]); 250 | // MIX2(s[1][0], s[1][1]); 251 | // MIX2(s[2][0], s[2][1]); 252 | // MIX2(s[3][0], s[3][1]); 253 | // MIX2(s[4][0], s[4][1]); 254 | // MIX2(s[5][0], s[5][1]); 255 | // MIX2(s[6][0], s[6][1]); 256 | // MIX2(s[7][0], s[7][1]); 257 | // 258 | // // Round 4 259 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 12); 260 | // 261 | // MIX2(s[0][0], s[0][1]); 262 | // MIX2(s[1][0], s[1][1]); 263 | // MIX2(s[2][0], s[2][1]); 264 | // MIX2(s[3][0], s[3][1]); 265 | // MIX2(s[4][0], s[4][1]); 266 | // MIX2(s[5][0], s[5][1]); 267 | // MIX2(s[6][0], s[6][1]); 268 | // MIX2(s[7][0], s[7][1]); 269 | // 270 | // // Round 5 271 | // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16); 272 | // 273 | // MIX2(s[0][0], s[0][1]); 274 | // MIX2(s[1][0], s[1][1]); 275 | // MIX2(s[2][0], s[2][1]); 276 | // MIX2(s[3][0], s[3][1]); 277 | // MIX2(s[4][0], s[4][1]); 278 | // MIX2(s[5][0], s[5][1]); 279 | // MIX2(s[6][0], s[6][1]); 280 | // MIX2(s[7][0], s[7][1]); 281 | // 282 | // // Feed Forward 283 | // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 284 | // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 285 | // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32)); 286 | // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48)); 287 | // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64)); 288 | // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80)); 289 | // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96)); 290 | // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112)); 291 | // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 128)); 292 | // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 144)); 
293 | // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 160)); 294 | // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 176)); 295 | // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 192)); 296 | // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 208)); 297 | // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 224)); 298 | // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 240)); 299 | // 300 | // STORE(out, s[0][0]); 301 | // STORE(out + 16, s[0][1]); 302 | // STORE(out + 32, s[1][0]); 303 | // STORE(out + 48, s[1][1]); 304 | // STORE(out + 64, s[2][0]); 305 | // STORE(out + 80, s[2][1]); 306 | // STORE(out + 96, s[3][0]); 307 | // STORE(out + 112, s[3][1]); 308 | // STORE(out + 128, s[4][0]); 309 | // STORE(out + 144, s[4][1]); 310 | // STORE(out + 160, s[5][0]); 311 | // STORE(out + 176, s[5][1]); 312 | // STORE(out + 192, s[6][0]); 313 | // STORE(out + 208, s[6][1]); 314 | // STORE(out + 224, s[7][0]); 315 | // STORE(out + 240, s[7][1]); 316 | } 317 | 318 | void haraka512(unsigned char *out, const unsigned char *in) { 319 | #ifndef CONSTANTSLOADED 320 | load_constants(); 321 | #endif 322 | u128 s[4], tmp; 323 | 324 | s[0] = LOAD(in); 325 | s[1] = LOAD(in + 16); 326 | s[2] = LOAD(in + 32); 327 | s[3] = LOAD(in + 48); 328 | 329 | AES4(s[0], s[1], s[2], s[3], 0); 330 | MIX4(s[0], s[1], s[2], s[3]); 331 | 332 | AES4(s[0], s[1], s[2], s[3], 8); 333 | MIX4(s[0], s[1], s[2], s[3]); 334 | 335 | AES4(s[0], s[1], s[2], s[3], 16); 336 | MIX4(s[0], s[1], s[2], s[3]); 337 | 338 | AES4(s[0], s[1], s[2], s[3], 24); 339 | MIX4(s[0], s[1], s[2], s[3]); 340 | 341 | AES4(s[0], s[1], s[2], s[3], 32); 342 | MIX4(s[0], s[1], s[2], s[3]); 343 | 344 | s[0] = _mm_xor_si128(s[0], LOAD(in)); 345 | s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); 346 | s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); 347 | s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); 348 | 349 | TRUNCSTORE(out, s[0], s[1], s[2], s[3]); 350 | } 351 | 352 | void haraka512_4x(unsigned char *out, const unsigned char *in) { 353 | #ifndef CONSTANTSLOADED 354 | 
load_constants(); 355 | #endif 356 | u128 s[4][4], tmp; 357 | 358 | s[0][0] = LOAD(in); 359 | s[0][1] = LOAD(in + 16); 360 | s[0][2] = LOAD(in + 32); 361 | s[0][3] = LOAD(in + 48); 362 | s[1][0] = LOAD(in + 64); 363 | s[1][1] = LOAD(in + 80); 364 | s[1][2] = LOAD(in + 96); 365 | s[1][3] = LOAD(in + 112); 366 | s[2][0] = LOAD(in + 128); 367 | s[2][1] = LOAD(in + 144); 368 | s[2][2] = LOAD(in + 160); 369 | s[2][3] = LOAD(in + 176); 370 | s[3][0] = LOAD(in + 192); 371 | s[3][1] = LOAD(in + 208); 372 | s[3][2] = LOAD(in + 224); 373 | s[3][3] = LOAD(in + 240); 374 | 375 | AES4_4x(s[0], s[1], s[2], s[3], 0); 376 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 377 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 378 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 379 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 380 | 381 | AES4_4x(s[0], s[1], s[2], s[3], 8); 382 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 383 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 384 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 385 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 386 | 387 | AES4_4x(s[0], s[1], s[2], s[3], 16); 388 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 389 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 390 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 391 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 392 | 393 | AES4_4x(s[0], s[1], s[2], s[3], 24); 394 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 395 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 396 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 397 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 398 | 399 | AES4_4x(s[0], s[1], s[2], s[3], 32); 400 | MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 401 | MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 402 | MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 403 | MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 404 | 405 | 406 | s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 407 | s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 408 | s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32)); 409 | s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48)); 410 | 
s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64)); 411 | s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80)); 412 | s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96)); 413 | s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112)); 414 | s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128)); 415 | s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144)); 416 | s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160)); 417 | s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176)); 418 | s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192)); 419 | s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208)); 420 | s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224)); 421 | s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240)); 422 | 423 | TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 424 | TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]); 425 | TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]); 426 | TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]); 427 | } 428 | 429 | void haraka512_8x(unsigned char *out, const unsigned char *in) { 430 | #ifndef CONSTANTSLOADED 431 | load_constants(); 432 | #endif 433 | // This is faster on Skylake, the code below is faster on Haswell. 
434 | haraka512_4x(out, in); 435 | haraka512_4x(out + 128, in + 256); 436 | 437 | // u128 s[8][4], tmp; 438 | // 439 | // s[0][0] = LOAD(in); 440 | // s[0][1] = LOAD(in + 16); 441 | // s[0][2] = LOAD(in + 32); 442 | // s[0][3] = LOAD(in + 48); 443 | // s[1][0] = LOAD(in + 64); 444 | // s[1][1] = LOAD(in + 80); 445 | // s[1][2] = LOAD(in + 96); 446 | // s[1][3] = LOAD(in + 112); 447 | // s[2][0] = LOAD(in + 128); 448 | // s[2][1] = LOAD(in + 144); 449 | // s[2][2] = LOAD(in + 160); 450 | // s[2][3] = LOAD(in + 176); 451 | // s[3][0] = LOAD(in + 192); 452 | // s[3][1] = LOAD(in + 208); 453 | // s[3][2] = LOAD(in + 224); 454 | // s[3][3] = LOAD(in + 240); 455 | // s[4][0] = LOAD(in + 256); 456 | // s[4][1] = LOAD(in + 272); 457 | // s[4][2] = LOAD(in + 288); 458 | // s[4][3] = LOAD(in + 304); 459 | // s[5][0] = LOAD(in + 320); 460 | // s[5][1] = LOAD(in + 336); 461 | // s[5][2] = LOAD(in + 352); 462 | // s[5][3] = LOAD(in + 368); 463 | // s[6][0] = LOAD(in + 384); 464 | // s[6][1] = LOAD(in + 400); 465 | // s[6][2] = LOAD(in + 416); 466 | // s[6][3] = LOAD(in + 432); 467 | // s[7][0] = LOAD(in + 448); 468 | // s[7][1] = LOAD(in + 464); 469 | // s[7][2] = LOAD(in + 480); 470 | // s[7][3] = LOAD(in + 496); 471 | // 472 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0); 473 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 474 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 475 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 476 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 477 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 478 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 479 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 480 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 481 | // 482 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8); 483 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 484 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 485 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 486 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 487 | // 
MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 488 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 489 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 490 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 491 | // 492 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16); 493 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 494 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 495 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 496 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 497 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 498 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 499 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 500 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 501 | // 502 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 24); 503 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 504 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 505 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 506 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 507 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 508 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 509 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 510 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 511 | // 512 | // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 32); 513 | // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 514 | // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 515 | // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 516 | // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 517 | // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]); 518 | // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]); 519 | // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]); 520 | // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]); 521 | // 522 | // 523 | // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 524 | // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 525 | // s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32)); 526 | // s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48)); 527 | // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64)); 528 | // s[1][1] = 
_mm_xor_si128(s[1][1], LOAD(in + 80)); 529 | // s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96)); 530 | // s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112)); 531 | // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128)); 532 | // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144)); 533 | // s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160)); 534 | // s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176)); 535 | // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192)); 536 | // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208)); 537 | // s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224)); 538 | // s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240)); 539 | // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 256)); 540 | // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 272)); 541 | // s[4][2] = _mm_xor_si128(s[4][2], LOAD(in + 288)); 542 | // s[4][3] = _mm_xor_si128(s[4][3], LOAD(in + 304)); 543 | // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 320)); 544 | // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 336)); 545 | // s[5][2] = _mm_xor_si128(s[5][2], LOAD(in + 352)); 546 | // s[5][3] = _mm_xor_si128(s[5][3], LOAD(in + 368)); 547 | // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 384)); 548 | // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 400)); 549 | // s[6][2] = _mm_xor_si128(s[6][2], LOAD(in + 416)); 550 | // s[6][3] = _mm_xor_si128(s[6][3], LOAD(in + 432)); 551 | // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 448)); 552 | // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 464)); 553 | // s[7][2] = _mm_xor_si128(s[7][2], LOAD(in + 480)); 554 | // s[7][3] = _mm_xor_si128(s[7][3], LOAD(in + 496)); 555 | // 556 | // TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 557 | // TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]); 558 | // TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]); 559 | // TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]); 560 | // TRUNCSTORE(out + 128, s[4][0], s[4][1], s[4][2], s[4][3]); 561 | // TRUNCSTORE(out + 160, s[5][0], s[5][1], s[5][2], s[5][3]); 562 | // 
TRUNCSTORE(out + 192, s[6][0], s[6][1], s[6][2], s[6][3]); 563 | // TRUNCSTORE(out + 224, s[7][0], s[7][1], s[7][2], s[7][3]); 564 | } 565 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/haraka.h: -------------------------------------------------------------------------------- 1 | /* 2 | Optimized Implementations for Haraka256 and Haraka512 3 | */ 4 | #ifndef HARAKA_H_ 5 | #define HARAKA_H_ 6 | 7 | #include "immintrin.h" 8 | 9 | #define NUMROUNDS 5 10 | 11 | #define u64 unsigned long 12 | #define u128 __m128i 13 | 14 | u128 rc[40]; 15 | 16 | #define LOAD(src) _mm_load_si128((u128 *)(src)) 17 | #define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src) 18 | 19 | #define AES2(s0, s1, rci) \ 20 | s0 = _mm_aesenc_si128(s0, rc[rci]); \ 21 | s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ 22 | s0 = _mm_aesenc_si128(s0, rc[rci + 2]); \ 23 | s1 = _mm_aesenc_si128(s1, rc[rci + 3]); 24 | 25 | #define AES2_4x(s0, s1, s2, s3, rci) \ 26 | AES2(s0[0], s0[1], rci); \ 27 | AES2(s1[0], s1[1], rci); \ 28 | AES2(s2[0], s2[1], rci); \ 29 | AES2(s3[0], s3[1], rci); 30 | 31 | #define AES2_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \ 32 | AES2_4x(s0, s1, s2, s3, rci); \ 33 | AES2_4x(s4, s5, s6, s7, rci); 34 | 35 | #define AES4(s0, s1, s2, s3, rci) \ 36 | s0 = _mm_aesenc_si128(s0, rc[rci]); \ 37 | s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ 38 | s2 = _mm_aesenc_si128(s2, rc[rci + 2]); \ 39 | s3 = _mm_aesenc_si128(s3, rc[rci + 3]); \ 40 | s0 = _mm_aesenc_si128(s0, rc[rci + 4]); \ 41 | s1 = _mm_aesenc_si128(s1, rc[rci + 5]); \ 42 | s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \ 43 | s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \ 44 | 45 | #define AES4_4x(s0, s1, s2, s3, rci) \ 46 | AES4(s0[0], s0[1], s0[2], s0[3], rci); \ 47 | AES4(s1[0], s1[1], s1[2], s1[3], rci); \ 48 | AES4(s2[0], s2[1], s2[2], s2[3], rci); \ 49 | AES4(s3[0], s3[1], s3[2], s3[3], rci); 50 | 51 | #define AES4_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \ 52 | 
AES4_4x(s0, s1, s2, s3, rci); \ 53 | AES4_4x(s4, s5, s6, s7, rci); 54 | 55 | #define MIX2(s0, s1) \ 56 | tmp = _mm_unpacklo_epi32(s0, s1); \ 57 | s1 = _mm_unpackhi_epi32(s0, s1); \ 58 | s0 = tmp; 59 | 60 | #define MIX4(s0, s1, s2, s3) \ 61 | tmp = _mm_unpacklo_epi32(s0, s1); \ 62 | s0 = _mm_unpackhi_epi32(s0, s1); \ 63 | s1 = _mm_unpacklo_epi32(s2, s3); \ 64 | s2 = _mm_unpackhi_epi32(s2, s3); \ 65 | s3 = _mm_unpacklo_epi32(s0, s2); \ 66 | s0 = _mm_unpackhi_epi32(s0, s2); \ 67 | s2 = _mm_unpackhi_epi32(s1, tmp); \ 68 | s1 = _mm_unpacklo_epi32(s1, tmp); 69 | /* TRUNCSTORE writes 32 bytes at out: 64-bit element 1 of s0 and of s1, then 64-bit element 0 of s2 and of s3. The value cast is to the integer type u64, not u64*: [] binds tighter than the cast, so (s0)[1] is already the lane value (relies on GCC/Clang vector subscripting of __m128i), and casting it to a pointer only to assign it to an integer lvalue is an implicit pointer-to-integer conversion that newer compilers reject. */ 70 | #define TRUNCSTORE(out, s0, s1, s2, s3) \ 71 | *(u64*)(out) = (u64)(s0)[1]; \ 72 | *(u64*)(out + 8) = (u64)(s1)[1]; \ 73 | *(u64*)(out + 16) = (u64)(s2)[0]; \ 74 | *(u64*)(out + 24) = (u64)(s3)[0]; 75 | 76 | void load_constants(); 77 | void test_implementations(); 78 | 79 | void haraka256(unsigned char *out, const unsigned char *in); 80 | void haraka256_4x(unsigned char *out, const unsigned char *in); 81 | void haraka256_8x(unsigned char *out, const unsigned char *in); 82 | 83 | void haraka512(unsigned char *out, const unsigned char *in); 84 | void haraka512_4x(unsigned char *out, const unsigned char *in); 85 | void haraka512_8x(unsigned char *out, const unsigned char *in); 86 | 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/hash.c: -------------------------------------------------------------------------------- 1 | #include "params.h" 2 | #include "permute.h" 3 | #include "immintrin.h" 4 | #include "crypto_hash_blake256.h" 5 | #include "crypto_hash_blake512.h" 6 | #include "haraka.h" 7 | 8 | #include 9 | #include 10 | 11 | int varlen_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen) 12 | { 13 | //SHA256(in,inlen,out); 14 | crypto_hash_blake256(out,in,inlen); 15 | return 0; 16 | } 17 | 18 | int msg_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen) 19 | { 20 
| //SHA512(in,inlen,out); 21 | crypto_hash_blake512(out,in,inlen); 22 | return 0; 23 | } 24 | 25 | 26 | static const char *hashc = "expand 32-byte to 64-byte state!"; 27 | 28 | int hash_2n_n(unsigned char *out,const unsigned char *in) 29 | { 30 | #if HASH_BYTES != 32 31 | #error "Current code only supports 32-byte hashes" 32 | #endif 33 | 34 | unsigned char x[64]; 35 | int i; 36 | for(i=0;i<64;i++) 37 | { 38 | x[i] = in[i]; 39 | } 40 | haraka512(out,x); 41 | 42 | return 0; 43 | } 44 | 45 | int hash_2n_n_mask(unsigned char *out,const unsigned char *in, const unsigned char *mask) 46 | { 47 | unsigned char buf[2*HASH_BYTES]; 48 | int i; 49 | for(i=0;i<2*HASH_BYTES;i++) 50 | buf[i] = in[i] ^ mask[i]; 51 | return hash_2n_n(out, buf); 52 | } 53 | 54 | int hash_n_n(unsigned char *out,const unsigned char *in) 55 | { 56 | #if HASH_BYTES != 32 57 | #error "Current code only supports 32-byte hashes" 58 | #endif 59 | unsigned char x[32]; 60 | int i; 61 | 62 | for(i=0;i<32;i++) 63 | { 64 | x[i] = in[i]; 65 | } 66 | haraka256(out,x); 67 | 68 | return 0; 69 | } 70 | 71 | int hash_n_n_mask(unsigned char *out,const unsigned char *in, const unsigned char *mask) 72 | { 73 | unsigned char buf[HASH_BYTES]; 74 | int i; 75 | for(i=0;i 6 | #include 7 | 8 | static void expand_seed(unsigned char outseeds[HORST_T*HORST_SKBYTES], const unsigned char inseed[SEED_BYTES]) 9 | { 10 | prg(outseeds, HORST_T*HORST_SKBYTES, inseed); 11 | } 12 | 13 | int horst_sign(unsigned char *sig, unsigned char pk[HASH_BYTES], unsigned long long *sigbytes, 14 | const unsigned char *m, unsigned long long mlen, 15 | const unsigned char seed[SEED_BYTES], 16 | const unsigned char masks[2*HORST_LOGT*HASH_BYTES], 17 | const unsigned char m_hash[MSGHASH_BYTES]) 18 | { 19 | unsigned char sk[HORST_T*HORST_SKBYTES]; 20 | unsigned int idx; 21 | int i,j,k; 22 | int sigpos = 0; 23 | 24 | unsigned char tree[(2*HORST_T-1)*HASH_BYTES]; /* replace by something more memory-efficient? 
*/ 25 | 26 | expand_seed(sk, seed); 27 | 28 | // Build the whole tree and save it 29 | #if HORST_SKBYTES != HASH_BYTES 30 | #error "Need to have HORST_SKBYTES == HASH_BYTES" 31 | #endif 32 | 33 | // Generate pk leaves non parallel 34 | //for(i=0;i>1; // parent node 150 | 151 | if(!(idx&1)) 152 | { 153 | hash_2n_n_mask(buffer,buffer,masks+2*(j-1)*HASH_BYTES); 154 | for(k=0;k>1; // parent node 167 | hash_2n_n_mask(buffer,buffer,masks+2*(HORST_LOGT-7)*HASH_BYTES); 168 | 169 | for(k=0;k 2 | #include "wmmintrin.h" 3 | #include "emmintrin.h" 4 | #include "smmintrin.h" 5 | 6 | #define CHACHA_ROUNDS 12 7 | 8 | #define U32V(x) \ 9 | ((x) & 0xffffffff) 10 | 11 | #define ROTL32(x,c) \ 12 | ((((x) << c) | ((x) >> (32-c))) & 0xffffffff) 13 | 14 | #define ROTATE(v,c) (ROTL32(v,c)) 15 | #define XOR(v,w) ((v) ^ (w)) 16 | #define PLUS(v,w) (U32V((v) + (w))) 17 | #define PLUSONE(v) (PLUS((v),1)) 18 | 19 | #define QUARTERROUND(a,b,c,d) \ 20 | x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ 21 | x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ 22 | x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ 23 | x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); 24 | 25 | 26 | void chacha_permute(unsigned char out[64],const unsigned char in [64]) 27 | { 28 | uint32_t x[16]; 29 | int i; 30 | 31 | for (i = 0;i < 16;i++) 32 | { 33 | x[i] = in[4*i+3]; 34 | x[i] <<= 8; 35 | x[i] |= in[4*i+2]; 36 | x[i] <<= 8; 37 | x[i] |= in[4*i+1]; 38 | x[i] <<= 8; 39 | x[i] |= in[4*i+0]; 40 | } 41 | 42 | for (i = CHACHA_ROUNDS;i > 0;i -= 2) 43 | { 44 | QUARTERROUND( 0, 4, 8,12) 45 | QUARTERROUND( 1, 5, 9,13) 46 | QUARTERROUND( 2, 6,10,14) 47 | QUARTERROUND( 3, 7,11,15) 48 | QUARTERROUND( 0, 5,10,15) 49 | QUARTERROUND( 1, 6,11,12) 50 | QUARTERROUND( 2, 7, 8,13) 51 | QUARTERROUND( 3, 4, 9,14) 52 | } 53 | 54 | // for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]); // XXX: Bad idea if we later xor the input to the state? 
55 | for (i = 0;i < 16;++i) 56 | { 57 | out[4*i] = x[i] & 0xff; 58 | out[4*i+1] = (x[i] >> 8) & 0xff; 59 | out[4*i+2] = (x[i] >> 16) & 0xff; 60 | out[4*i+3] = (x[i] >> 24) & 0xff; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/permute.h: -------------------------------------------------------------------------------- 1 | #ifndef PERMUTE_H 2 | #define PERMUTE_H 3 | 4 | void chacha_permute(unsigned char output[64],const unsigned char input [64]); 5 | 6 | void load_rc(); 7 | void haraka512256(unsigned char out[32], const unsigned char in[64]); 8 | void haraka256256(unsigned char out[32], const unsigned char in[32]); 9 | void haraka512256_8x(unsigned char out[32*8], const unsigned char in[64*8]); 10 | void haraka256256_8x(unsigned char out[32*8], const unsigned char in[32*8]); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/prg.c: -------------------------------------------------------------------------------- 1 | #include "crypto_stream_chacha12.h" 2 | #include "params.h" 3 | #include "prg.h" 4 | 5 | static unsigned char nonce[crypto_stream_chacha12_NONCEBYTES] = {0}; 6 | 7 | #if crypto_stream_chacha12_KEYBYTES != SEED_BYTES 8 | #error "SEED_BYTES needs to match CRYPTO_STREAM_KEYBYTES for this implementation" 9 | #endif 10 | 11 | void prg(unsigned char *r, unsigned long long rlen, const unsigned char key[SEED_BYTES]) 12 | { 13 | crypto_stream_chacha12(r,rlen,nonce,key); 14 | } 15 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/prg.h: -------------------------------------------------------------------------------- 1 | #ifndef PRG_H 2 | #define PRG_H 3 | 4 | #include "params.h" 5 | 6 | void prg(unsigned char *r, unsigned long long rlen, const unsigned char key[SEED_BYTES]); 7 | 8 | #endif 9 | 
-------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/settings.h: -------------------------------------------------------------------------------- 1 | #ifndef SETTINGS_H 2 | #define SETTINGS_H 3 | 4 | #ifndef MPAR 5 | #define MPAR (1) 6 | #endif 7 | 8 | #ifndef ROUNDS 9 | #define ROUNDS (4) 10 | #endif 11 | 12 | #ifndef AES_PER_ROUND 13 | #define AES_PER_ROUND (2) 14 | #endif 15 | 16 | #ifndef MIX_PER_ROUND 17 | #define MIX_PER_ROUND (1) 18 | #endif 19 | 20 | #define DEBUG (0) 21 | 22 | #define MIX_METHOD (0) // 0 : blend 23 | // 1 : shuffle + xor 24 | // 2 : AESQ method 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/sign.c: -------------------------------------------------------------------------------- 1 | #include "crypto_sign.h" 2 | #include 3 | #include 4 | 5 | #include "api.h" 6 | #include "randombytes.h" 7 | #include "zerobytes.h" 8 | #include "params.h" 9 | #include "wots.h" 10 | #include "horst.h" 11 | #include "hash.h" 12 | #include "crypto_hash_blake512.h" 13 | #include "permute.h" 14 | 15 | #define BIGINT_BYTES ((TOTALTREE_HEIGHT-SUBTREE_HEIGHT+7)/8) 16 | 17 | #if (TOTALTREE_HEIGHT-SUBTREE_HEIGHT) > 64 18 | #error "TOTALTREE_HEIGHT-SUBTREE_HEIGHT must be at most 64" 19 | #endif 20 | 21 | typedef struct{ 22 | int level; 23 | unsigned long long subtree; 24 | int subleaf; 25 | } leafaddr; 26 | 27 | 28 | static void get_seed(unsigned char seed[SEED_BYTES], const unsigned char *sk, const leafaddr *a) 29 | { 30 | #if (N_LEVELS > 15) && (N_LEVELS < 8) 31 | #error "Need to have 8 <= N_LEVELS <= 15" 32 | #endif 33 | 34 | #if SUBTREE_HEIGHT != 5 35 | #error "Need to have SUBTREE_HEIGHT == 5" 36 | #endif 37 | 38 | #if TOTALTREE_HEIGHT != 60 39 | #error "Need to have TOTALTREE_HEIGHT == 60" 40 | #endif 41 | unsigned char buffer[SEED_BYTES+8]; 42 | unsigned long long t; 43 | int i; 44 | 45 | 
for(i=0;ilevel; 50 | //55 bits to encode subtree 51 | t |= a->subtree << 4; 52 | //5 bits to encode leaf 53 | t |= (unsigned long long)a->subleaf << 59; 54 | 55 | for(i=0;i<8;i++) 56 | buffer[SEED_BYTES+i] = (t >> 8*i) & 0xff; 57 | 58 | #if SEED_BYTES != HASH_BYTES 59 | #error "Need to have SEED_BYTES == HASH_BYTES" 60 | #endif 61 | varlen_hash(seed,buffer,SEED_BYTES+8); 62 | } 63 | 64 | 65 | /*static void l_tree(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks) 66 | { 67 | int l = WOTS_L; 68 | int i,j = 0; 69 | for(i=0;i>1);j++) 72 | hash_2n_n_mask(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, masks+i*2*HASH_BYTES); 73 | 74 | if(l&1) 75 | { 76 | memcpy(wots_pk+(l>>1)*HASH_BYTES,wots_pk+(l-1)*HASH_BYTES, HASH_BYTES); 77 | l=(l>>1)+1; 78 | } 79 | else 80 | l=(l>>1); 81 | } 82 | memcpy(leaf,wots_pk,HASH_BYTES); 83 | } 84 | */ 85 | static void l_tree(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks) 86 | { 87 | int l = WOTS_L; 88 | int i,j = 0; 89 | for(i=0;i>1);j+=8) 93 | hash_2n_n_mask_8x(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, HASH_BYTES, 2*HASH_BYTES, masks+i*2*HASH_BYTES); 94 | 95 | for(;j < (l>>1);j++) 96 | hash_2n_n_mask(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, masks+i*2*HASH_BYTES); 97 | 98 | if(l&1) 99 | { 100 | memcpy(wots_pk+(l>>1)*HASH_BYTES,wots_pk+(l-1)*HASH_BYTES, HASH_BYTES); 101 | l=(l>>1)+1; 102 | } 103 | else 104 | l=(l>>1); 105 | } 106 | memcpy(leaf,wots_pk,HASH_BYTES); 107 | } 108 | 109 | static void l_tree_8x(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks) 110 | { 111 | int l = WOTS_L; 112 | int i,j,k = 0; 113 | 114 | for(i=0;i>1);j++) 117 | hash_2n_n_mask_8x(wots_pk+j*HASH_BYTES, wots_pk+j*2*HASH_BYTES, 118 | WOTS_L*HASH_BYTES, WOTS_L*HASH_BYTES, 119 | masks+i*2*HASH_BYTES); 120 | if(l&1) 121 | { 122 | for (k = 0; k < 8; k++) 123 | memcpy(wots_pk+(l>>1)*HASH_BYTES + k*WOTS_L*HASH_BYTES, 124 | wots_pk+(l -1)*HASH_BYTES + k*WOTS_L*HASH_BYTES, HASH_BYTES); 125 | 
l=(l>>1)+1; 126 | } else { 127 | l=(l>>1); 128 | } 129 | } 130 | for (k = 0; k < 8; k++) 131 | memcpy(leaf + k*HASH_BYTES, wots_pk + k*WOTS_L*HASH_BYTES, HASH_BYTES); 132 | } 133 | 134 | static void gen_leaf_wots(unsigned char leaf[HASH_BYTES], const unsigned char *masks, const unsigned char *sk, const leafaddr *a) 135 | { 136 | unsigned char seed[SEED_BYTES]; 137 | unsigned char pk[WOTS_L*HASH_BYTES]; 138 | 139 | get_seed(seed, sk, a); 140 | wots_pkgen(pk, seed, masks); 141 | 142 | l_tree(leaf, pk, masks); 143 | } 144 | 145 | 146 | static void treehash(unsigned char *node, int height, const unsigned char *sk, const leafaddr *leaf, const unsigned char *masks) 147 | { 148 | 149 | leafaddr a = *leaf; 150 | int lastnode,i; 151 | unsigned char stack[(height+1)*HASH_BYTES]; 152 | unsigned int stacklevels[height+1]; 153 | unsigned int stackoffset=0; 154 | unsigned int maskoffset =0; 155 | 156 | lastnode = a.subleaf+(1<1 && stacklevels[stackoffset-1] == stacklevels[stackoffset-2]) 164 | { 165 | //MASKS 166 | maskoffset = 2*(stacklevels[stackoffset-1] + WOTS_LOG_L)*HASH_BYTES; 167 | hash_2n_n_mask(stack+(stackoffset-2)*HASH_BYTES,stack+(stackoffset-2)*HASH_BYTES, 168 | masks+maskoffset); 169 | stacklevels[stackoffset-2]++; 170 | stackoffset--; 171 | } 172 | } 173 | for(i=0;i>= 1; 202 | if(leafidx&1) 203 | { 204 | hash_2n_n_mask(buffer+HASH_BYTES,buffer,masks+2*(WOTS_LOG_L+i)*HASH_BYTES); 205 | for(j=0;j 0; i>>=1) 244 | { 245 | for (j = 0; j < i; j+=2) 246 | hash_2n_n_mask(tree + (i>>1)*HASH_BYTES + (j>>1) * HASH_BYTES, 247 | tree + i*HASH_BYTES + j * HASH_BYTES, 248 | masks+2*(WOTS_LOG_L + level)*HASH_BYTES); 249 | 250 | level++; 251 | } 252 | 253 | 254 | idx = a->subleaf; 255 | 256 | // copy authpath 257 | for(i=0;i>i)*HASH_BYTES + ((idx >> i) ^ 1) * HASH_BYTES, HASH_BYTES); 259 | 260 | // copy root 261 | memcpy(root, tree+HASH_BYTES, HASH_BYTES); 262 | } 263 | 264 | 265 | /* 266 | * Format pk: [|N_MASKS*HASH_BYTES| Bitmasks || root] 267 | */ 268 | int 
crypto_sign_keypair(unsigned char *pk, unsigned char *sk) 269 | { 270 | leafaddr a; 271 | 272 | randombytes(sk,CRYPTO_SECRETKEYBYTES); 273 | memcpy(pk,sk+SEED_BYTES,N_MASKS*HASH_BYTES); 274 | 275 | // Initialization of top-subtree address 276 | a.level = N_LEVELS - 1; 277 | a.subtree = 0; 278 | a.subleaf = 0; 279 | 280 | // Construct top subtree 281 | treehash(pk+(N_MASKS*HASH_BYTES), SUBTREE_HEIGHT, sk, &a, pk); 282 | return 0; 283 | } 284 | 285 | 286 | int crypto_sign(unsigned char *sm,unsigned long long *smlen, const unsigned char *m,unsigned long long mlen, const unsigned char *sk) 287 | { 288 | leafaddr a; 289 | unsigned long long i; 290 | unsigned long long leafidx; 291 | unsigned char R[MESSAGE_HASH_SEED_BYTES]; 292 | unsigned char m_h[MSGHASH_BYTES]; 293 | unsigned long long rnd[8]; 294 | unsigned long long horst_sigbytes; 295 | unsigned char root[HASH_BYTES]; 296 | unsigned char seed[SEED_BYTES]; 297 | unsigned char masks[N_MASKS*HASH_BYTES]; 298 | unsigned char *pk; 299 | unsigned char tsk[CRYPTO_SECRETKEYBYTES]; 300 | 301 | for(i=0;i0;i--) 311 | scratch[SK_RAND_SEED_BYTES+i-1] = m[i-1]; 312 | // Copy secret random seed to scratch 313 | memcpy(scratch, tsk + CRYPTO_SECRETKEYBYTES - SK_RAND_SEED_BYTES, SK_RAND_SEED_BYTES); 314 | 315 | crypto_hash_blake512((unsigned char*)rnd, scratch, SK_RAND_SEED_BYTES + mlen); //XXX: Why Blake 512? 316 | 317 | // wipe sk 318 | zerobytes(scratch,SK_RAND_SEED_BYTES); 319 | 320 | #if TOTALTREE_HEIGHT != 60 321 | #error "Implemented for TOTALTREE_HEIGHT == 60!" 322 | #endif 323 | 324 | leafidx = rnd[0] & 0xfffffffffffffff; 325 | 326 | #if MESSAGE_HASH_SEED_BYTES != 32 327 | #error "Implemented for MESSAGE_HASH_SEED_BYTES == 32!" 
328 | #endif 329 | memcpy(R, &rnd[2], MESSAGE_HASH_SEED_BYTES); 330 | 331 | // prepare msg_hash 332 | scratch = sm + CRYPTO_BYTES - MESSAGE_HASH_SEED_BYTES - CRYPTO_PUBLICKEYBYTES; 333 | 334 | // cpy R 335 | memcpy(scratch, R, MESSAGE_HASH_SEED_BYTES); 336 | 337 | // construct and cpy pk 338 | leafaddr a; 339 | a.level = N_LEVELS - 1; 340 | a.subtree = 0; 341 | a.subleaf=0; 342 | 343 | pk = scratch + MESSAGE_HASH_SEED_BYTES; 344 | 345 | memcpy(pk, tsk+SEED_BYTES, N_MASKS*HASH_BYTES); 346 | 347 | treehash(pk+(N_MASKS*HASH_BYTES), SUBTREE_HEIGHT, tsk, &a, pk); 348 | 349 | // message already on the right spot 350 | 351 | msg_hash(m_h, scratch, mlen + MESSAGE_HASH_SEED_BYTES + CRYPTO_PUBLICKEYBYTES); 352 | } 353 | 354 | a.level = N_LEVELS; // Use unique value $d$ for HORST address. 355 | a.subleaf = leafidx & ((1<> SUBTREE_HEIGHT; 357 | 358 | *smlen = 0; 359 | 360 | for(i=0; i> 8*i) & 0xff; 369 | 370 | sm += (TOTALTREE_HEIGHT+7)/8; 371 | *smlen += (TOTALTREE_HEIGHT+7)/8; 372 | 373 | get_seed(seed, tsk, &a); 374 | horst_sign(sm, root, &horst_sigbytes, m, mlen, seed, masks, m_h); 375 | 376 | sm += horst_sigbytes; 377 | *smlen += horst_sigbytes; 378 | 379 | for(i=0;i>= SUBTREE_HEIGHT; 394 | } 395 | 396 | zerobytes(tsk, CRYPTO_SECRETKEYBYTES); 397 | 398 | *smlen += mlen; 399 | 400 | return 0; 401 | } 402 | 403 | 404 | 405 | int crypto_sign_open(unsigned char *m,unsigned long long *mlen, const unsigned char *sm,unsigned long long smlen, const unsigned char *pk) 406 | { 407 | unsigned long long i; 408 | unsigned long long leafidx=0; 409 | unsigned char wots_pk[WOTS_L*HASH_BYTES]; 410 | unsigned char pkhash[HASH_BYTES]; 411 | unsigned char root[HASH_BYTES]; 412 | unsigned char sig[CRYPTO_BYTES]; 413 | unsigned char *sigp; 414 | unsigned char tpk[CRYPTO_PUBLICKEYBYTES]; 415 | 416 | if(smlen < CRYPTO_BYTES) 417 | return -1; 418 | 419 | unsigned char m_h[MSGHASH_BYTES]; 420 | 421 | for(i=0;i>= 5; 476 | 477 | sigp += SUBTREE_HEIGHT*HASH_BYTES; 478 | smlen -= 
SUBTREE_HEIGHT*HASH_BYTES; 479 | } 480 | 481 | for(i=0;i 6 | #include 7 | #include 8 | 9 | 10 | static void expand_seed(unsigned char outseeds[WOTS_L*HASH_BYTES], const unsigned char inseed[SEED_BYTES]) 11 | { 12 | prg(outseeds, WOTS_L*HASH_BYTES, inseed); 13 | } 14 | 15 | static void gen_chain(unsigned char out[HASH_BYTES], const unsigned char seed[HASH_BYTES], const unsigned char *masks, int chainlen) 16 | { 17 | int i,j; 18 | for(j=0;j> 4; 80 | c += WOTS_W - 1 - basew[i]; 81 | c += WOTS_W - 1 - basew[i+1]; 82 | } 83 | 84 | for( ;i>= 4; 88 | } 89 | 90 | expand_seed(sig, sk); 91 | /* 92 | for(i=0;i> 4; 112 | c += WOTS_W - 1 - basew[i]; 113 | c += WOTS_W - 1 - basew[i+1]; 114 | } 115 | 116 | for( ;i>= 4; 120 | } 121 | 122 | // as much as possible 8 times parallel 123 | for(i=0; (i+8) < WOTS_L; i+=8) 124 | { 125 | memcpy(tmp, sig+i*HASH_BYTES, 8*HASH_BYTES); 126 | 127 | int cnt = (1 << 8) - 1; 128 | 129 | // remove basew == 0 cases 130 | for (k = 0; k < 8; k++) 131 | if (0 == WOTS_W-1-basew[i+k]) 132 | { 133 | memcpy(pk+(i+k)*HASH_BYTES, tmp+k*HASH_BYTES, HASH_BYTES); 134 | cnt ^= (1 << k); 135 | } 136 | 137 | for(j=0; (j < WOTS_W) & (cnt > 0); ) 138 | { 139 | for (k = 0; k < 8; k++) 140 | for (l = 0; l < 32; l++) 141 | tmp[l + k*HASH_BYTES] ^= (masks+(basew[i+k]*HASH_BYTES)+(j*HASH_BYTES))[l]; 142 | 143 | hash_n_n_8x(tmp, tmp); 144 | 145 | j++; 146 | 147 | for (k = 0; k < 8; k++) 148 | if (j == WOTS_W-1-basew[i+k])// | (j == WOTS_W)) 149 | { 150 | memcpy(pk+(i+k)*HASH_BYTES, tmp+k*HASH_BYTES, HASH_BYTES); 151 | cnt ^= (1 << k); 152 | } 153 | } 154 | } 155 | 156 | // rest one by one... 
157 | for(; (i) < WOTS_L; i+=1) 158 | { 159 | memcpy(tmp, sig+i*HASH_BYTES, HASH_BYTES); 160 | 161 | for(j=0; j < WOTS_W-1-basew[i]; j++) 162 | hash_n_n_mask(tmp, tmp, masks+(basew[i]*HASH_BYTES)+(j*HASH_BYTES)); 163 | 164 | memcpy(pk+i*HASH_BYTES, tmp, HASH_BYTES); 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/wots.h: -------------------------------------------------------------------------------- 1 | #ifndef WOTS_H 2 | #define WOTS_H 3 | 4 | #include "params.h" 5 | 6 | void wots_pkgen(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]); 7 | 8 | void wots_sign(unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]); 9 | 10 | void wots_verify(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/zerobytes.c: -------------------------------------------------------------------------------- 1 | #include "zerobytes.h" 2 | 3 | unsigned char *zerobytes(unsigned char *r,unsigned long long n) 4 | { 5 | volatile unsigned char *p=r; 6 | while (n--) 7 | *(p++) = 0; 8 | return r; 9 | } 10 | -------------------------------------------------------------------------------- /supercop/crypto_sign/sphincs256haraka/aesni/zerobytes.h: -------------------------------------------------------------------------------- 1 | #ifndef ZEROBYTES_H 2 | #define ZEROBYTES_H 3 | 4 | unsigned char *zerobytes(unsigned char *r,unsigned long long n); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- 
/supercop/crypto_sign/try.c: -------------------------------------------------------------------------------- 1 | /* 2 | * crypto_sign/try.c version 20140423 3 | * D. J. Bernstein 4 | * Public domain. 5 | * Auto-generated by trygen.py; do not edit. 6 | */ 7 | 8 | #include "crypto_sign.h" 9 | #include "try.h" 10 | #include "randombytes.h" 11 | 12 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION; 13 | 14 | #define TUNE_BYTES 1536 15 | #ifdef SMALL 16 | #define MAXTEST_BYTES 128 17 | #else 18 | #define MAXTEST_BYTES 4096 19 | #endif 20 | #ifdef SMALL 21 | #define LOOPS 8 22 | #else 23 | #define LOOPS 64 24 | #endif 25 | 26 | static unsigned char *p; 27 | static unsigned char *s; 28 | static unsigned char *m; 29 | static unsigned char *c; 30 | static unsigned char *t; 31 | static unsigned char *p2; 32 | static unsigned char *s2; 33 | static unsigned char *m2; 34 | static unsigned char *c2; 35 | static unsigned char *t2; 36 | #define plen crypto_sign_PUBLICKEYBYTES 37 | #define slen crypto_sign_SECRETKEYBYTES 38 | unsigned long long mlen; 39 | unsigned long long clen; 40 | unsigned long long tlen; 41 | 42 | void preallocate(void) 43 | { 44 | #ifdef RAND_R_PRNG_NOT_SEEDED 45 | RAND_status(); 46 | #endif 47 | } 48 | 49 | void allocate(void) 50 | { 51 | unsigned long long alloclen = 0; 52 | if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES; 53 | if (alloclen < MAXTEST_BYTES + crypto_sign_BYTES) alloclen = MAXTEST_BYTES + crypto_sign_BYTES; 54 | if (alloclen < crypto_sign_PUBLICKEYBYTES) alloclen = crypto_sign_PUBLICKEYBYTES; 55 | if (alloclen < crypto_sign_SECRETKEYBYTES) alloclen = crypto_sign_SECRETKEYBYTES; 56 | p = alignedcalloc(alloclen); 57 | s = alignedcalloc(alloclen); 58 | m = alignedcalloc(alloclen); 59 | c = alignedcalloc(alloclen); 60 | t = alignedcalloc(alloclen); 61 | p2 = alignedcalloc(alloclen); 62 | s2 = alignedcalloc(alloclen); 63 | m2 = alignedcalloc(alloclen); 64 | c2 = alignedcalloc(alloclen); 65 | t2 = alignedcalloc(alloclen); 66 | } 
67 | 68 | void predoit(void) 69 | { 70 | crypto_sign_keypair(p,s); 71 | mlen = TUNE_BYTES; 72 | clen = 0; 73 | randombytes(m,mlen); 74 | } 75 | 76 | void doit(void) 77 | { 78 | crypto_sign(c,&clen,m,mlen,s); 79 | crypto_sign_open(t,&tlen,c,clen,p); 80 | } 81 | 82 | void test(void) 83 | { 84 | unsigned long long loop; 85 | 86 | for (loop = 0;loop < LOOPS;++loop) { 87 | mlen = myrandom() % (MAXTEST_BYTES + 1); 88 | 89 | output_prepare(p2,p,plen); 90 | output_prepare(s2,s,slen); 91 | if (crypto_sign_keypair(p,s) != 0) fail("crypto_sign_keypair returns nonzero"); 92 | checksum(p,plen); 93 | checksum(s,slen); 94 | output_compare(p2,p,plen,"crypto_sign_keypair"); 95 | output_compare(s2,s,slen,"crypto_sign_keypair"); 96 | 97 | clen = mlen + crypto_sign_BYTES; 98 | output_prepare(c2,c,clen); 99 | input_prepare(m2,m,mlen); 100 | memcpy(s2,s,slen); 101 | double_canary(s2,s,slen); 102 | if (crypto_sign(c,&clen,m,mlen,s) != 0) fail("crypto_sign returns nonzero"); 103 | if (clen < mlen) fail("crypto_sign returns smaller output than input"); 104 | if (clen > mlen + crypto_sign_BYTES) fail("crypto_sign returns more than crypto_sign_BYTES extra bytes"); 105 | checksum(c,clen); 106 | output_compare(c2,c,clen,"crypto_sign"); 107 | input_compare(m2,m,mlen,"crypto_sign"); 108 | input_compare(s2,s,slen,"crypto_sign"); 109 | 110 | #if crypto_sign_DETERMINISTIC == 1 111 | double_canary(c2,c,clen); 112 | double_canary(m2,m,mlen); 113 | double_canary(s2,s,slen); 114 | if (crypto_sign(c2,&clen,m2,mlen,s2) != 0) fail("crypto_sign returns nonzero"); 115 | if (memcmp(c2,c,clen) != 0) fail("crypto_sign is nondeterministic"); 116 | #endif 117 | 118 | #if crypto_sign_DETERMINISTIC == 1 119 | double_canary(c2,c,clen); 120 | double_canary(m2,m,mlen); 121 | double_canary(s2,s,slen); 122 | if (crypto_sign(m2,&clen,m2,mlen,s) != 0) fail("crypto_sign with m=c overlap returns nonzero"); 123 | if (memcmp(m2,c,clen) != 0) fail("crypto_sign does not handle m=c overlap"); 124 | memcpy(m2,m,mlen); 125 | if 
(crypto_sign(s2,&clen,m,mlen,s2) != 0) fail("crypto_sign with s=c overlap returns nonzero"); 126 | if (memcmp(s2,c,clen) != 0) fail("crypto_sign does not handle s=c overlap"); 127 | memcpy(s2,s,slen); 128 | #endif 129 | 130 | tlen = clen; 131 | output_prepare(t2,t,tlen); 132 | memcpy(c2,c,clen); 133 | double_canary(c2,c,clen); 134 | memcpy(p2,p,plen); 135 | double_canary(p2,p,plen); 136 | if (crypto_sign_open(t,&tlen,c,clen,p) != 0) fail("crypto_sign_open returns nonzero"); 137 | if (tlen != mlen) fail("crypto_sign_open does not match mlen"); 138 | if (memcmp(t,m,mlen) != 0) fail("crypto_sign_open does not match m"); 139 | checksum(t,tlen); 140 | output_compare(t2,t,clen,"crypto_sign_open"); 141 | input_compare(c2,c,clen,"crypto_sign_open"); 142 | input_compare(p2,p,plen,"crypto_sign_open"); 143 | 144 | double_canary(t2,t,tlen); 145 | double_canary(c2,c,clen); 146 | double_canary(p2,p,plen); 147 | if (crypto_sign_open(t2,&tlen,c2,clen,p2) != 0) fail("crypto_sign_open returns nonzero"); 148 | if (memcmp(t2,t,tlen) != 0) fail("crypto_sign_open is nondeterministic"); 149 | 150 | double_canary(t2,t,tlen); 151 | double_canary(c2,c,clen); 152 | double_canary(p2,p,plen); 153 | if (crypto_sign_open(c2,&tlen,c2,clen,p) != 0) fail("crypto_sign_open with c=t overlap returns nonzero"); 154 | if (memcmp(c2,t,tlen) != 0) fail("crypto_sign_open does not handle c=t overlap"); 155 | memcpy(c2,c,clen); 156 | if (crypto_sign_open(p2,&tlen,c,clen,p2) != 0) fail("crypto_sign_open with p=t overlap returns nonzero"); 157 | if (memcmp(p2,t,tlen) != 0) fail("crypto_sign_open does not handle p=t overlap"); 158 | memcpy(p2,p,plen); 159 | 160 | c[myrandom() % clen] += 1 + (myrandom() % 255); 161 | if (crypto_sign_open(t,&tlen,c,clen,p) == 0) 162 | if ((tlen != mlen) || (memcmp(t,m,mlen) != 0)) 163 | fail("crypto_sign_open allows trivial forgeries"); 164 | c[myrandom() % clen] += 1 + (myrandom() % 255); 165 | if (crypto_sign_open(t,&tlen,c,clen,p) == 0) 166 | if ((tlen != mlen) || 
(memcmp(t,m,mlen) != 0)) 167 | fail("crypto_sign_open allows trivial forgeries"); 168 | c[myrandom() % clen] += 1 + (myrandom() % 255); 169 | if (crypto_sign_open(t,&tlen,c,clen,p) == 0) 170 | if ((tlen != mlen) || (memcmp(t,m,mlen) != 0)) 171 | fail("crypto_sign_open allows trivial forgeries"); 172 | } 173 | } 174 | --------------------------------------------------------------------------------