├── LICENSE
├── README.md
├── analysis
    ├── README.md
    ├── aesmilp.py
    ├── examples
    │   ├── aeslike.yaml
    │   └── haraka.yaml
    └── models
    │   ├── __init__.py
    │   ├── aeslike.py
    │   ├── haraka.py
    │   └── milpconstraints.py
├── code
    ├── c
    │   ├── aesni_optimized
    │   │   ├── Makefile
    │   │   ├── haraka.c
    │   │   ├── haraka.h
    │   │   ├── main.c
    │   │   └── timing.h
    │   ├── aesni_ref
    │   │   ├── Makefile
    │   │   ├── haraka.c
    │   │   ├── helpers.c
    │   │   └── helpers.h
    │   └── neon
    │   │   ├── haraka.c
    │   │   └── haraka.h
    └── python
    │   └── ref.py
└── supercop
    └── crypto_sign
        ├── measure.c
        ├── measure.c~
        ├── sphincs256haraka
            └── aesni
            │   ├── api.h
            │   ├── consts.c
            │   ├── haraka.c
            │   ├── haraka.h
            │   ├── hash.c
            │   ├── hash.h
            │   ├── horst.c
            │   ├── horst.h
            │   ├── horst.log
            │   ├── implementors
            │   ├── params.h
            │   ├── permute.c
            │   ├── permute.h
            │   ├── prg.c
            │   ├── prg.h
            │   ├── settings.h
            │   ├── sign.c
            │   ├── wots.c
            │   ├── wots.h
            │   ├── zerobytes.c
            │   └── zerobytes.h
        └── try.c


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 kste
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Haraka v2
 2 | 
 3 | Haraka v2 is a secure and efficient short-input (256 or 512 bits) hash function, designed 
 4 | to be very fast on modern platforms which support AES-NI. One of the main applications 
 5 | for such a design is the use in hash-based signature schemes like XMSS and SPHINCS.
 6 | For more information see our [paper](https://eprint.iacr.org/2016/098).
 7 | 
 8 | This repository provides various implementations in [code/](https://github.com/kste/haraka/tree/master/code).
 9 | In [code/c/aesni_optimized](https://github.com/kste/haraka/tree/master/code/c/aesni_optimized), one can find
10 | an implementation processing 4 or 8 blocks in parallel.
11 | 
12 | 
13 | ## Performance
14 | 
15 | The performance is measured in cycles per byte (cpb) processed. The following numbers
16 | correspond to Intel Skylake using the [optimized implementation](https://github.com/kste/haraka/tree/master/code/c/aesni_optimized).
17 | 
18 | Variant | 1x | 4x | 8x
19 | ------- | ------- | ------- | -------
20 | Haraka256 | 0.72 cpb | 0.63 cpb  | 0.63 cpb 
21 | Haraka512 | 1.02 cpb  | 0.72 cpb  | 0.72 cpb 
22 | 
23 | ## SPHINCS
24 | [SPHINCS](https://sphincs.cr.yp.to/) is a post-quantum secure hash-based digital signature scheme. The performance
25 | of SPHINCS strongly correlates with the performance of the underlying hash function and can be significantly
26 | improved by using an optimized construction.
27 | 
28 | A SPHINCS implementation instantiated with Haraka can be found in [supercop/crypto_sign/](https://github.com/kste/haraka/tree/master/supercop/crypto_sign/sphincs256haraka/aesni), which can also be used for benchmarks with 
29 | [Supercop](https://bench.cr.yp.to/supercop.html).
30 | 
 31 | This optimized implementation has the following performance figures on Intel Skylake:
32 | 
33 | Operation | Cycles
34 | ------------ | -------------
35 | KeyGeneration | 1.340.338
36 | Signing | 20.782.894
37 | Verify | 415.586
38 | 
39 | 
40 | ## Reference
41 | 
42 | Haraka v2 - Efficient Short-Input Hashing for Post-Quantum Applications
43 | 
44 | Stefan Kölbl and Martin M. Lauridsen and Florian Mendel and Christian Rechberger
45 | https://eprint.iacr.org/2016/098
46 | 


--------------------------------------------------------------------------------
/analysis/README.md:
--------------------------------------------------------------------------------
 1 | # Haraka - Analysis
 2 | 
 3 | This folder contains python scripts to construct the mixed integer linear
 4 | programming (MILP) model used in the security analysis of Haraka.
 5 | 
 6 | ## Examples
 7 | Count the number of active S-boxes for AES-like designs. Parameters like the number of rounds or state dimensions
 8 | can be specified in the *.yaml file.
 9 | ```
10 | python3 aesmilp.py --sbox --config examples/aeslike.yaml
11 | ```
12 | 
13 | Finding the optimal truncated differential attack for Haraka.
14 | ```
15 | python3 aesmilp.py --truncated --config examples/haraka.yaml
16 | ```
17 | 
18 | For more details on this see our paper.
19 | 


--------------------------------------------------------------------------------
/analysis/aesmilp.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A tool to find the minimum number of active S-boxes for AES-like ciphers and
  3 | Haraka-like designs. It can also be used to find the optimal truncated 
  4 | differential attack for Haraka.
  5 | 
  6 | The gurobi python interface is required to run this code http://www.gurobi.com/
  7 | """
  8 | 
  9 | from argparse import ArgumentParser, RawTextHelpFormatter
 10 | from models import aeslike, haraka
 11 | from gurobipy import *
 12 | 
 13 | import yaml
 14 | 
# Gurobi console logging is switched on by default (LogToConsole=1);
# the --verb command-line option handled in main() can override this value.
setParam("LogToConsole", 1)
 17 | 
 18 | def activesboxharaka():
 19 |     config = {"rounds": 1,
 20 |               "wordsize": 8,
 21 |               "branchnumber": 5,
 22 |               "statedimension": 4,
 23 |               "aesstates": 4,
 24 |               "aesrounds": 2,
 25 |               "collisiononly": False,
 26 |               "mixlayer": "mix",
 27 |               "securitymodel": "sbox"}
 28 | 
 29 |     print("Rounds", "S-boxes", sep="\t")
 30 |     for num_rounds in range(1, 8):
 31 |         print(num_rounds, end='')
 32 |         for aes_rounds in range(1, 6):
 33 |             config["rounds"] = num_rounds
 34 |             config["aesrounds"] = aes_rounds
 35 |             solved_model = solvemodel(haraka.buildmodel(config))
 36 |             print(" ", round(solved_model.ObjVal), end="")
 37 |         print("")
 38 | 
 39 | def findminactiveincreasing():
 40 |     """
 41 |     Example for finding minimum active S-box for increasing number of
 42 |     rounds.
 43 |     """
 44 |     config = {"rounds": 1,
 45 |               "wordsize": 8,
 46 |               "branchnumber": 5,
 47 |               "statedimension": 4}
 48 | 
 49 |     print("Rounds", "S-boxes", sep="\t")
 50 |     for num_rounds in range(1, 11):
 51 |         config["rounds"] = num_rounds
 52 |         solved_model = solvemodel(aeslike.buildmodel(config))
 53 |         print(num_rounds, round(solved_model.ObjVal), sep="\t")
 54 |     return
 55 | 
 56 | def findminactivesbox(config):
 57 |     """
 58 |     Example which finds the minimum number of active S-boxes for AES like
 59 |     ciphers, with the parameters given in the config file.
 60 |     """
 61 |     if config["name"] == "aeslike":
 62 |         model = aeslike.buildmodel(config)
 63 |         solved_model = solvemodel(model)
 64 |         aeslike.printmodel(solved_model, config)
 65 |     elif config["name"] == "haraka":
 66 |         model = haraka.buildmodel(config)
 67 |         solved_model = solvemodel(model)
 68 |         haraka.printmodel(solved_model, config)
 69 |     return
 70 | 
 71 | def harakatruncated(config):
 72 |     """
 73 |     Find best attack in our truncated model for Haraka.
 74 |     """
 75 |     num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1
 76 | 
 77 |     # Iterate over all possible starting states for the attack
 78 |     best_attack = 999999
 79 |     best_round = -1
 80 |     best_model = 0
 81 | 
 82 |     for rnd in range(num_states - 1):
 83 |         if haraka.isAESround(rnd, config["aesrounds"]):
 84 |             config["attackerstart"] = rnd
 85 |             model = haraka.buildmodel(config)
 86 |             attack_costs = round(solvemodel(model).objVal)
 87 |             print("Subround {} - Best Attack: {}".format(rnd, attack_costs))
 88 |             if attack_costs < best_attack:
 89 |                 best_attack = attack_costs
 90 |                 best_round = rnd
 91 |                 best_model = model
 92 | 
 93 |     print("Found best attack in round {} with costs {}".format(best_round,
 94 |                                                                best_attack))
 95 |     haraka.printmodel(best_model, config)
 96 |     return
 97 | 
 98 | 
 99 | def solvemodel(gurobi_model):
100 |     """
101 |     Solve model and return.
102 |     """
103 |     try:
104 |         gurobi_model.update()
105 |         gurobi_model.write('haraka.lp')
106 |         gurobi_model.optimize()
107 |     except GurobiError:
108 |         print("Error when solving!")
109 |         print(GurobiError)
110 |     return gurobi_model
111 | 
def main():
    """
    Parse the command line, load the YAML config file and run the
    requested analysis (--sbox and/or --truncated).

    Exits through parser.error() when an analysis is requested without a
    usable --config file.
    """
    parser = ArgumentParser(description="todo",
                            formatter_class=RawTextHelpFormatter)
    parser.add_argument('--config', nargs=1, help="Use a yaml input file to "
                                                  "read the parameters")
    parser.add_argument('--sbox', action="store_true",
                        help="Count the number of active S-boxes.")
    parser.add_argument('--truncated', action="store_true",
                        help="Use the truncated model for security analysis.")
    parser.add_argument('--verb', nargs=1,
                        help="Set verbosity of the Gurobi solver.")
    args = parser.parse_args()

    params = {}

    #activesboxharaka()

    if args.verb:
        setParam("LogToConsole", int(args.verb[0]))

    if args.config:
        with open(args.config[0], 'r') as config:
            # safe_load only builds plain data types. yaml.load without an
            # explicit Loader is rejected by PyYAML 6 and can construct
            # arbitrary objects in older releases.
            params = yaml.safe_load(config)

    if (args.sbox or args.truncated) and not params:
        # Both analyses index the config right away; fail with a usage
        # message instead of a bare KeyError/TypeError.
        parser.error("--sbox and --truncated require a non-empty --config file")

    if args.sbox:
        findminactivesbox(params)

    if args.truncated:
        harakatruncated(params)
144 | 
145 | 
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
148 | 


--------------------------------------------------------------------------------
/analysis/examples/aeslike.yaml:
--------------------------------------------------------------------------------
1 | # Config file for AES like primitive
2 | ---
3 | name: aeslike
4 | rounds: 4
5 | wordsize: 8
6 | branchnumber: 9
7 | statedimension: 8
8 | ...


--------------------------------------------------------------------------------
/analysis/examples/haraka.yaml:
--------------------------------------------------------------------------------
 1 | # Config file for Haraka-512/256
 2 | ---
 3 | name: haraka
 4 | rounds: 5
 5 | statedimension: 4
 6 | branchnumber: 5
 7 | wordsize: 8
 8 | aesstates: 4
 9 | aesrounds: 2
10 | collisiononly: Yes
11 | securitymodel: "sbox" # Count active S-boxes "sbox", 
12 |                            # Truncated model "truncated"
13 | attackerpower: 2 # Rounds the attacker can control in each direction
14 | mixlayer: "mix"  # Blend "blend"
15 |                    # Mix "mix"                    
16 | ...


--------------------------------------------------------------------------------
/analysis/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kste/haraka/74d7f4e0a2c74f844939e1654b2f6741a437c507/analysis/models/__init__.py


--------------------------------------------------------------------------------
/analysis/models/aeslike.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This script constructs a MILP model for AES-like primitives, which can aid in
 3 | finding optimal parameter sets against differential attacks by counting the
 4 | minimum number of active S-boxes in a differential trail.
 5 | 
 6 | It uses the Gurobi Solver to solve the MILP instance, hence you need
 7 | a Gurobi license (free for Academic use).
 8 | """
 9 | 
10 | from gurobipy import *
11 | from models.milpconstraints import addAESrndconstraints
12 | 
13 | 
def buildmodel(config):
    """
    Build the Gurobi model that counts active S-boxes of an AES-like design.

    Reads "statedimension", "rounds" and "branchnumber" from config and
    returns the (not yet optimized) model whose objective is the variable
    named "Active S-boxes".
    """
    model = Model("aeslike")

    dim = config["statedimension"]
    rounds = config["rounds"]
    branch = config["branchnumber"]
    words = dim * dim

    # One binary activity flag per state word for rounds + 1 states, and
    # one dummy variable per column and round for MixColumns.
    state_flags = [model.addVar(vtype=GRB.BINARY, name="x[{}]".format(idx))
                   for idx in range((rounds + 1) * words)]
    mc_dummies = [model.addVar(name="dummy[{}]".format(idx))
                  for idx in range(rounds * dim)]
    sbox_total = model.addVar(name="Active S-boxes")

    model.update()

    # Minimise the number of active S-boxes ...
    model.setObjective(sbox_total, GRB.MINIMIZE)

    # ... which is the sum of the activity flags of the first `rounds` states.
    counted = quicksum(state_flags[idx] for idx in range(rounds * words))
    model.addConstr(counted - sbox_total == 0, "Count Active S-boxes")

    # Constraints of the AES round function
    model = addAESrndconstraints(model, dim, state_flags, mc_dummies,
                                 branch, rounds)

    # Exclude the all-zero characteristic
    everything = quicksum(state_flags[idx]
                          for idx in range((rounds + 1) * words))
    model.addConstr(everything >= 1, "Avoid trivial solutions")

    return model
59 | 
def printmodel(model, config):
    """
    Print the solution and the corresponding differential trail.

    model  -- solved model exposing objVal and getVarByName("x[i]").x
    config -- dict with "statedimension", "rounds" and "branchnumber"

    Active words are printed as a red "x", inactive ones as ".". Returns
    model.objVal.
    """
    state_dim = config["statedimension"]
    num_rounds = config["rounds"]

    print("Rounds:", num_rounds)
    print("State dimension:", state_dim)
    print("Branch number:", config["branchnumber"])
    print("Minimum number of active S-boxes: {}".format(model.objVal))

    print("Best differential trail:")

    # Header: the round number, padded to the width of one printed state.
    header = "".join(
        str(rnd) + " " * (2 * state_dim + 1 - len(str(rnd)))
        for rnd in range(num_rounds + 1)
    )
    print(header)

    # States are printed side by side, one text line per state row.
    words_state = state_dim * state_dim
    for row in range(state_dim):
        for rnd in range(num_rounds + 1):
            for col in range(state_dim):
                cur_index = row + col * state_dim + rnd * words_state
                value = model.getVarByName("x[{}]".format(cur_index)).x
                # Binary variables come back as floats; compare against 0.5
                # so solver round-off (e.g. 1e-9) is not shown as active.
                if value > 0.5:
                    print("\033[91mx\033[0m", end=" ")
                else:
                    print(".", end=" ")
            print(" ", end="")
        print("")
    return model.objVal
94 | 


--------------------------------------------------------------------------------
/analysis/models/haraka.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script constructs a MILP model for Haraka-like designs to
  3 | count the number of active S-boxes and determine the security
  4 | level against truncated differential attacks.
  5 | 
  6 | It uses the Gurobi Solver to solve the MILP instance, hence you need
  7 | a Gurobi license (free for Academic use).
  8 | """
  9 | 
 10 | from gurobipy import *
 11 | from models.milpconstraints import addAESrndconstraints
 12 | 
 13 | 
 14 | def buildmodel(config):
 15 |     """
 16 |     Constructs the model for the Gurobi Solver
 17 |     """
 18 | 
 19 |     model = Model("haraka")
 20 | 
 21 |     # Parameters
 22 |     rounds = config["rounds"]
 23 |     state_dim = config["statedimension"]
 24 |     branch_number = config["branchnumber"]
 25 |     aes_rounds = config["aesrounds"]
 26 |     aes_states = config["aesstates"]
 27 | 
 28 |     num_states = ((aes_rounds + 1) * rounds) + 1
 29 |     words_state = state_dim * state_dim
 30 | 
 31 |     # Initialize all variables
 32 |     var_x = [[] for _ in range(aes_states)]
 33 |     var_d = [[] for _ in range(aes_states)]
 34 |     var_mccosts = [[] for _ in range(aes_states)]
 35 |     var_mcactive = [[] for _ in range(aes_states)]
 36 | 
 37 |     for aes_state in range(aes_states):
 38 |         for word in range(num_states * words_state):
 39 |             var_x[aes_state].append( 
 40 |                 model.addVar(vtype=GRB.BINARY,
 41 |                              name="x[{}][{}]".format(aes_state, word))
 42 |                 )
 43 |         for col in range(num_states * state_dim):
 44 |             var_d[aes_state].append(
 45 |                 model.addVar(name="dummy[{}][{}]".format(aes_state, col))
 46 |                 )
 47 |             var_mccosts[aes_state].append(
 48 |                 model.addVar(name="MCCosts[{}][{}]".format(aes_state, col))
 49 |                 )
 50 |             var_mcactive[aes_state].append(
 51 |                 model.addVar(vtype=GRB.BINARY,
 52 |                              name="MCActive[{}][{}]".format(aes_state, col))
 53 |                 )
 54 |     
 55 |     activesboxes = model.addVar(name="Active S-boxes")
 56 |     costs = model.addVar()
 57 | 
 58 | 
 59 |     model.update()
 60 | 
 61 |     # Objective to minimize attack costs
 62 |     model.setObjective(costs, GRB.MINIMIZE)
 63 | 
 64 |     if config["securitymodel"] == "sbox":
 65 |         # print("Finding minimum number of active S-boxes...")
 66 |         # Count number of active S-boxes
 67 |         model = addactivesboxconstraints(model, config, var_x, activesboxes)
 68 |         model.setObjective(activesboxes, GRB.MINIMIZE)
 69 |     elif config["securitymodel"] == "truncated":
 70 |         model = addtruncatedconstraints(model, config, var_x, var_mccosts,
 71 |                                         var_mcactive, costs)
 72 | 
 73 | 
 74 |     if config["collisiononly"]:
 75 |         if aes_states == 4:
 76 |             # If we have 4 states truncated to 256-bit
 77 |             model = addcolltruncoutput512(model, config, var_x)
 78 |         else:
 79 |             model = addcollisionconstraints(model, config, var_x)
 80 | 
 81 | 
 82 |     for rnd in range(rounds):
 83 |         # Add AES round constraints
 84 |         for aes_state in range(aes_states):
 85 |             model = addAESrndconstraints(model, state_dim, 
 86 |                 var_x[aes_state][words_state * (aes_rounds + 1) * rnd:], 
 87 |                 var_d[aes_state][state_dim * (aes_rounds + 1) * rnd:], 
 88 |                 branch_number, aes_rounds)
 89 | 
 90 |         # Add MIX round constraints
 91 |         if config["mixlayer"] == "mix" and aes_states == 4:
 92 |             model = addmixconstraints512(model, config, var_x, rnd)
 93 |         elif config["mixlayer"] == "mix" and aes_states == 2:
 94 |             model = addmixconstraints256(model, config, var_x, rnd)
 95 | 
 96 | 
 97 |     # No all Zero
 98 |     model.addConstr(quicksum(var_x[aes_state][i] 
 99 |                              for aes_state in range(aes_states) 
100 |                              for i in range((aes_rounds * rounds + 1) *
101 |                                             state_dim * state_dim)) >= 1,
102 |                            "notrivialsolution")
103 | 
104 |     return model
105 | 
106 | 
def filterAESround(rounds, aes_rounds):
    """
    Filters the list for AES rounds.

    rounds     -- iterable of sub-round indices
    aes_rounds -- number of AES rounds per round

    Returns a lazy filter object that yields only the indices for which
    isAESround() is true.
    """
    # Use the aes_rounds argument; no `config` name exists in this scope.
    return filter(lambda x: isAESround(x, aes_rounds), rounds)

def isAESround(rnd, aes_rounds):
    """
    Return True if rnd is an AES round.

    Index 0 always counts as an AES round; otherwise every
    (aes_rounds + 1)-th index (those with (rnd + 1) divisible by
    aes_rounds + 1) is not an AES round.
    """
    return rnd == 0 or (((rnd + 1) % (aes_rounds + 1)) != 0)
118 | 
def printmodel(model, config):
    """
    Print the objective value of a solved model and its differential trail.

    In the "truncated" security model the individual cost variables are
    printed first. The trail is printed state by state with the AES states
    side by side; between two states the applied layer (AES or MIX) is named.
    """
    if config["securitymodel"] == "truncated":
        cost_lines = (
            ("MixColumns Costs: {}", "MixColumnsCosts"),
            ("MixColumns Costs (no dof): {}", "MixColumnsCostsNoDof"),
            ("Collision Costs: {}", "CollisionCosts"),
            ("Reducable Costs: {}", "ReducableCosts"),
            ("Degrees of Freedom: {}", "DegreesOfFreedom"),
        )
        for template, var_name in cost_lines:
            print(template.format(round(model.getVarByName(var_name).x)))

    print("Obj: {}".format(round(model.objVal)))

    print("Best differential trail:")

    dim = config["statedimension"]
    states_total = ((config["aesrounds"] + 1) * config["rounds"]) + 1

    active_mark, idle_mark = "\033[91mx\033[0m", "."

    for state_no in range(states_total):
        base = state_no * dim * dim
        for row in range(dim):
            for lane in range(config["aesstates"]):
                for col in range(dim):
                    var_name = "x[{}][{}]".format(lane, row + col * dim + base)
                    is_active = model.getVarByName(var_name).x > 0.5
                    print(active_mark if is_active else idle_mark, end=" ")
                print(" ", end="")
            print("")
        # Name the layer that follows, except after the final state.
        if state_no != states_total - 1:
            print("AES" if isAESround(state_no, config["aesrounds"]) else "MIX")
    return
155 | 
def addtruncatedconstraints(model, config, var_x, var_mccosts, var_mcactive,
                            costs):
    """
    Adds constraints for the truncated security model.

    model        -- Gurobi model to extend (returned at the end)
    config       -- dict; reads "aesrounds", "rounds", "statedimension",
                    "attackerstart", "attackerpower", "wordsize", "aesstates"
    var_x        -- per AES state, the list of word activity variables
    var_mccosts  -- per AES state, the list of per-column cost variables
    var_mcactive -- per AES state, the list of per-column "active" flags
    costs        -- variable bounded from below by the total attack costs

    Five auxiliary variables are created by name ("MixColumnsCosts",
    "MixColumnsCostsNoDof", "CollisionCosts", "ReducableCosts",
    "DegreesOfFreedom"); other functions in this module look them up with
    model.getVarByName().

    Raises ValueError if config["attackerstart"] is not an AES round
    (from the list.index() call below).
    """

    costs_mc = model.addVar(name="MixColumnsCosts")
    costs_mc_nodof = model.addVar(name="MixColumnsCostsNoDof")
    costs_collision = model.addVar(name="CollisionCosts")
    costs_reducable = model.addVar(name="ReducableCosts")
    degoffree = model.addVar(name="DegreesOfFreedom")

    model.update()

    num_states = ((config["aesrounds"] + 1) * config["rounds"]) + 1
    state_dim = config["statedimension"]

    # Define costs

    # Attacker can control input == output difference with d.o.f.
    # That is the case when the window attackerstart +/- attackerpower reaches
    # index 0 on one side and index num_states - 2 on the other; then the
    # collision costs are part of the costs that d.o.f. can reduce.
    if config["attackerstart"] - config["attackerpower"] <= 0 and \
       config["attackerstart"] + config["attackerpower"] >= (num_states - 2):
       model.addConstr(costs_reducable >= costs_mc + costs_collision - 
                       degoffree, "Attack costs after reducing d.o.f.")
       model.addConstr(costs >= costs_reducable + costs_mc_nodof,
                       "Total attack costs")
    else:
        # Otherwise the collision costs are added to the total unreduced.
        model.addConstr(costs_reducable >= costs_mc - degoffree, 
                        "Attack costs after reducing d.o.f.")
        model.addConstr(costs >= costs_reducable + costs_mc_nodof + 
                        costs_collision, "Total attack costs")

    # Count number of d.o.f.
    # Collision resistance
    # model.addConstr(degoffree <= state_dim * state_dim * config["wordsize"] * 
    #                config["aesstates"])

    # Second-preimage resistance
    # Allow only to choose differences in this setting
    # d.o.f. <= wordsize * (number of active words in the starting state,
    # summed over all AES states).
    start_indices = [config["attackerstart"] * state_dim * state_dim + 
                     x for x in range(state_dim*state_dim)]
    model.addConstr(degoffree <= quicksum(var_x[aes_state][i]
                        for aes_state in range(config["aesstates"])
                        for i in start_indices) * config["wordsize"])

    # Find rounds which are non-linear
    non_linear_rounds = [x for x in range(num_states - 1) if isAESround(x, 
                         config["aesrounds"])]

    # Position of the starting round inside non_linear_rounds; raises
    # ValueError when attackerstart is not an AES round.
    start_index = non_linear_rounds.index(config["attackerstart"])

    # Count conditions on MixColumns
    for aes_state in range(config["aesstates"]):
        # Forward direction (starting round and later): the cost of a column
        # is taken from the contiguous words of that column in the state
        # following the round.
        for fwd_rnd in non_linear_rounds[start_index:]:
            for col in range(state_dim):
                indices = []
                for row in range(state_dim):
                    indices.append((fwd_rnd + 1) * state_dim * state_dim +
                                col*state_dim + row)
                model = addMCcostsfromindices(model, config, var_x, var_mccosts,
                                              var_mcactive, aes_state, fwd_rnd, 
                                              col, indices)


        # Backward direction (rounds before the start): the words are taken
        # from the state of the round itself at offsets
        # (state_dim*col + row*(state_dim+1)) mod state_dim^2, i.e. along a
        # diagonal -- presumably the positions ShiftRows moves into one
        # column; TODO confirm against milpconstraints.
        for bck_rnd in non_linear_rounds[:start_index]:
            for col in range(state_dim):
                indices = []
                for row in range(state_dim):
                    tmp_index = ((state_dim * col + row * (state_dim + 1)) %
                                 (state_dim * state_dim))
                    indices.append(bck_rnd * state_dim * state_dim + tmp_index)
                model = addMCcostsfromindices(model, config, var_x, var_mccosts,
                                              var_mcactive, aes_state, bck_rnd,
                                              col, indices)

    # Find costs for controlled and uncontrolled rounds
    # NOTE(review): this assert cannot fail here, the index() call above has
    # already raised ValueError for an invalid attackerstart.
    assert config["attackerstart"] in non_linear_rounds
    match_index = non_linear_rounds.index(config["attackerstart"])

    # Slice bounds into non_linear_rounds around the starting round.
    dof_interval_from = max(match_index - config["attackerpower"], 0)
    dof_interval_to = min(match_index + config["attackerpower"], num_states)

    active_rounds_dof = non_linear_rounds[dof_interval_from:dof_interval_to]
    active_rounds_nodof = list(set(non_linear_rounds) - set(active_rounds_dof))

    mc_indices = []
    mc_indices_nodof = []

    # Column indices use the same layout as addMCcostsfromindices:
    # round * state_dim + column.
    for i in range(state_dim):
        for itrnd in active_rounds_dof:
            mc_indices.append(state_dim*itrnd + i)
        for itrnd in active_rounds_nodof:
            mc_indices_nodof.append(state_dim*itrnd + i)

    # Sum the per-column costs into the reducible / non-reducible totals.
    model.addConstr(quicksum(var_mccosts[j][i] for j in range(config["aesstates"]) 
        for i in mc_indices) - costs_mc == 0, "MixColumns Costs Reducable")
    model.addConstr(quicksum(var_mccosts[j][i] for j in range(config["aesstates"]) 
        for i in mc_indices_nodof) - costs_mc_nodof == 0, "MixColumns Costs")

    return model
256 | 
def addMCcostsfromindices(model, config, var_x, var_mccosts, var_mcactive, 
                          aes_state, rnd, col, indices):
    """
    Add the MixColumns activity and cost constraints for one column.

    indices selects the words of var_x[aes_state] that form the column;
    the column's flag and cost variable live at position
    rnd * statedimension + col.
    """
    dim = config["statedimension"]
    slot = rnd * dim + col
    column_words = [var_x[aes_state][idx] for idx in indices]
    active_flag = var_mcactive[aes_state][slot]
    cost_var = var_mccosts[aes_state][slot]

    # Any active word in the column forces the column flag to 1.
    model.addConstr(quicksum(column_words) <= dim * active_flag,
                    "MixColumns is active")

    # Cost of the transition: wordsize per inactive word of an active column.
    inactive_words = dim - quicksum(column_words)
    model.addConstr(inactive_words * config["wordsize"] * active_flag ==
                    cost_var,
                    "MixColumns costs")
    return model
275 | 
def addcolltruncoutput512(model, config, var_x):
    """
    Add constraints that the trail must lead to a collision after truncation.

    Only defined for four AES states. For AES states 0 and 3 the words
    2*dim .. dim*dim-1, and for AES states 1 and 2 the words 0 .. 2*dim-1,
    must have the same activity in the first and in the last state.
    """
    assert config["aesstates"] == 4

    # haraka Truncation
    states_total = ((config["aesrounds"] + 1) * config["rounds"]) + 1
    dim = config["statedimension"]
    words = dim * dim
    last_state_offset = (states_total - 1) * words

    selected_words = (
        ((0, 3), range(2 * dim, words)),
        ((1, 2), range(2 * dim)),
    )

    hashoutput = []
    for lanes, word_span in selected_words:
        for lane in lanes:
            for word in word_span:
                first = var_x[lane][word]
                hashoutput.append(first)
                model.addConstr(first == var_x[lane][word + last_state_offset],
                                "collision")

    if config["securitymodel"] == "truncated":
        # Collision costs: wordsize per selected active word.
        collision_costs = model.getVarByName("CollisionCosts")
        model.addConstr(collision_costs - quicksum(hashoutput) *
                        config["wordsize"] == 0, "inputdiff = outputdiff")

    return model
306 | 
def addcollisionconstraints(model, config, var_x):
    """
    Add constraints that the trail must lead to a collision: every word of
    the first state has the same activity as the same word of the last state.
    """
    states_total = ((config["aesrounds"] + 1) * config["rounds"]) + 1
    dim = config["statedimension"]
    words = dim * dim
    last_state_offset = (states_total - 1) * words

    for lane in range(config["aesstates"]):
        lane_vars = var_x[lane]
        for word in range(words):
            model.addConstr(lane_vars[word] ==
                            lane_vars[word + last_state_offset], "collision")

    if config["securitymodel"] == "truncated":
        # Collision costs: wordsize per active word of the first state.
        collision_costs = model.getVarByName("CollisionCosts")
        first_state_active = quicksum(var_x[lane][word]
                                      for lane in range(config["aesstates"])
                                      for word in range(words))
        model.addConstr(collision_costs -
                        first_state_active * config["wordsize"] == 0,
                        "inputdiff = outputdiff")

    return model
328 | 
def addmixconstraints512(model, config, var_x, current_round):
    """
    Add the mix layer of round current_round for exactly four AES states.

    Entry `source` at position `target` of the permutation means: column
    `source` (columns numbered across the AES states, four per state) of
    the state before the mix equals column `target` of the state after it.
    """
    assert config["aesstates"] == 4

    # Columnwise permutation
    permutation = (3, 11, 7, 15,
                   8, 0, 12, 4,
                   9, 1, 13, 5,
                   2, 10, 6, 14)

    dim = config["statedimension"]
    words = dim * dim
    # Word offset of the last AES state of this round; the mix output is
    # the state directly after it.
    before_mix = words * ((config["aesrounds"] + 1) * (current_round + 1) - 1)
    after_mix = before_mix + words

    for target, source in enumerate(permutation):
        source_lane = source // 4
        target_lane = target // 4
        source_first = before_mix + (source % dim) * dim
        target_first = after_mix + (target % dim) * dim
        for row in range(dim):
            model.addConstr(var_x[source_lane][source_first + row] ==
                            var_x[target_lane][target_first + row], "mix")

    return model
357 | 
def addmixconstraints256(model, config, var_x, current_round):
    """
    Add the column-wise mix layer for the two-state variant.

    Every column of the state following the AES rounds of
    `current_round` is constrained to equal the column selected by the
    permutation table in the state before it. Only defined for exactly
    two AES states.

    Returns the model that was passed in.
    """
    assert config["aesstates"] == 2

    # Entry k names the source column (numbered across both states)
    # that ends up in destination column k.
    source_of_column = [0, 4, 1, 5,
                        2, 6, 3, 7]

    dim = config["statedimension"]
    block = dim * dim
    aes_rounds = config["aesrounds"]
    # Word index of the state that feeds the mix layer of this round.
    base = block * (aes_rounds + current_round * aes_rounds + current_round)

    for dst_col, src_col in enumerate(source_of_column):
        src_lane, dst_lane = src_col // 4, dst_col // 4
        src_first = base + (src_col % dim) * dim
        dst_first = base + (dst_col % dim) * dim + block
        for offset in range(dim):
            model.addConstr(var_x[src_lane][src_first + offset] ==
                            var_x[dst_lane][dst_first + offset], "mix")

    return model
384 | 
def addactivesboxconstraints(model, config, var_x, activesboxes):
    """
    Tie `activesboxes` to the number of active S-boxes in the trail.

    The word variables of every state index accepted by isAESround are
    summed over all AES states and the sum is constrained to equal
    `activesboxes`. With four AES states, eight word positions per state
    in the second-to-last state index are left out of the count (the
    original code describes these as truncated S-boxes).

    Returns the model that was passed in.
    """
    words = config["statedimension"] * config["statedimension"]
    total_states = (config["aesrounds"] + 1) * config["rounds"]

    # Word indices belonging to states that are input to an AES round.
    aes_round_ids = [rnd for rnd in range(0, total_states)
                     if isAESround(rnd, config["aesrounds"])]
    counted_positions = [rnd * words + word
                         for rnd in aes_round_ids
                         for word in range(words)]

    if config["aesstates"] == 4:
        tail_offset = (total_states - 2) * words
        # (positions to drop in the tail state, AES states they apply to)
        exclusions = (
            ([0, 1, 5, 6, 10, 11, 12, 15], (0, 2)),
            ([2, 3, 4, 7, 8, 9, 13, 14], (1, 3)),
        )
        counted = []
        for dropped, lanes in exclusions:
            kept = list(counted_positions)
            for pos in dropped:
                kept.remove(pos + tail_offset)
            for pos in kept:
                for lane in lanes:
                    counted.append(var_x[lane][pos])
    else:
        counted = [var_x[lane][pos]
                   for lane in range(config["aesstates"])
                   for pos in counted_positions]

    model.addConstr(quicksum(counted) - activesboxes == 0,
                    "Count Active S-boxes")

    return model
426 | 


--------------------------------------------------------------------------------
/analysis/models/milpconstraints.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constraints for AES-like round functions.
 3 | """
 4 | 
 5 | from gurobipy import *
 6 | 
 7 | def addAESrndconstraints(gurobi_model, state_dim, var_x, var_d, branch_number,
 8 |                          rounds):
 9 |     """
10 |     Adds constraints from MixColumns/ShiftRows for given branch number.
11 |     """
12 |     state = [[x*state_dim + y for x in range(state_dim)] 
13 |              for y in range(state_dim)]
14 | 
15 |     next_index = state_dim * state_dim
16 |     dummy = 0
17 | 
18 |     var_mcintmp = []
19 |     var_mcouttmp = []
20 |     for col in range(rounds * state_dim):
21 |         var_mcintmp.append(gurobi_model.addVar(vtype=GRB.BINARY, 
22 |                                                name="isMCactiveIn[{}]".format(col)))
23 |         var_mcouttmp.append(gurobi_model.addVar(vtype=GRB.BINARY, 
24 |                                                 name="isMCactiveOut[{}]".format(col)))
25 |     gurobi_model.update()
26 | 
27 |     for rnd in range(rounds):
28 |         #Shiftrows
29 |         tmp = [0 for x in range(state_dim)]
30 |         for i in range(1, state_dim):
31 |             for j in range(state_dim):
32 |                 tmp[j] = state[i][(j + i) % state_dim]
33 |             for j in range(state_dim):
34 |                 state[i][j] = tmp[j]
35 |         #MixColumns
36 |         for j in range(state_dim):
37 |             tmp_before = []
38 |             tmp_after = []
39 |             for i in range(state_dim):
40 |                 tmp_before.append(state[i][j])
41 |             for i in range(state_dim - 1):
42 |                 tmp_after.append(next_index + i)
43 |             tmp_after.append(next_index + (state_dim - 1))
44 |             #Limit for branch number
45 |             gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before +
46 |                                             tmp_after) - (branch_number) *
47 |                                    var_d[dummy] >= 0, "MC{}{}".format(rnd, j))
48 | 
49 |             #Force both sides to be either zero or non-zero
50 |             gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before) >= 
51 |                                    var_mcintmp[rnd*state_dim + j], "MCactivein")
52 |             gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_after) >= 
53 |                                    var_mcouttmp[rnd*state_dim + j], "MCactiveout")
54 |             gurobi_model.addConstr(quicksum(var_x[i] for i in tmp_before + tmp_after) <=
55 |                                    var_mcintmp[rnd*state_dim + j] * 
56 |                                    var_mcouttmp[rnd*state_dim + j] * 2 *
57 |                                    state_dim, "MCValid{}{}".format(rnd, j))
58 | 
59 |             for i in range(state_dim):
60 |                 gurobi_model.addConstr(var_d[dummy] - var_x[state[i][j]] >= 0,
61 |                                        "MCt{}{}{}".format(rnd, j, i))
62 |             for i in range(state_dim):
63 |                 state[i][j] = next_index
64 |                 next_index += 1
65 |                 gurobi_model.addConstr(var_d[dummy] - var_x[state[i][j]] >= 0,
66 |                                        "MCt{}{}{}".format(rnd, j, i))
67 |             dummy += 1
68 |     return gurobi_model
69 | 


--------------------------------------------------------------------------------
/code/c/aesni_optimized/Makefile:
--------------------------------------------------------------------------------
 1 | C=gcc-6
 2 | RM=rm -f
 3 | CFLAGS=-g -O3 -march=native -funroll-all-loops -fomit-frame-pointer
 4 | SRCS=main.c haraka.c
 5 | OBJS=$(subst .c,.o,$(SRCS))
 6 | OUTFILE="haraka"
 7 | 
 8 | all: haraka
 9 | 
10 | haraka: $(OBJS)
11 | 	$(C) -o $(OUTFILE) $(OBJS) $(LDLIBS)
12 | 
13 | depend: .depend
14 | 
15 | .depend: $(SRCS)
16 | 	rm -f ./.depend
17 | 	$(C) $(CFLAGS) -MM $^>>./.depend;
18 | 
19 | clean:
20 | 	$(RM) $(OBJS)
21 | 
22 | dist-clean: clean
23 | 	$(RM) *~ .depend
24 | 
25 | include .depend
26 | 


--------------------------------------------------------------------------------
/code/c/aesni_optimized/haraka.c:
--------------------------------------------------------------------------------
  1 | #include "haraka.h"
  2 | #include <stdio.h>
  3 | 
  4 | void load_constants() {
  5 |   rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
  6 |   rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
  7 |   rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
  8 |   rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
  9 |   rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
 10 |   rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
 11 |   rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
 12 |   rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
 13 |   rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
 14 |   rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
 15 |   rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
 16 |   rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
 17 |   rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
 18 |   rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
 19 |   rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
 20 |   rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
 21 |   rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
 22 |   rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
 23 |   rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
 24 |   rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
 25 |   rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);
 26 |   rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d);
 27 |   rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1);
 28 |   rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d);
 29 |   rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);
 30 |   rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899);
 31 |   rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c);
 32 |   rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9);
 33 |   rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);
 34 |   rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1);
 35 |   rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9);
 36 |   rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350);
 37 |   rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39);
 38 |   rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442);
 39 |   rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6);
 40 |   rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde);
 41 |   rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);
 42 |   rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235);
 43 |   rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf);
 44 |   rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1);
 45 | }
 46 | 
 47 | void test_implementations() {
 48 |   unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
 49 |   unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
 50 |   unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
 51 |   unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
 52 |                                      0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
 53 |                                      0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
 54 |                                      0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};
 55 | 
 56 |  unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
 57 |                                     0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
 58 |                                     0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
 59 |                                     0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
 60 | 
 61 | 
 62 | 
 63 |   int i;
 64 | 
 65 |   // Input for testvector
 66 |   for(i = 0; i < 512; i++) {
 67 |     in[i] = i % 64;
 68 |   }
 69 | 
 70 |   load_constants();
 71 |   haraka512_8x(out512, in);
 72 | 
 73 |   // Verify output
 74 |   for(i = 0; i < 32; i++) {
 75 |     if (out512[i % 32] != testvector512[i]) {
 76 |       printf("Error: testvector incorrect.\n");
 77 |       return;
 78 |     }
 79 |   }
 80 | 
 81 |   free(in);
 82 |   free(out256);
 83 |   free(out512);
 84 | }
 85 | 
 86 | void haraka256(unsigned char *out, const unsigned char *in) {
 87 |   __m128i s[2], tmp;
 88 | 
 89 |   s[0] = LOAD(in);
 90 |   s[1] = LOAD(in + 16);
 91 | 
 92 |   AES2(s[0], s[1], 0);
 93 |   MIX2(s[0], s[1]);
 94 | 
 95 |   AES2(s[0], s[1], 4);
 96 |   MIX2(s[0], s[1]);
 97 | 
 98 |   AES2(s[0], s[1], 8);
 99 |   MIX2(s[0], s[1]);
100 | 
101 |   AES2(s[0], s[1], 12);
102 |   MIX2(s[0], s[1]);
103 | 
104 |   AES2(s[0], s[1], 16);
105 |   MIX2(s[0], s[1]);
106 | 
107 |   s[0] = _mm_xor_si128(s[0], LOAD(in));
108 |   s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
109 | 
110 |   STORE(out, s[0]);
111 |   STORE(out + 16, s[1]);
112 | }
113 | 
114 | void haraka256_4x(unsigned char *out, const unsigned char *in) {
115 |   __m128i s[4][2], tmp;
116 | 
117 |   s[0][0] = LOAD(in);
118 |   s[0][1] = LOAD(in + 16);
119 |   s[1][0] = LOAD(in + 32);
120 |   s[1][1] = LOAD(in + 48);
121 |   s[2][0] = LOAD(in + 64);
122 |   s[2][1] = LOAD(in + 80);
123 |   s[3][0] = LOAD(in + 96);
124 |   s[3][1] = LOAD(in + 112);
125 | 
126 |   // Round 1
127 |   AES2_4x(s[0], s[1], s[2], s[3], 0);
128 | 
129 |   MIX2(s[0][0], s[0][1]);
130 |   MIX2(s[1][0], s[1][1]);
131 |   MIX2(s[2][0], s[2][1]);
132 |   MIX2(s[3][0], s[3][1]);
133 | 
134 |   // Round 2
135 |   AES2_4x(s[0], s[1], s[2], s[3], 4);
136 | 
137 |   MIX2(s[0][0], s[0][1]);
138 |   MIX2(s[1][0], s[1][1]);
139 |   MIX2(s[2][0], s[2][1]);
140 |   MIX2(s[3][0], s[3][1]);
141 | 
142 |   // Round 3
143 |   AES2_4x(s[0], s[1], s[2], s[3], 8);
144 | 
145 |   MIX2(s[0][0], s[0][1]);
146 |   MIX2(s[1][0], s[1][1]);
147 |   MIX2(s[2][0], s[2][1]);
148 |   MIX2(s[3][0], s[3][1]);
149 | 
150 |   // Round 4
151 |   AES2_4x(s[0], s[1], s[2], s[3], 12);
152 | 
153 |   MIX2(s[0][0], s[0][1]);
154 |   MIX2(s[1][0], s[1][1]);
155 |   MIX2(s[2][0], s[2][1]);
156 |   MIX2(s[3][0], s[3][1]);
157 | 
158 |   // Round 5
159 |   AES2_4x(s[0], s[1], s[2], s[3], 16);
160 | 
161 |   MIX2(s[0][0], s[0][1]);
162 |   MIX2(s[1][0], s[1][1]);
163 |   MIX2(s[2][0], s[2][1]);
164 |   MIX2(s[3][0], s[3][1]);
165 | 
166 |   // Feed Forward
167 |   s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
168 |   s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
169 |   s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
170 |   s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
171 |   s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
172 |   s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
173 |   s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
174 |   s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
175 | 
176 |   STORE(out, s[0][0]);
177 |   STORE(out + 16, s[0][1]);
178 |   STORE(out + 32, s[1][0]);
179 |   STORE(out + 48, s[1][1]);
180 |   STORE(out + 64, s[2][0]);
181 |   STORE(out + 80, s[2][1]);
182 |   STORE(out + 96, s[3][0]);
183 |   STORE(out + 112, s[3][1]);
184 | }
185 | 
/*
 * Haraka-256 on eight independent blocks, computed as two batches of
 * four: reads 8 x 32 bytes from `in`, writes 8 x 32 bytes to `out`.
 */
void haraka256_8x(unsigned char *out, const unsigned char *in) {
  int half;

  // This is faster on Skylake, the alternative below is faster on Haswell.
  for (half = 0; half < 2; half++) {
    haraka256_4x(out + 128 * half, in + 128 * half);
  }

  /*
   * Disabled alternative: all eight blocks in a single pass using the
   * interleaved AES2_8x macro.
   *
   *   __m128i s[8][2], tmp;
   *   int i, r;
   *
   *   for (i = 0; i < 8; i++) {
   *     s[i][0] = LOAD(in + 32 * i);
   *     s[i][1] = LOAD(in + 32 * i + 16);
   *   }
   *
   *   for (r = 0; r < 5; r++) {
   *     AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4 * r);
   *     for (i = 0; i < 8; i++) {
   *       MIX2(s[i][0], s[i][1]);
   *     }
   *   }
   *
   *   // Feed Forward
   *   for (i = 0; i < 8; i++) {
   *     s[i][0] = _mm_xor_si128(s[i][0], LOAD(in + 32 * i));
   *     s[i][1] = _mm_xor_si128(s[i][1], LOAD(in + 32 * i + 16));
   *   }
   *
   *   for (i = 0; i < 8; i++) {
   *     STORE(out + 32 * i, s[i][0]);
   *     STORE(out + 32 * i + 16, s[i][1]);
   *   }
   */
}
308 | 
309 | void haraka512(unsigned char *out, const unsigned char *in) {
310 |   u128 s[4], tmp;
311 | 
312 |   s[0] = LOAD(in);
313 |   s[1] = LOAD(in + 16);
314 |   s[2] = LOAD(in + 32);
315 |   s[3] = LOAD(in + 48);
316 | 
317 |   AES4(s[0], s[1], s[2], s[3], 0);
318 |   MIX4(s[0], s[1], s[2], s[3]);
319 | 
320 |   AES4(s[0], s[1], s[2], s[3], 8);
321 |   MIX4(s[0], s[1], s[2], s[3]);
322 | 
323 |   AES4(s[0], s[1], s[2], s[3], 16);
324 |   MIX4(s[0], s[1], s[2], s[3]);
325 | 
326 |   AES4(s[0], s[1], s[2], s[3], 24);
327 |   MIX4(s[0], s[1], s[2], s[3]);
328 | 
329 |   AES4(s[0], s[1], s[2], s[3], 32);
330 |   MIX4(s[0], s[1], s[2], s[3]);
331 | 
332 |   s[0] = _mm_xor_si128(s[0], LOAD(in));
333 |   s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
334 |   s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
335 |   s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
336 | 
337 |   TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
338 | }
339 | 
340 | void haraka512_4x(unsigned char *out, const unsigned char *in) {
341 |   u128 s[4][4], tmp;
342 | 
343 |   s[0][0] = LOAD(in);
344 |   s[0][1] = LOAD(in + 16);
345 |   s[0][2] = LOAD(in + 32);
346 |   s[0][3] = LOAD(in + 48);
347 |   s[1][0] = LOAD(in + 64);
348 |   s[1][1] = LOAD(in + 80);
349 |   s[1][2] = LOAD(in + 96);
350 |   s[1][3] = LOAD(in + 112);
351 |   s[2][0] = LOAD(in + 128);
352 |   s[2][1] = LOAD(in + 144);
353 |   s[2][2] = LOAD(in + 160);
354 |   s[2][3] = LOAD(in + 176);
355 |   s[3][0] = LOAD(in + 192);
356 |   s[3][1] = LOAD(in + 208);
357 |   s[3][2] = LOAD(in + 224);
358 |   s[3][3] = LOAD(in + 240);
359 | 
360 |   AES4_4x(s[0], s[1], s[2], s[3], 0);
361 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
362 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
363 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
364 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
365 | 
366 |   AES4_4x(s[0], s[1], s[2], s[3], 8);
367 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
368 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
369 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
370 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
371 | 
372 |   AES4_4x(s[0], s[1], s[2], s[3], 16);
373 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
374 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
375 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
376 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
377 | 
378 |   AES4_4x(s[0], s[1], s[2], s[3], 24);
379 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
380 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
381 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
382 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
383 | 
384 |   AES4_4x(s[0], s[1], s[2], s[3], 32);
385 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
386 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
387 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
388 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
389 | 
390 | 
391 |   s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
392 |   s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
393 |   s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
394 |   s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
395 |   s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
396 |   s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
397 |   s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
398 |   s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
399 |   s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
400 |   s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
401 |   s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
402 |   s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
403 |   s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
404 |   s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
405 |   s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
406 |   s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));
407 | 
408 |   TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
409 |   TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
410 |   TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
411 |   TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
412 | }
413 | 
/*
 * Haraka-512 on eight independent blocks, computed as two batches of
 * four: reads 8 x 64 bytes from `in`; the second batch's results start
 * at `out + 128`.
 */
void haraka512_8x(unsigned char *out, const unsigned char *in) {
  int half;

  // This is faster on Skylake, the alternative below is faster on Haswell.
  for (half = 0; half < 2; half++) {
    haraka512_4x(out + 128 * half, in + 256 * half);
  }

  /*
   * Disabled alternative: all eight blocks in a single pass using the
   * interleaved AES4_8x macro.
   *
   *   u128 s[8][4], tmp;
   *   int i, k, r;
   *
   *   for (i = 0; i < 8; i++) {
   *     for (k = 0; k < 4; k++) {
   *       s[i][k] = LOAD(in + 64 * i + 16 * k);
   *     }
   *   }
   *
   *   for (r = 0; r < 5; r++) {
   *     AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8 * r);
   *     for (i = 0; i < 8; i++) {
   *       MIX4(s[i][0], s[i][1], s[i][2], s[i][3]);
   *     }
   *   }
   *
   *   for (i = 0; i < 8; i++) {
   *     for (k = 0; k < 4; k++) {
   *       s[i][k] = _mm_xor_si128(s[i][k], LOAD(in + 64 * i + 16 * k));
   *     }
   *   }
   *
   *   for (i = 0; i < 8; i++) {
   *     TRUNCSTORE(out + 32 * i, s[i][0], s[i][1], s[i][2], s[i][3]);
   *   }
   */
}
547 | 


--------------------------------------------------------------------------------
/code/c/aesni_optimized/haraka.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Optimized Implementations for Haraka256 and Haraka512
 3 | */
 4 | #ifndef HARAKA_H_
 5 | #define HARAKA_H_
 6 | 
 7 | #include "immintrin.h"
 8 | 
 9 | #define NUMROUNDS 5
10 | 
11 | #define u64 unsigned long
12 | #define u128 __m128i
13 | 
14 | u128 rc[40];
15 | 
/*
 * 128-bit load/store helpers. Both use the unaligned intrinsics because
 * the hash functions take plain `unsigned char *` buffers with no
 * alignment guarantee; an aligned load would fault on a buffer that is
 * not 16-byte aligned. LOAD keeps the source const-qualified.
 */
#define LOAD(src) _mm_loadu_si128((const u128 *)(src))
#define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src)
18 | 
/*
 * AES round macros. Each is wrapped in do { ... } while (0) so that an
 * invocation followed by ';' is exactly one statement and stays correct
 * inside unbraced if/for bodies. `rci` is the index of the first round
 * constant in rc[] and may be any integer expression.
 */

/* Two aesenc rounds on each of s0, s1, using rc[rci .. rci + 3]. */
#define AES2(s0, s1, rci) \
  do { \
    s0 = _mm_aesenc_si128(s0, rc[(rci)]); \
    s1 = _mm_aesenc_si128(s1, rc[(rci) + 1]); \
    s0 = _mm_aesenc_si128(s0, rc[(rci) + 2]); \
    s1 = _mm_aesenc_si128(s1, rc[(rci) + 3]); \
  } while (0)

/* AES2 applied to four two-element states. */
#define AES2_4x(s0, s1, s2, s3, rci) \
  do { \
    AES2((s0)[0], (s0)[1], rci); \
    AES2((s1)[0], (s1)[1], rci); \
    AES2((s2)[0], (s2)[1], rci); \
    AES2((s3)[0], (s3)[1], rci); \
  } while (0)

/* AES2 applied to eight two-element states. */
#define AES2_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
  do { \
    AES2_4x(s0, s1, s2, s3, rci); \
    AES2_4x(s4, s5, s6, s7, rci); \
  } while (0)

/* Two aesenc rounds on each of s0..s3, using rc[rci .. rci + 7]. */
#define AES4(s0, s1, s2, s3, rci) \
  do { \
    s0 = _mm_aesenc_si128(s0, rc[(rci)]); \
    s1 = _mm_aesenc_si128(s1, rc[(rci) + 1]); \
    s2 = _mm_aesenc_si128(s2, rc[(rci) + 2]); \
    s3 = _mm_aesenc_si128(s3, rc[(rci) + 3]); \
    s0 = _mm_aesenc_si128(s0, rc[(rci) + 4]); \
    s1 = _mm_aesenc_si128(s1, rc[(rci) + 5]); \
    s2 = _mm_aesenc_si128(s2, rc[(rci) + 6]); \
    s3 = _mm_aesenc_si128(s3, rc[(rci) + 7]); \
  } while (0)

/* AES4 applied to four four-element states. */
#define AES4_4x(s0, s1, s2, s3, rci) \
  do { \
    AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \
    AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \
    AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \
    AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci); \
  } while (0)

/* AES4 applied to eight four-element states. */
#define AES4_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
  do { \
    AES4_4x(s0, s1, s2, s3, rci); \
    AES4_4x(s4, s5, s6, s7, rci); \
  } while (0)
54 | 
55 | #define MIX2(s0, s1) \
56 |   tmp = _mm_unpacklo_epi32(s0, s1); \
57 |   s1 = _mm_unpackhi_epi32(s0, s1); \
58 |   s0 = tmp;
59 | 
60 | #define MIX4(s0, s1, s2, s3) \
61 |   tmp  = _mm_unpacklo_epi32(s0, s1); \
62 |   s0 = _mm_unpackhi_epi32(s0, s1); \
63 |   s1 = _mm_unpacklo_epi32(s2, s3); \
64 |   s2 = _mm_unpackhi_epi32(s2, s3); \
65 |   s3 = _mm_unpacklo_epi32(s0, s2); \
66 |   s0 = _mm_unpackhi_epi32(s0, s2); \
67 |   s2 = _mm_unpackhi_epi32(s1, tmp); \
68 |   s1 = _mm_unpacklo_epi32(s1, tmp);
69 | 
70 | #define TRUNCSTORE(out, s0, s1, s2, s3) \
71 |   *(u64*)(out) = (u64*)(s0)[1]; \
72 |   *(u64*)(out + 8) = (u64*)(s1)[1]; \
73 |   *(u64*)(out + 16) = (u64*)(s2)[0]; \
74 |   *(u64*)(out + 24) = (u64*)(s3)[0];
75 | 
76 | void load_constants();
77 | void test_implementations();
78 | 
79 | void haraka256(unsigned char *out, const unsigned char *in);
80 | void haraka256_4x(unsigned char *out, const unsigned char *in);
81 | void haraka256_8x(unsigned char *out, const unsigned char *in);
82 | 
83 | void haraka512(unsigned char *out, const unsigned char *in);
84 | void haraka512_4x(unsigned char *out, const unsigned char *in);
85 | void haraka512_8x(unsigned char *out, const unsigned char *in);
86 | 
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/code/c/aesni_optimized/main.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | Timing code for optimized implementation of Haraka.
 3 | */
 4 | 
 5 | #include "stdio.h"
 6 | #include "stdlib.h"
 7 | #include <string.h>
 8 | #include "haraka.h"
 9 | #include "timing.h"
10 | 
11 | typedef void (*hash_function)(unsigned char*, const unsigned char*);
12 | 
13 | // Measures how many cycles func requires to process a random input.
14 | double timeit(hash_function func, int inlen, int outlen) {
15 |   unsigned char *in, *out;
16 |   unsigned long long timer = 0;
17 |   double timings[NUM_TIMINGS];
18 | 
19 |   int i, j;
20 | 
21 |   srand(0);
22 |   in = malloc(inlen);
23 |   out = malloc(outlen);
24 | 
25 |   load_constants();
26 | 
27 |   for (i = -100; i < NUM_TIMINGS; i++) {
28 |     //Get random input
29 |     for (j = 0; j < inlen; j++) {
30 |       in[j] = rand() & 0xff;
31 |     }
32 | 
33 |     timer = startTimer();
34 |     for(j = 0; j < ITERATIONS; j++) {
35 |         func(out, in);
36 |     }
37 |     timer = endTimer() - timer;
38 | 
39 |     if (i >= 0 && i < NUM_TIMINGS) {
40 |       timings[i] = ((double)timer) / inlen / ITERATIONS;
41 |     }
42 |   }
43 | 
44 |   //Get Median
45 |   qsort(timings, NUM_TIMINGS, sizeof(double), compareDouble);
46 | 
47 |   free(out);
48 |   free(in);
49 |   return timings[NUM_TIMINGS / 2];
50 | }
51 | 
52 | int main() {
53 |   test_implementations();
54 |   printf("Haraka-256 1x: %f cycles per byte\n", timeit(haraka256, 32, 32));
55 |   printf("Haraka-256 4x: %f cycles per byte\n", timeit(haraka256_4x, 4*32, 4*32));
56 |   printf("Haraka-256 8x: %f cycles per byte\n", timeit(haraka256_8x, 8*32, 8*32));
57 | 
58 |   printf("Haraka-512 1x: %f cycles per byte\n", timeit(haraka512, 64, 32));
59 |   printf("Haraka-512 4x: %f cycles per byte\n", timeit(haraka512_4x, 4*64, 4*32));
60 |   printf("Haraka-512 8x: %f cycles per byte\n", timeit(haraka512_8x, 8*64, 8*32));
61 | }
62 | 


--------------------------------------------------------------------------------
/code/c/aesni_optimized/timing.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMING_H
 2 | #define TIMING_H
 3 | 
 4 | #define NUM_TIMINGS 10000
 5 | #define ITERATIONS 1000
 6 | 
 7 | int compareDouble(const void *x, const void *y)
 8 | {
 9 |   double xx = *(double*)x, yy = *(double*)y;
10 |   if (xx < yy) return -1;
11 |   if (xx > yy) return  1;
12 |   return 0;
13 | }
14 | 
/* Reads the time-stamp counter at the start of a measurement.
 * Executes CPUID followed by RDTSC and returns EDX:EAX as one 64-bit value.
 * x86-64 only (the clobber list names 64-bit registers).
 * NOTE(review): CPUID is presumably there to serialize execution before the
 * counter is read -- confirm against the Intel benchmarking guidance. */
unsigned long long int startTimer(void)
{
   unsigned lo, hi;

   __asm__ volatile("CPUID\n\t"
                    "RDTSC\n\t"
                    "mov %%edx, %0\n\t"
                    "mov %%eax, %1\n\t"
                    : "=r" (hi), "=r" (lo)
                    :
                    : "%rax", "%rbx", "%rcx", "%rdx");

   return (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
}
27 | 
/* Reads the time-stamp counter at the end of a measurement.
 * Executes RDTSCP, copies EDX:EAX out, then executes CPUID, and returns the
 * counter as one 64-bit value. x86-64 only (the clobber list names 64-bit
 * registers).
 * NOTE(review): the trailing CPUID presumably keeps later instructions from
 * moving ahead of the read -- confirm against the Intel benchmarking
 * guidance. */
unsigned long long int endTimer(void)
{
   unsigned lo, hi;

   __asm__ volatile("RDTSCP\n\t"
                    "mov %%edx, %0\n\t"
                    "mov %%eax,%1\n\t"
                    "CPUID\n\t"
                    : "=r" (hi), "=r" (lo)
                    :
                    : "%rax", "%rbx", "%rcx", "%rdx");

   return (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
}
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/code/c/aesni_ref/Makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS=-std=c99 -O3 -Wno-format -march=native -funroll-loops -fomit-frame-pointer
 2 | 
 3 | ifdef MPAR
 4 | 	CFLAGS += -DMPAR=$(MPAR)
 5 | endif
 6 | 	
 7 | all:
 8 | 	$(CC) $(CFLAGS) helpers.c haraka.c -o haraka
 9 | 
10 | clean:
11 | 	rm haraka


--------------------------------------------------------------------------------
/code/c/aesni_ref/haraka.c:
--------------------------------------------------------------------------------
#include "wmmintrin.h"
#include "emmintrin.h"
#include "smmintrin.h"
#include "helpers.h"
#include <stdio.h>
#include <stdlib.h>
  6 | 
  7 | #define ROUNDS (5)
  8 | #define AES_PER_ROUND (2)
  9 | 
 10 | int haraka512256(unsigned char *hash, const unsigned char *msg) {
 11 |     // stuff we need
 12 |     int i, j;
 13 |     __m128i s[4], tmp, rc[40];
 14 |     __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0);
 15 | 
 16 | 
 17 |     // define round constants
 18 |     rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
 19 |     rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
 20 |     rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
 21 |     rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
 22 |     rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
 23 |     rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
 24 |     rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
 25 |     rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
 26 |     rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
 27 |     rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
 28 |     rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
 29 |     rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
 30 |     rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
 31 |     rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
 32 |     rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
 33 |     rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
 34 |     rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
 35 |     rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
 36 |     rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
 37 |     rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
 38 |     rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);
 39 |     rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d);
 40 |     rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1);
 41 |     rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d);
 42 |     rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);
 43 |     rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899);
 44 |     rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c);
 45 |     rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9);
 46 |     rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);
 47 |     rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1);
 48 |     rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9);
 49 |     rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350);
 50 |     rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39);
 51 |     rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442);
 52 |     rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6);
 53 |     rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde);
 54 |     rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);
 55 |     rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235);
 56 |     rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf);
 57 |     rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1);
 58 | 
 59 |     // initialize state to msg
 60 |     s[0] = _mm_load_si128(&((__m128i*)msg)[0]);
 61 |     s[1] = _mm_load_si128(&((__m128i*)msg)[1]);
 62 |     s[2] = _mm_load_si128(&((__m128i*)msg)[2]);
 63 |     s[3] = _mm_load_si128(&((__m128i*)msg)[3]);
 64 | 
 65 |     printf("= input state =\n");
 66 |     printstate512(s);
 67 | 
 68 |     for (i = 0; i < ROUNDS; ++i) {
 69 |         // aes round(s)
 70 |         for (j = 0; j < AES_PER_ROUND; ++j) {
 71 |             s[0] = _mm_aesenc_si128(s[0], rc[4*AES_PER_ROUND*i + 4*j]);
 72 |             s[1] = _mm_aesenc_si128(s[1], rc[4*AES_PER_ROUND*i + 4*j + 1]);
 73 |             s[2] = _mm_aesenc_si128(s[2], rc[4*AES_PER_ROUND*i + 4*j + 2]);
 74 |             s[3] = _mm_aesenc_si128(s[3], rc[4*AES_PER_ROUND*i + 4*j + 3]);
 75 |         }
 76 | 
 77 |         printf("= round %d : after aes layer =\n", i);
 78 |         printstate512(s);
 79 | 
 80 |         // mixing
 81 |         tmp  = _mm_unpacklo_epi32(s[0], s[1]);
 82 |         s[0] = _mm_unpackhi_epi32(s[0], s[1]);
 83 |         s[1] = _mm_unpacklo_epi32(s[2], s[3]);
 84 |         s[2] = _mm_unpackhi_epi32(s[2], s[3]);
 85 |         s[3] = _mm_unpacklo_epi32(s[0], s[2]);
 86 |         s[0] = _mm_unpackhi_epi32(s[0], s[2]);
 87 |         s[2] = _mm_unpackhi_epi32(s[1],  tmp);
 88 |         s[1] = _mm_unpacklo_epi32(s[1],  tmp);
 89 | 
 90 |         printf("= round %d : after mix layer =\n", i);
 91 |         printstate512(s);
 92 |     }
 93 | 
 94 |     printf("= output from permutation =\n");
 95 |     printstate512(s);
 96 | 
 97 |     // xor message to get DM effect
 98 |     s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0]));
 99 |     s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1]));
100 |     s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2]));
101 |     s[3] = _mm_xor_si128(s[3], _mm_load_si128(&((__m128i*)msg)[3]));
102 | 
103 |     printf("= after feed-forward =\n");
104 |     printstate512(s);
105 | 
106 |     // truncate and store result
107 |     _mm_maskmoveu_si128(s[0], MSB64, (hash-8));
108 |     _mm_maskmoveu_si128(s[1], MSB64, (hash+0));
109 |     _mm_storel_epi64((__m128i*)(hash + 16), s[2]);
110 |     _mm_storel_epi64((__m128i*)(hash + 24), s[3]);
111 | }
112 | 
113 | int haraka256256(unsigned char *hash, const unsigned char *msg) {
114 |     // stuff we need
115 |     int i, j;
116 |     __m128i s[2], tmp, rc[20];
117 | 
118 |     // define round constants
119 |     rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
120 |     rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
121 |     rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
122 |     rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
123 |     rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
124 |     rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
125 |     rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
126 |     rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
127 |     rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
128 |     rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
129 |     rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
130 |     rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
131 |     rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
132 |     rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
133 |     rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
134 |     rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
135 |     rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
136 |     rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
137 |     rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
138 |     rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
139 | 
140 |     // initialize state to msg
141 |     s[0] = _mm_load_si128(&((__m128i*)msg)[0]);
142 |     s[1] = _mm_load_si128(&((__m128i*)msg)[1]);
143 | 
144 |     printf("= input state =\n");
145 |     printstate256(s);
146 | 
147 |     for (i = 0; i < ROUNDS; ++i) {
148 |         // aes round(s)
149 |         for (j = 0; j < AES_PER_ROUND; ++j) {
150 |             s[0] = _mm_aesenc_si128(s[0], rc[2*AES_PER_ROUND*i + 2*j]);
151 |             s[1] = _mm_aesenc_si128(s[1], rc[2*AES_PER_ROUND*i + 2*j + 1]);
152 |         }
153 | 
154 |         printf("= round %d : after aes layer =\n", i);
155 |         printstate256(s);
156 | 
157 |         // mixing
158 |         tmp = _mm_unpacklo_epi32(s[0], s[1]);
159 |         s[1] = _mm_unpackhi_epi32(s[0], s[1]);
160 |         s[0] = tmp;
161 | 
162 |         printf("= round %d : after mix layer =\n", i);
163 |         printstate256(s);
164 |     }
165 | 
166 |     printf("= output from permutation =\n");
167 |     printstate256(s);
168 | 
169 |     // xor message to get DM effect
170 |     s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0]));
171 |     s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1]));
172 | 
173 |     printf("= after feed-forward =\n");
174 |     printstate256(s);
175 | 
176 |     // store result
177 |     _mm_storeu_si128((__m128i*)hash, s[0]);
178 |     _mm_storeu_si128((__m128i*)(hash + 16), s[1]);
179 | }
180 | 
// Demo driver: hashes the byte sequence 0,1,...,63 with both reference
// functions and prints the input and each digest.
// Returns 0 on success, 1 if the buffers cannot be allocated.
int main() {
    // allocate memory for input and digest
    unsigned char *msg = (unsigned char *)calloc(64, sizeof(unsigned char));
    unsigned char *digest = (unsigned char *)calloc(32, sizeof(unsigned char));
    int i;

    if (msg == NULL || digest == NULL) {
        fprintf(stderr, "out of memory\n");
        free(msg);
        free(digest);
        return 1;
    }

    // set some input bytes
    for (i = 0; i < 64; ++i)
        msg[i] = (unsigned char)i;

    // print input
    printf("= input bytes =\n");
    printbytes(msg, 64); printf("\n");

    // run Haraka-512/256
    haraka512256(digest, msg);

    // print output
    printf("= haraka-512/256 output bytes =\n");
    printbytes(digest, 32); printf("\n");

    // run Haraka-256/256 (reads only the first 32 bytes of msg)
    haraka256256(digest, msg);

    // print output
    printf("= haraka-256/256 output bytes =\n");
    printbytes(digest, 32); printf("\n");

    // release the buffers (previously never freed)
    free(digest);
    free(msg);

    return 0;
}
211 | 


--------------------------------------------------------------------------------
/code/c/aesni_ref/helpers.c:
--------------------------------------------------------------------------------
 1 | #include "smmintrin.h"
 2 | #include "helpers.h"
 3 | #include <stdio.h>
 4 | 
 5 | /////////////
 6 | // HELPERS //
 7 | /////////////
 8 | void print_block(__m128i var) {
 9 |     uint8_t *val = (uint8_t*) &var;
10 |     //~ printf("%.16llx%.16llx\n", v64val[1], v64val[0]);
11 |     printf("%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
12 | 			// val[15], val[14], val[13], val[12], val[11], val[10], val[9], val[8], val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]);
13 |     		val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7], val[8], val[9], val[10], val[11], val[12], val[13], val[14], val[15]);
14 | }
15 | 
// Prints len bytes of m as two-digit hex values separated by single spaces,
// followed by a newline. Prints nothing when m is NULL or len <= 0
// (previously len <= 0 read m[-1]).
void printbytes(unsigned char *m, int len) {
    int i;

    if (m == NULL || len <= 0)
        return;

    for (i = 0; i < len-1; ++i)
        printf("%02x ", m[i]);
    // the last byte carries the newline instead of a trailing space
    printf("%02x\n", m[len-1]);
}
22 | 
23 | void printstate512(__m128i* s) {
24 | 	uint8_t *A = (uint8_t*)(&s[0]);
25 | 	uint8_t *B = (uint8_t*)(&s[1]);
26 | 	uint8_t *C = (uint8_t*)(&s[2]);
27 | 	uint8_t *D = (uint8_t*)(&s[3]);
28 | 
29 | 	int i;
30 | 	for (i = 0; i < 4; ++i)
31 | 		printf("%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
32 | 			A[i], A[i+4], A[i+8], A[i+12],
33 | 			B[i], B[i+4], B[i+8], B[i+12],
34 | 			C[i], C[i+4], C[i+8], C[i+12],
35 | 			D[i], D[i+4], D[i+8], D[i+12]);
36 | 	printf("\n");
37 | }
38 | 
39 | void printstate256(__m128i* s) {
40 | 	uint8_t *A = (uint8_t*)(&s[0]);
41 | 	uint8_t *B = (uint8_t*)(&s[1]);
42 | 
43 | 	int i;
44 | 	for (i = 0; i < 4; ++i)
45 | 		printf("%02x %02x %02x %02x %02x %02x %02x %02x\n",
46 | 			A[i], A[i+4], A[i+8], A[i+12],
47 | 			B[i], B[i+4], B[i+8], B[i+12]);
48 | 	printf("\n");
49 | }
50 | 


--------------------------------------------------------------------------------
/code/c/aesni_ref/helpers.h:
--------------------------------------------------------------------------------
 1 | #ifndef HELPERS_H
 2 | #define HELPERS_H
 3 | #include <stdint.h>
 4 | 
 5 | /////////////
 6 | // HELPERS //
 7 | /////////////
 8 | void print_block(__m128i);
 9 | void printbytes(unsigned char *, int);
10 | void printstate512(__m128i* s);
11 | void printstate256(__m128i* s);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/code/c/neon/haraka.c:
--------------------------------------------------------------------------------
  1 | #include "haraka.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | 
  5 | void haraka_testvectors() {
  6 |   unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
  7 |   unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  8 |   unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  9 |   unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
 10 |                                      0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
 11 |                                      0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
 12 |                                      0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};
 13 | 
 14 |   unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
 15 |                                      0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
 16 |                                      0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
 17 |                                      0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
 18 | 
 19 |   int i;
 20 | 
 21 |   // Input for testvector
 22 |   for(i = 0; i < 4*8; i++) {
 23 |     in[i] = i % 32;
 24 |   }
 25 | 
 26 |   haraka_f_8x(out256, in);
 27 | 
 28 |   // Verify output
 29 |   for(i = 0; i < 8*32; i++) {
 30 |     if (out256[i % 32] != testvector256[i % 32]) {
 31 |       printf("Error: testvector incorrect for haraka_f at position %i.\n", i);
 32 |       return;
 33 |     }
 34 |   }
 35 | 
 36 |   // Input for testvector
 37 |   for(i = 0; i < 8*64; i++) {
 38 |     in[i] = i % 64;
 39 |   }
 40 | 
 41 |   haraka_h_8x(out512, in);
 42 | 
 43 |   // Verify output
 44 |   for(i = 0; i < 32; i++) {
 45 |     if (out512[i % 32] != testvector512[i % 32]) {
 46 |       printf("Error: testvector incorrect for haraka_h at position %i.\n", i);
 47 |       return;
 48 |     }
 49 |   }
 50 | 
 51 |   free(in);
 52 |   free(out256);
 53 |   free(out512);
 54 | }
 55 | 
 56 | void haraka_f(unsigned char *out, const unsigned char *in) {
 57 |   u128 s[2], tmp, s_save[2];
 58 | 
 59 |   s[0] = LOAD(in);
 60 |   s[1] = LOAD(in + 16);
 61 | 
 62 |   s_save[0] = s[0];
 63 |   s_save[1] = s[1];
 64 | 
 65 |   AES2(s[0], s[1], 0);
 66 |   MIX2(s[0], s[1]);
 67 | 
 68 |   AES2(s[0], s[1], 4);
 69 |   MIX2(s[0], s[1]);
 70 | 
 71 |   AES2(s[0], s[1], 8);
 72 |   MIX2(s[0], s[1]);
 73 | 
 74 |   AES2(s[0], s[1], 12);
 75 |   MIX2(s[0], s[1]);
 76 | 
 77 |   AES2(s[0], s[1], 16);
 78 |   s[0] = XOR(s[0], rc256[20]);
 79 |   s[1] = XOR(s[1], rc256[21]);
 80 |   MIX2(s[0], s[1]);
 81 | 
 82 |   s[0] = XOR(s[0], s_save[0]);
 83 |   s[1] = XOR(s[1], s_save[1]);
 84 | 
 85 |   STORE(out, s[0]);
 86 |   STORE(out + 16, s[1]);
 87 | }
 88 | 
 89 | void haraka_f_4x(unsigned char *out, const unsigned char *in) {
 90 |   u128 s[4][2], tmp, s_save[4][2];
 91 | 
 92 |   s[0][0] = LOAD(in);
 93 |   s[0][1] = LOAD(in + 16);
 94 |   s[1][0] = LOAD(in + 32);
 95 |   s[1][1] = LOAD(in + 48);
 96 |   s[2][0] = LOAD(in + 64);
 97 |   s[2][1] = LOAD(in + 80);
 98 |   s[3][0] = LOAD(in + 96);
 99 |   s[3][1] = LOAD(in + 112);
100 | 
101 |   s_save[0][0] = s[0][0];
102 |   s_save[0][1] = s[0][1];
103 |   s_save[1][0] = s[1][0];
104 |   s_save[1][1] = s[1][1];
105 |   s_save[2][0] = s[2][0];
106 |   s_save[2][1] = s[2][1];
107 |   s_save[3][0] = s[3][0];
108 |   s_save[3][1] = s[3][1];
109 | 
110 |   // Round 1
111 |   AES2_4x(s[0], s[1], s[2], s[3], 0);
112 | 
113 |   MIX2(s[0][0], s[0][1]);
114 |   MIX2(s[1][0], s[1][1]);
115 |   MIX2(s[2][0], s[2][1]);
116 |   MIX2(s[3][0], s[3][1]);
117 | 
118 |   // Round 2
119 |   AES2_4x(s[0], s[1], s[2], s[3], 4);
120 | 
121 |   MIX2(s[0][0], s[0][1]);
122 |   MIX2(s[1][0], s[1][1]);
123 |   MIX2(s[2][0], s[2][1]);
124 |   MIX2(s[3][0], s[3][1]);
125 | 
126 |   // Round 3
127 |   AES2_4x(s[0], s[1], s[2], s[3], 8);
128 | 
129 |   MIX2(s[0][0], s[0][1]);
130 |   MIX2(s[1][0], s[1][1]);
131 |   MIX2(s[2][0], s[2][1]);
132 |   MIX2(s[3][0], s[3][1]);
133 | 
134 |   // Round 4
135 |   AES2_4x(s[0], s[1], s[2], s[3], 12);
136 | 
137 |   MIX2(s[0][0], s[0][1]);
138 |   MIX2(s[1][0], s[1][1]);
139 |   MIX2(s[2][0], s[2][1]);
140 |   MIX2(s[3][0], s[3][1]);
141 | 
142 |   // Round 5
143 |   AES2_4x(s[0], s[1], s[2], s[3], 16);
144 |   s[0][0] = XOR(s[0][0], rc256[20]);
145 |   s[0][1] = XOR(s[0][1], rc256[21]);
146 |   s[1][0] = XOR(s[1][0], rc256[20]);
147 |   s[1][1] = XOR(s[1][1], rc256[21]);
148 |   s[2][0] = XOR(s[2][0], rc256[20]);
149 |   s[2][1] = XOR(s[2][1], rc256[21]);
150 |   s[3][0] = XOR(s[3][0], rc256[20]);
151 |   s[3][1] = XOR(s[3][1], rc256[21]);
152 | 
153 |   MIX2(s[0][0], s[0][1]);
154 |   MIX2(s[1][0], s[1][1]);
155 |   MIX2(s[2][0], s[2][1]);
156 |   MIX2(s[3][0], s[3][1]);
157 | 
158 |   // Feed Forward
159 |   s[0][0] = XOR(s[0][0], s_save[0][0]);
160 |   s[0][1] = XOR(s[0][1], s_save[0][1]);
161 |   s[1][0] = XOR(s[1][0], s_save[1][0]);
162 |   s[1][1] = XOR(s[1][1], s_save[1][1]);
163 |   s[2][0] = XOR(s[2][0], s_save[2][0]);
164 |   s[2][1] = XOR(s[2][1], s_save[2][1]);
165 |   s[3][0] = XOR(s[3][0], s_save[3][0]);
166 |   s[3][1] = XOR(s[3][1], s_save[3][1]);
167 | 
168 |   STORE(out, s[0][0]);
169 |   STORE(out + 16, s[0][1]);
170 |   STORE(out + 32, s[1][0]);
171 |   STORE(out + 48, s[1][1]);
172 |   STORE(out + 64, s[2][0]);
173 |   STORE(out + 80, s[2][1]);
174 |   STORE(out + 96, s[3][0]);
175 |   STORE(out + 112, s[3][1]);
176 | }
177 | 
178 | void haraka_f_8x(unsigned char *out, const unsigned char *in) {
179 |   u128 s[8][2], tmp, s_save[8][2];
180 |   s[0][0] = LOAD(in + 0);
181 |   s[0][1] = LOAD(in + 16);
182 |   s[1][0] = LOAD(in + 32);
183 |   s[1][1] = LOAD(in + 48);
184 |   s[2][0] = LOAD(in + 64);
185 |   s[2][1] = LOAD(in + 80);
186 |   s[3][0] = LOAD(in + 96);
187 |   s[3][1] = LOAD(in + 112);
188 |   s[4][0] = LOAD(in + 128);
189 |   s[4][1] = LOAD(in + 144);
190 |   s[5][0] = LOAD(in + 160);
191 |   s[5][1] = LOAD(in + 176);
192 |   s[6][0] = LOAD(in + 192);
193 |   s[6][1] = LOAD(in + 208);
194 |   s[7][0] = LOAD(in + 224);
195 |   s[7][1] = LOAD(in + 240);
196 | 
197 |   s_save[0][0] = s[0][0];
198 |   s_save[0][1] = s[0][1];
199 |   s_save[1][0] = s[1][0];
200 |   s_save[1][1] = s[1][1];
201 |   s_save[2][0] = s[2][0];
202 |   s_save[2][1] = s[2][1];
203 |   s_save[3][0] = s[3][0];
204 |   s_save[3][1] = s[3][1];
205 |   s_save[4][0] = s[4][0];
206 |   s_save[4][1] = s[4][1];
207 |   s_save[5][0] = s[5][0];
208 |   s_save[5][1] = s[5][1];
209 |   s_save[6][0] = s[6][0];
210 |   s_save[6][1] = s[6][1];
211 |   s_save[7][0] = s[7][0];
212 |   s_save[7][1] = s[7][1];
213 | 
214 |   AES2_4x(s[0], s[1], s[2], s[3], 0);
215 |   AES2_4x(s[4], s[5], s[6], s[7], 0);
216 |   MIX2(s[0][0], s[0][1]);
217 |   MIX2(s[1][0], s[1][1]);
218 |   MIX2(s[2][0], s[2][1]);
219 |   MIX2(s[3][0], s[3][1]);
220 |   MIX2(s[4][0], s[4][1]);
221 |   MIX2(s[5][0], s[5][1]);
222 |   MIX2(s[6][0], s[6][1]);
223 |   MIX2(s[7][0], s[7][1]);
224 |   AES2_4x(s[0], s[1], s[2], s[3], 4);
225 |   AES2_4x(s[4], s[5], s[6], s[7], 4);
226 |   MIX2(s[0][0], s[0][1]);
227 |   MIX2(s[1][0], s[1][1]);
228 |   MIX2(s[2][0], s[2][1]);
229 |   MIX2(s[3][0], s[3][1]);
230 |   MIX2(s[4][0], s[4][1]);
231 |   MIX2(s[5][0], s[5][1]);
232 |   MIX2(s[6][0], s[6][1]);
233 |   MIX2(s[7][0], s[7][1]);
234 |   AES2_4x(s[0], s[1], s[2], s[3], 8);
235 |   AES2_4x(s[4], s[5], s[6], s[7], 8);
236 |   MIX2(s[0][0], s[0][1]);
237 |   MIX2(s[1][0], s[1][1]);
238 |   MIX2(s[2][0], s[2][1]);
239 |   MIX2(s[3][0], s[3][1]);
240 |   MIX2(s[4][0], s[4][1]);
241 |   MIX2(s[5][0], s[5][1]);
242 |   MIX2(s[6][0], s[6][1]);
243 |   MIX2(s[7][0], s[7][1]);
244 |   AES2_4x(s[0], s[1], s[2], s[3], 12);
245 |   AES2_4x(s[4], s[5], s[6], s[7], 12);
246 |   MIX2(s[0][0], s[0][1]);
247 |   MIX2(s[1][0], s[1][1]);
248 |   MIX2(s[2][0], s[2][1]);
249 |   MIX2(s[3][0], s[3][1]);
250 |   MIX2(s[4][0], s[4][1]);
251 |   MIX2(s[5][0], s[5][1]);
252 |   MIX2(s[6][0], s[6][1]);
253 |   MIX2(s[7][0], s[7][1]);
254 | 
255 |   AES2_4x(s[0], s[1], s[2], s[3], 16);
256 |   AES2_4x(s[4], s[5], s[6], s[7], 16);
257 |   s[0][0] = XOR(s[0][0], rc256[20]);
258 |   s[0][1] = XOR(s[0][1], rc256[21]);
259 |   s[1][0] = XOR(s[1][0], rc256[20]);
260 |   s[1][1] = XOR(s[1][1], rc256[21]);
261 |   s[2][0] = XOR(s[2][0], rc256[20]);
262 |   s[2][1] = XOR(s[2][1], rc256[21]);
263 |   s[3][0] = XOR(s[3][0], rc256[20]);
264 |   s[3][1] = XOR(s[3][1], rc256[21]);
265 |   s[4][0] = XOR(s[4][0], rc256[20]);
266 |   s[4][1] = XOR(s[4][1], rc256[21]);
267 |   s[5][0] = XOR(s[5][0], rc256[20]);
268 |   s[5][1] = XOR(s[5][1], rc256[21]);
269 |   s[6][0] = XOR(s[6][0], rc256[20]);
270 |   s[6][1] = XOR(s[6][1], rc256[21]);
271 |   s[7][0] = XOR(s[7][0], rc256[20]);
272 |   s[7][1] = XOR(s[7][1], rc256[21]);
273 | 
274 |   MIX2(s[0][0], s[0][1]);
275 |   MIX2(s[1][0], s[1][1]);
276 |   MIX2(s[2][0], s[2][1]);
277 |   MIX2(s[3][0], s[3][1]);
278 |   MIX2(s[4][0], s[4][1]);
279 |   MIX2(s[5][0], s[5][1]);
280 |   MIX2(s[6][0], s[6][1]);
281 |   MIX2(s[7][0], s[7][1]);
282 |   s[0][0] = XOR(s[0][0], s_save[0][0]);
283 |   s[0][1] = XOR(s[0][1], s_save[0][1]);
284 |   s[1][0] = XOR(s[1][0], s_save[1][0]);
285 |   s[1][1] = XOR(s[1][1], s_save[1][1]);
286 |   s[2][0] = XOR(s[2][0], s_save[2][0]);
287 |   s[2][1] = XOR(s[2][1], s_save[2][1]);
288 |   s[3][0] = XOR(s[3][0], s_save[3][0]);
289 |   s[3][1] = XOR(s[3][1], s_save[3][1]);
290 |   s[4][0] = XOR(s[4][0], s_save[4][0]);
291 |   s[4][1] = XOR(s[4][1], s_save[4][1]);
292 |   s[5][0] = XOR(s[5][0], s_save[5][0]);
293 |   s[5][1] = XOR(s[5][1], s_save[5][1]);
294 |   s[6][0] = XOR(s[6][0], s_save[6][0]);
295 |   s[6][1] = XOR(s[6][1], s_save[6][1]);
296 |   s[7][0] = XOR(s[7][0], s_save[7][0]);
297 |   s[7][1] = XOR(s[7][1], s_save[7][1]);
298 | 
299 |   STORE(out + 0, s[0][0]);
300 |   STORE(out + 16, s[0][1]);
301 |   STORE(out + 32, s[1][0]);
302 |   STORE(out + 48, s[1][1]);
303 |   STORE(out + 64, s[2][0]);
304 |   STORE(out + 80, s[2][1]);
305 |   STORE(out + 96, s[3][0]);
306 |   STORE(out + 112, s[3][1]);
307 |   STORE(out + 128, s[4][0]);
308 |   STORE(out + 144, s[4][1]);
309 |   STORE(out + 160, s[5][0]);
310 |   STORE(out + 176, s[5][1]);
311 |   STORE(out + 192, s[6][0]);
312 |   STORE(out + 208, s[6][1]);
313 |   STORE(out + 224, s[7][0]);
314 |   STORE(out + 240, s[7][1]);
315 | }
316 | 
317 | void haraka_h(unsigned char *out, const unsigned char *in) {
318 |   u128 s[4], tmp, s_save[4];
319 | 
320 |   s[0] = LOAD(in);
321 |   s[1] = LOAD(in + 16);
322 |   s[2] = LOAD(in + 32);
323 |   s[3] = LOAD(in + 48);
324 | 
325 |   s_save[0] = s[0];
326 |   s_save[1] = s[1];
327 |   s_save[2] = s[2];
328 |   s_save[3] = s[3];
329 | 
330 |   AES4(s[0], s[1], s[2], s[3], 0);
331 |   MIX4(s[0], s[1], s[2], s[3]);
332 | 
333 |   AES4(s[0], s[1], s[2], s[3], 8);
334 |   MIX4(s[0], s[1], s[2], s[3]);
335 | 
336 |   AES4(s[0], s[1], s[2], s[3], 16);
337 |   MIX4(s[0], s[1], s[2], s[3]);
338 | 
339 |   AES4(s[0], s[1], s[2], s[3], 24);
340 |   MIX4(s[0], s[1], s[2], s[3]);
341 | 
342 |   AES4(s[0], s[1], s[2], s[3], 32);
343 |   s[0] = XOR(s[0], rc512[40]);
344 |   s[1] = XOR(s[1], rc512[41]);
345 |   s[2] = XOR(s[2], rc512[42]);
346 |   s[3] = XOR(s[3], rc512[43]);
347 |   MIX4(s[0], s[1], s[2], s[3]);
348 | 
349 |   s[0] = XOR(s[0], s_save[0]);
350 |   s[1] = XOR(s[1], s_save[1]);
351 |   s[2] = XOR(s[2], s_save[2]);
352 |   s[3] = XOR(s[3], s_save[3]);
353 | 
354 |   TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
355 | }
356 | 
357 | void haraka_h_4x(unsigned char *out, const unsigned char *in) {
358 |   u128 s[4][4], tmp, s_save[4][4];
359 | 
360 |   s[0][0] = LOAD(in);
361 |   s[0][1] = LOAD(in + 16);
362 |   s[0][2] = LOAD(in + 32);
363 |   s[0][3] = LOAD(in + 48);
364 |   s[1][0] = LOAD(in + 64);
365 |   s[1][1] = LOAD(in + 80);
366 |   s[1][2] = LOAD(in + 96);
367 |   s[1][3] = LOAD(in + 112);
368 |   s[2][0] = LOAD(in + 128);
369 |   s[2][1] = LOAD(in + 144);
370 |   s[2][2] = LOAD(in + 160);
371 |   s[2][3] = LOAD(in + 176);
372 |   s[3][0] = LOAD(in + 192);
373 |   s[3][1] = LOAD(in + 208);
374 |   s[3][2] = LOAD(in + 224);
375 |   s[3][3] = LOAD(in + 240);
376 | 
377 |   s_save[0][0] = s[0][0];
378 |   s_save[0][1] = s[0][1];
379 |   s_save[0][2] = s[0][2];
380 |   s_save[0][3] = s[0][3];
381 |   s_save[1][0] = s[1][0];
382 |   s_save[1][1] = s[1][1];
383 |   s_save[1][2] = s[1][2];
384 |   s_save[1][3] = s[1][3];
385 |   s_save[2][0] = s[2][0];
386 |   s_save[2][1] = s[2][1];
387 |   s_save[2][2] = s[2][2];
388 |   s_save[2][3] = s[2][3];
389 |   s_save[3][0] = s[3][0];
390 |   s_save[3][1] = s[3][1];
391 |   s_save[3][2] = s[3][2];
392 |   s_save[3][3] = s[3][3];
393 | 
394 |   AES4_4x(s[0], s[1], s[2], s[3], 0);
395 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
396 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
397 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
398 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
399 | 
400 |   AES4_4x(s[0], s[1], s[2], s[3], 8);
401 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
402 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
403 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
404 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
405 | 
406 |   AES4_4x(s[0], s[1], s[2], s[3], 16);
407 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
408 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
409 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
410 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
411 | 
412 |   AES4_4x(s[0], s[1], s[2], s[3], 24);
413 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
414 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
415 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
416 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
417 | 
418 |   AES4_4x(s[0], s[1], s[2], s[3], 32);
419 |   s[0][0] = XOR(s[0][0], rc512[40]);
420 |   s[0][1] = XOR(s[0][1], rc512[41]);
421 |   s[0][2] = XOR(s[0][2], rc512[42]);
422 |   s[0][3] = XOR(s[0][3], rc512[43]);
423 |   s[1][0] = XOR(s[1][0], rc512[40]);
424 |   s[1][1] = XOR(s[1][1], rc512[41]);
425 |   s[1][2] = XOR(s[1][2], rc512[42]);
426 |   s[1][3] = XOR(s[1][3], rc512[43]);
427 |   s[2][0] = XOR(s[2][0], rc512[40]);
428 |   s[2][1] = XOR(s[2][1], rc512[41]);
429 |   s[2][2] = XOR(s[2][2], rc512[42]);
430 |   s[2][3] = XOR(s[2][3], rc512[43]);
431 |   s[3][0] = XOR(s[3][0], rc512[40]);
432 |   s[3][1] = XOR(s[3][1], rc512[41]);
433 |   s[3][2] = XOR(s[3][2], rc512[42]);
434 |   s[3][3] = XOR(s[3][3], rc512[43]);
435 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
436 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
437 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
438 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
439 | 
440 |   s[0][0] = XOR(s[0][0], s_save[0][0]);
441 |   s[0][1] = XOR(s[0][1], s_save[0][1]);
442 |   s[0][2] = XOR(s[0][2], s_save[0][2]);
443 |   s[0][3] = XOR(s[0][3], s_save[0][3]);
444 |   s[1][0] = XOR(s[1][0], s_save[1][0]);
445 |   s[1][1] = XOR(s[1][1], s_save[1][1]);
446 |   s[1][2] = XOR(s[1][2], s_save[1][2]);
447 |   s[1][3] = XOR(s[1][3], s_save[1][3]);
448 |   s[2][0] = XOR(s[2][0], s_save[2][0]);
449 |   s[2][1] = XOR(s[2][1], s_save[2][1]);
450 |   s[2][2] = XOR(s[2][2], s_save[2][2]);
451 |   s[2][3] = XOR(s[2][3], s_save[2][3]);
452 |   s[3][0] = XOR(s[3][0], s_save[3][0]);
453 |   s[3][1] = XOR(s[3][1], s_save[3][1]);
454 |   s[3][2] = XOR(s[3][2], s_save[3][2]);
455 |   s[3][3] = XOR(s[3][3], s_save[3][3]);
456 | 
457 |   TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
458 |   TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
459 |   TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
460 |   TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
461 | }
462 | 
/*
 * Eight-way variant: runs haraka_h_4x on two consecutive groups of four
 * inputs (256 input bytes and 128 output bytes per group).
 */
void haraka_h_8x(unsigned char *out, const unsigned char *in) {
  int group;

  for (group = 0; group < 2; ++group) {
    haraka_h_4x(out + 128 * group, in + 256 * group);
  }
}
467 | 


--------------------------------------------------------------------------------
/code/c/neon/haraka.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Implementation of Haraka256 and Haraka512 for ARM.
  3 | */
  4 | #ifndef HARAKA_H_
  5 | #define HARAKA_H_
  6 | 
  7 | #include <arm_neon.h>
  8 | 
  9 | #define u64 unsigned long
 10 | #define u128 uint8x16_t
 11 | 
 12 | 
 13 | // Note that the round constants differ from the x86 implementation due to the
 14 | // different order in which the key is added with the ARM AES instruction set.
 15 | 
 16 | static const uint8x16_t rc256[22] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 17 |                                      {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 18 |                                      {0x9d,0x7b,0x81,0x75,0xf0,0xfe,0xc5,0xb2,0xa,0xc0,0x20,0xe6,0x4c,0x70,0x84,0x6},
 19 |                                      {0x17,0xf7,0x8,0x2f,0xa4,0x6b,0xf,0x64,0x6b,0xa0,0xf3,0x88,0xe1,0xb4,0x66,0x8b},
 20 |                                      {0x14,0x91,0x2,0x9f,0x79,0x4f,0x5b,0xfd,0x60,0x9d,0x2,0xcf,0xaf,0xbc,0xf3,0xbb},
 21 |                                      {0x98,0x84,0xf2,0x53,0x8,0x4f,0x7b,0x2e,0x2d,0xde,0x2,0x34,0xe6,0xea,0xd6,0xe},
 22 |                                      {0x44,0x70,0x39,0xbe,0x1c,0xcd,0xee,0x79,0x8b,0x44,0x72,0x48,0xcb,0xb0,0xcf,0xcb},
 23 |                                      {0x7b,0x5,0x8a,0x2b,0xed,0x35,0x53,0x8d,0xb7,0x32,0x90,0x6e,0xee,0xcd,0xea,0x7e},
 24 |                                      {0x1b,0xef,0x4f,0xda,0x3b,0xb,0xc7,0x1f,0x61,0x27,0x41,0xe2,0xe2,0xfd,0x5f,0x67},
 25 |                                      {0xd0,0x7c,0x2e,0x5e,0x7,0xcc,0xca,0xaf,0x43,0x8f,0xc2,0x67,0xb0,0xd9,0x24,0x29},
 26 |                                      {0xee,0x65,0xd4,0xb9,0xca,0x8f,0xdb,0xec,0xe9,0x7f,0x86,0xe6,0xf1,0x63,0x4d,0xab},
 27 |                                      {0x33,0x7e,0x3,0xad,0x4f,0x40,0x2a,0x5b,0x64,0xcd,0xb7,0xd4,0x84,0xbf,0x30,0x1c},
 28 |                                      {0x0,0x98,0xf6,0x8d,0x8a,0x2d,0x9d,0x5c,0x2e,0x8b,0x2,0x69,0xc8,0x9e,0xaa,0x4a},
 29 |                                      {0xbf,0x23,0x17,0x94,0x72,0x55,0x6f,0xde,0xb9,0xb,0xcc,0xb2,0xa6,0x78,0x4,0xfa},
 30 |                                      {0xd4,0x9f,0x12,0x29,0x2e,0x4f,0xfa,0xe,0x12,0x2a,0x77,0x6b,0x2b,0x9f,0xb4,0xdf},
 31 |                                      {0xee,0x12,0x6a,0xbb,0xae,0x11,0xd6,0x32,0x36,0xa2,0x49,0xf4,0x44,0x3,0xa1,0x1e},
 32 |                                      {0xa6,0xec,0xa8,0x9c,0xec,0x93,0xe5,0x27,0xc9,0x0,0x96,0x5f,0xe3,0xc7,0xa2,0x78},
 33 |                                      {0x84,0x0,0x5,0x4b,0x4f,0x9c,0x19,0x9d,0x88,0x49,0x4,0xaf,0xd8,0x5e,0x2,0x21},
 34 |                                      {0x73,0x1,0xd4,0x82,0xcd,0x2e,0x28,0xb9,0xb7,0xc9,0x59,0xa7,0xf8,0xaa,0x3a,0xbf},
 35 |                                      {0x6b,0x7d,0x30,0x10,0xd9,0xef,0xf2,0x37,0x17,0xb0,0x86,0x61,0xd,0x70,0x60,0x62},
 36 |                                      {0xc6,0x9a,0xfc,0xf6,0x53,0x91,0xc2,0x81,0x43,0x4,0x30,0x21,0xc2,0x45,0xca,0x5a},
 37 |                                      {0x3a,0x94,0xd1,0x36,0xe8,0x92,0xaf,0x2c,0xbb,0x68,0x6b,0x22,0x3c,0x97,0x23,0x92}};
 38 | 
 39 | 
 40 | static const uint8x16_t rc512[44] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 41 |                                      {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 42 |                                      {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 43 |                                      {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
 44 |                                      {0x9d,0x7b,0x81,0x75,0xf0,0xfe,0xc5,0xb2,0xa,0xc0,0x20,0xe6,0x4c,0x70,0x84,0x6},
 45 |                                      {0x17,0xf7,0x8,0x2f,0xa4,0x6b,0xf,0x64,0x6b,0xa0,0xf3,0x88,0xe1,0xb4,0x66,0x8b},
 46 |                                      {0x14,0x91,0x2,0x9f,0x60,0x9d,0x2,0xcf,0x98,0x84,0xf2,0x53,0x2d,0xde,0x2,0x34},
 47 |                                      {0x79,0x4f,0x5b,0xfd,0xaf,0xbc,0xf3,0xbb,0x8,0x4f,0x7b,0x2e,0xe6,0xea,0xd6,0xe},
 48 |                                      {0xcb,0xb0,0xcf,0xcb,0x43,0x8f,0xc2,0x67,0xee,0xcd,0xea,0x7e,0xb0,0xd9,0x24,0x29},
 49 |                                      {0x1b,0xef,0x4f,0xda,0x44,0x70,0x39,0xbe,0x3b,0xb,0xc7,0x1f,0x7b,0x5,0x8a,0x2b},
 50 |                                      {0x61,0x27,0x41,0xe2,0x1c,0xcd,0xee,0x79,0xe2,0xfd,0x5f,0x67,0xed,0x35,0x53,0x8d},
 51 |                                      {0x8b,0x44,0x72,0x48,0xd0,0x7c,0x2e,0x5e,0xb7,0x32,0x90,0x6e,0x7,0xcc,0xca,0xaf},
 52 |                                      {0xee,0x65,0xd4,0xb9,0xca,0x8f,0xdb,0xec,0xe9,0x7f,0x86,0xe6,0xf1,0x63,0x4d,0xab},
 53 |                                      {0x33,0x7e,0x3,0xad,0x4f,0x40,0x2a,0x5b,0x64,0xcd,0xb7,0xd4,0x84,0xbf,0x30,0x1c},
 54 |                                      {0x0,0x98,0xf6,0x8d,0x2e,0x8b,0x2,0x69,0xbf,0x23,0x17,0x94,0xb9,0xb,0xcc,0xb2},
 55 |                                      {0x8a,0x2d,0x9d,0x5c,0xc8,0x9e,0xaa,0x4a,0x72,0x55,0x6f,0xde,0xa6,0x78,0x4,0xfa},
 56 |                                      {0x2b,0x9f,0xb4,0xdf,0x88,0x49,0x4,0xaf,0x44,0x3,0xa1,0x1e,0xd8,0x5e,0x2,0x21},
 57 |                                      {0xa6,0xec,0xa8,0x9c,0xd4,0x9f,0x12,0x29,0xec,0x93,0xe5,0x27,0xee,0x12,0x6a,0xbb},
 58 |                                      {0xc9,0x0,0x96,0x5f,0x2e,0x4f,0xfa,0xe,0xe3,0xc7,0xa2,0x78,0xae,0x11,0xd6,0x32},
 59 |                                      {0x12,0x2a,0x77,0x6b,0x84,0x0,0x5,0x4b,0x36,0xa2,0x49,0xf4,0x4f,0x9c,0x19,0x9d},
 60 |                                      {0x73,0x1,0xd4,0x82,0xcd,0x2e,0x28,0xb9,0xb7,0xc9,0x59,0xa7,0xf8,0xaa,0x3a,0xbf},
 61 |                                      {0x6b,0x7d,0x30,0x10,0xd9,0xef,0xf2,0x37,0x17,0xb0,0x86,0x61,0xd,0x70,0x60,0x62},
 62 |                                      {0xc6,0x9a,0xfc,0xf6,0x53,0x91,0xc2,0x81,0x43,0x4,0x30,0x21,0xc2,0x45,0xca,0x5a},
 63 |                                      {0x3a,0x94,0xd1,0x36,0xe8,0x92,0xaf,0x2c,0xbb,0x68,0x6b,0x22,0x3c,0x97,0x23,0x92},
 64 |                                      {0x38,0x92,0xbf,0xd3,0x68,0x62,0x60,0xbb,0xe5,0x3c,0x86,0xdb,0xdc,0xd3,0x4b,0x73},
 65 |                                      {0xb1,0x12,0x22,0xcb,0xb4,0x71,0x10,0xe5,0x7d,0xf7,0x2b,0xc7,0x8d,0x12,0xe1,0x24},
 66 |                                      {0xe3,0x8d,0xe4,0x83,0x58,0xb9,0xba,0x6c,0x4e,0x1a,0xb9,0x2d,0xdd,0xfd,0x3d,0x93},
 67 |                                      {0xeb,0x86,0x58,0x22,0x9c,0xa0,0xeb,0xff,0x77,0xc6,0xf0,0xae,0x9c,0xd1,0xe4,0xe2},
 68 |                                      {0x4e,0x92,0xb3,0x2c,0xc4,0x15,0x14,0x4b,0x43,0x1b,0x30,0x61,0xc3,0x47,0xbb,0x43},
 69 |                                      {0x99,0x68,0xeb,0x16,0xdd,0x31,0xb2,0x3,0xf6,0xef,0x7,0xe7,0xa8,0x75,0xa7,0xdb},
 70 |                                      {0x2c,0x47,0xca,0x7e,0x2,0x23,0x5e,0x8e,0x77,0x59,0x75,0x3c,0x4b,0x61,0xf3,0x6d},
 71 |                                      {0xf9,0x17,0x86,0xb8,0xb9,0xe5,0x1b,0x6d,0x77,0x7d,0xde,0xd6,0x17,0x5a,0xa7,0xcd},
 72 |                                      {0xf0,0x43,0x6b,0xec,0x75,0xc,0xee,0x2c,0x50,0x69,0x1e,0xcb,0xa1,0xa5,0xb1,0xf0},
 73 |                                      {0xd9,0xd0,0xe,0x60,0x5d,0xee,0x46,0xa9,0x50,0xa3,0xa4,0x63,0xc1,0x27,0xf3,0x3b},
 74 |                                      {0x53,0x3,0xed,0xe4,0x9d,0x6,0x6c,0x9d,0xbc,0xba,0xbb,0x80,0x59,0x11,0x53,0xa2},
 75 |                                      {0xaa,0xe9,0xa8,0x6b,0x9c,0x61,0xda,0x0,0x2b,0x33,0x57,0xf9,0xab,0xc,0xe9,0x96},
 76 |                                      {0x39,0xca,0x8d,0x93,0x30,0xde,0xd,0xab,0x88,0x29,0x96,0x5e,0x2,0xb1,0x3d,0xae},
 77 |                                      {0x42,0xb4,0x75,0x2e,0xa8,0xf3,0x14,0x88,0xb,0xa4,0x54,0xd5,0x38,0x8f,0xbb,0x17},
 78 |                                      {0xf6,0x16,0xa,0x36,0x79,0xb7,0xb6,0xae,0xd7,0x7f,0x42,0x5f,0x5b,0x8a,0xbb,0x34},
 79 |                                      {0xde,0xaf,0xba,0xff,0x18,0x59,0xce,0x43,0x38,0x54,0xe5,0xcb,0x41,0x52,0xf6,0x26},
 80 |                                      {0x78,0xc9,0x9e,0x83,0xf7,0x9c,0xca,0xa2,0x6a,0x2,0xf3,0xb9,0x54,0x9a,0xe9,0x4c},
 81 |                                      {0x35,0x12,0x90,0x22,0x28,0x6e,0xc0,0x40,0xbe,0xf7,0xdf,0x1b,0x1a,0xa5,0x51,0xae},
 82 |                                      {0xcf,0x59,0xa6,0x48,0xf,0xbc,0x73,0xc1,0x2b,0xd2,0x7e,0xba,0x3c,0x61,0xc1,0xa0},
 83 |                                      {0xa1,0x9d,0xc5,0xe9,0xfd,0xbd,0xd6,0x4a,0x88,0x82,0x28,0x2,0x3,0xcc,0x6a,0x75}};
 84 | 
 85 | #define XOR(a, b) veorq_u8(a, b)
 86 | #define LOAD(src) vld1q_u8(src)
 87 | #define STORE(dest,src) vst1q_u8(dest,src)
 88 | #define ZIP2(a, b) (u128) vzip2q_u32((uint32x4_t)a, (uint32x4_t)b)
 89 | #define ZIP1(a, b) (u128) vzip1q_u32((uint32x4_t)a, (uint32x4_t)b)
 90 | 
 91 | #define AES2(s0, s1, rci) \
 92 |   s0 = vaesmcq_u8(vaeseq_u8(s0, rc256[rci])); \
 93 |   s1 = vaesmcq_u8(vaeseq_u8(s1, rc256[rci + 1])); \
 94 |   s0 = vaesmcq_u8(vaeseq_u8(s0, rc256[rci + 2])); \
 95 |   s1 = vaesmcq_u8(vaeseq_u8(s1, rc256[rci + 3]));
 96 | 
 97 | #define AES2_4x(s0, s1, s2, s3, rci) \
 98 |   AES2(s0[0], s0[1], rci); \
 99 |   AES2(s1[0], s1[1], rci); \
100 |   AES2(s2[0], s2[1], rci); \
101 |   AES2(s3[0], s3[1], rci);
102 | 
103 | #define AES2_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
104 |   AES2_4x(s0, s1, s2, s3, rci); \
105 |   AES2_4x(s4, s5, s6, s7, rci);
106 | 
/*
 * AES4: two AESE+AESMC steps on each of the four state registers, using
 * rc512[rci] .. rc512[rci + 7] as keys. The last line deliberately has no
 * trailing backslash, so the macro cannot swallow whatever follows it.
 */
#define AES4(s0, s1, s2, s3, rci) \
  s0 = vaesmcq_u8(vaeseq_u8(s0, rc512[(rci)])); \
  s1 = vaesmcq_u8(vaeseq_u8(s1, rc512[(rci) + 1])); \
  s2 = vaesmcq_u8(vaeseq_u8(s2, rc512[(rci) + 2])); \
  s3 = vaesmcq_u8(vaeseq_u8(s3, rc512[(rci) + 3])); \
  s0 = vaesmcq_u8(vaeseq_u8(s0, rc512[(rci) + 4])); \
  s1 = vaesmcq_u8(vaeseq_u8(s1, rc512[(rci) + 5])); \
  s2 = vaesmcq_u8(vaeseq_u8(s2, rc512[(rci) + 6])); \
  s3 = vaesmcq_u8(vaeseq_u8(s3, rc512[(rci) + 7]));

/* AES4 applied to four states; each argument is an array of four u128. */
#define AES4_4x(s0, s1, s2, s3, rci) \
  AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \
  AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \
  AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \
  AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci);

/* AES4 applied to eight states. */
#define AES4_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
  AES4_4x(s0, s1, s2, s3, rci); \
  AES4_4x(s4, s5, s6, s7, rci);
126 | 
/*
 * MIX2: interleaves the 32-bit words of a two-register state in place.
 * Needs a u128 variable named `tmp` in the enclosing scope.
 * With s0 = {a0,a1,a2,a3} and s1 = {b0,b1,b2,b3} on entry, the result is
 * s0 = {a0,b0,a1,b1} and s1 = {a2,b2,a3,b3}.
 */
#define MIX2(s0, s1) \
  tmp = ZIP2(s0, s1); \
  s0 = ZIP1(s0, s1); \
  s1 = tmp;

/*
 * MIX4: permutes the 32-bit words of a four-register state in place.
 * Needs a u128 variable named `tmp` in the enclosing scope. The statement
 * order matters: each register is overwritten only after its old value has
 * been consumed. Writing xN for word N of input register x (a = s0, b = s1,
 * c = s2, d = s3), the result is
 *   s0 = {a3,c3,b3,d3}, s1 = {c0,a0,d0,b0},
 *   s2 = {c1,a1,d1,b1}, s3 = {a2,c2,b2,d2}.
 */
#define MIX4(s0, s1, s2, s3) \
  tmp  = ZIP1(s0, s1); \
  s0 = ZIP2(s0, s1); \
  s1 = ZIP1(s2, s3); \
  s2 = ZIP2(s2, s3); \
  s3 = ZIP1(s0, s2); \
  s0 = ZIP2(s0, s2); \
  s2 = ZIP2(s1, tmp); \
  s1 = ZIP1(s1, tmp);
141 | 
/*
 * TRUNCSTORE: writes 32 bytes to `out`: the upper 8 bytes of s0 and s1
 * followed by the lower 8 bytes of s2 and s3.
 * Byte-wise NEON stores are used instead of writing through a cast u64
 * pointer, so `out` needs no particular alignment, no aliasing rule is
 * bent, and no compiler-specific vector subscripting is required.
 */
#define TRUNCSTORE(out, s0, s1, s2, s3) \
  vst1_u8((out), vget_high_u8(s0)); \
  vst1_u8((out) + 8, vget_high_u8(s1)); \
  vst1_u8((out) + 16, vget_low_u8(s2)); \
  vst1_u8((out) + 24, vget_low_u8(s3));
147 | 
/* Declared with (void) so that calls passing arguments are diagnosed. */
void haraka_testvectors(void);

/* haraka_f*: presumably the 256-bit-input variant built on AES2/rc256, with
   the _4x/_8x forms handling 4 or 8 inputs per call -- confirm in haraka.c. */
void haraka_f(unsigned char *out, const unsigned char *in);
void haraka_f_4x(unsigned char *out, const unsigned char *in);
void haraka_f_8x(unsigned char *out, const unsigned char *in);

/* haraka_h*: 512-bit-input variant; each input is 64 bytes and each output
   32 bytes. The _4x/_8x forms take 4 or 8 inputs stored back to back and
   write the outputs back to back. */
void haraka_h(unsigned char *out, const unsigned char *in);
void haraka_h_4x(unsigned char *out, const unsigned char *in);
void haraka_h_8x(unsigned char *out, const unsigned char *in);
157 | 
158 | 
159 | #endif
160 | 


--------------------------------------------------------------------------------
/code/python/ref.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import copy
  3 | 
  4 | MPAR = 1
  5 | ROUNDS = 5
  6 | AES_ROUNDS = 2
  7 | 
  8 | # AES S-box
  9 | S = [[0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76],
 10 | 	 [0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0],
 11 | 	 [0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15],
 12 | 	 [0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75],
 13 | 	 [0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84],
 14 | 	 [0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf],
 15 | 	 [0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8],
 16 | 	 [0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2],
 17 | 	 [0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73],
 18 | 	 [0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb],
 19 | 	 [0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79],
 20 | 	 [0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08],
 21 | 	 [0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a],
 22 | 	 [0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e],
 23 | 	 [0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf],
 24 | 	 [0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16]]
 25 | 
 26 | RC = [0x0684704ce620c00ab2c5fef075817b9d, 0x8b66b4e188f3a06b640f6ba42f08f717,
 27 | 	  0x3402de2d53f28498cf029d609f029114, 0x0ed6eae62e7b4f08bbf3bcaffd5b4f79,
 28 | 	  0xcbcfb0cb4872448b79eecd1cbe397044, 0x7eeacdee6e9032b78d5335ed2b8a057b,
 29 | 	  0x67c28f435e2e7cd0e2412761da4fef1b, 0x2924d9b0afcacc07675ffde21fc70b3b,
 30 | 	  0xab4d63f1e6867fe9ecdb8fcab9d465ee, 0x1c30bf84d4b7cd645b2a404fad037e33,
 31 | 	  0xb2cc0bb9941723bf69028b2e8df69800, 0xfa0478a6de6f55724aaa9ec85c9d2d8a,
 32 | 	  0xdfb49f2b6b772a120efa4f2e29129fd4, 0x1ea10344f449a23632d611aebb6a12ee,
 33 | 	  0xaf0449884b0500845f9600c99ca8eca6, 0x21025ed89d199c4f78a2c7e327e593ec,
 34 | 	  0xbf3aaaf8a759c9b7b9282ecd82d40173, 0x6260700d6186b01737f2efd910307d6b,
 35 | 	  0x5aca45c22130044381c29153f6fc9ac6, 0x9223973c226b68bb2caf92e836d1943a,
 36 | 	  0xd3bf9238225886eb6cbab958e51071b4, 0xdb863ce5aef0c677933dfddd24e1128d,
 37 | 	  0xbb606268ffeba09c83e48de3cb2212b1, 0x734bd3dce2e4d19c2db91a4ec72bf77d,
 38 | 	  0x43bb47c361301b434b1415c42cb3924e, 0xdba775a8e707eff603b231dd16eb6899,
 39 | 	  0x6df3614b3c7559778e5e23027eca472c, 0xcda75a17d6de7d776d1be5b9b88617f9,
 40 | 	  0xec6b43f06ba8e9aa9d6c069da946ee5d, 0xcb1e6950f957332ba25311593bf327c1,
 41 | 	  0x2cee0c7500da619ce4ed0353600ed0d9, 0xf0b1a5a196e90cab80bbbabc63a4a350,
 42 | 	  0xae3db1025e962988ab0dde30938dca39, 0x17bb8f38d554a40b8814f3a82e75b442,
 43 | 	  0x34bb8a5b5f427fd7aeb6b779360a16f6, 0x26f65241cbe5543843ce5918ffbaafde,
 44 | 	  0x4ce99a54b9f3026aa2ca9cf7839ec978, 0xae51a51a1bdff7be40c06e2822901235,
 45 | 	  0xa0c1613cba7ed22bc173bc0f48a659cf, 0x756acc03022882884ad6bdfde9c59da1]
 46 | 
 47 | # get padded hex for single byte
 48 | def hexbyte(x):
 49 | 	return hex(x)[2:].zfill(2)
 50 | 
 51 | # print list of bytes in hex
 52 | def ps(s):
 53 | 	return " ".join([hexbyte(x) for x in s])
 54 | 
 55 | # print state
 56 | def printstate(s):
 57 | 	for i in range(4):
 58 | 		if len(s) == 4:
 59 | 	 		q = [s[0][i],s[0][i+4],s[0][i+8],s[0][i+12],
 60 | 	 		 	 s[1][i],s[1][i+4],s[1][i+8],s[1][i+12],
 61 | 	 		 	 s[2][i],s[2][i+4],s[2][i+8],s[2][i+12],
 62 | 	 		 	 s[3][i],s[3][i+4],s[3][i+8],s[3][i+12]]
 63 | 	 	else:
 64 | 	 		q = [s[0][i],s[0][i+4],s[0][i+8],s[0][i+12],
 65 | 	 		 	 s[1][i],s[1][i+4],s[1][i+8],s[1][i+12]]
 66 | 	 	print " ".join([hexbyte(x) for x in q])
 67 | 	 	# print q
 68 | 	print ""
 69 | 
 70 | # multiply by 2 over GF(2^128)
 71 | def xtime(x):
 72 | 	if (x >> 7):
 73 | 		return ((x << 1) ^ 0x1b) & 0xff
 74 | 	else:
 75 | 		return (x << 1) & 0xff
 76 | 
 77 | # xor two lists element-wise
 78 | def xor(x,y):
 79 | 	return [x[i] ^ y[i] for i in range(16)]
 80 | 
 81 | # apply a single S-box
 82 | def sbox(x):
 83 | 	return S[(x >> 4)][x & 0xF]
 84 | 
 85 | # AES SubBytes
 86 | def subbytes(s):
 87 | 	return [sbox(x) for x in s]
 88 | 
 89 | # AES ShiftRows
 90 | def shiftrows(s):
 91 | 	return [s[0], s[5], s[10], s[15], 
 92 | 			s[4], s[9], s[14], s[3], 
 93 | 			s[8], s[13], s[2], s[7], 
 94 | 			s[12], s[1], s[6], s[11]]
 95 | 
 96 | # AES MixColumns
 97 | def mixcolumns(s):	
 98 | 	return list(itertools.chain(*
 99 | 		[[xtime(s[4*i]) ^ xtime(s[4*i+1]) ^ s[4*i+1] ^ s[4*i+2] ^ s[4*i+3],
100 | 		s[4*i] ^ xtime(s[4*i+1]) ^ xtime(s[4*i+2]) ^ s[4*i+2] ^ s[4*i+3],
101 | 		s[4*i] ^ s[4*i+1] ^ xtime(s[4*i+2]) ^ xtime(s[4*i+3]) ^ s[4*i+3],
102 | 		xtime(s[4*i]) ^ s[4*i] ^ s[4*i+1] ^ s[4*i+2] ^ xtime(s[4*i+3])] 
103 | 		for i in range(4)]))
104 | 	
# one regular AES round: SubBytes, ShiftRows, MixColumns, then xor with the
# round key taken in reversed byte order
def aesenc(s, rk):
	mixed = mixcolumns(shiftrows(subbytes(s)))
	return xor(mixed, rk[::-1])
112 | 
# treat each group of 4 consecutive entries as a big-endian 32-bit value,
# shift it left by one bit and split it back into 4 bytes (overflow is dropped)
def shift32(x):
	out = []
	for base in (0, 4, 8, 12):
		word = ((x[base] << 24) ^ (x[base+1] << 16) ^ (x[base+2] << 8) ^ x[base+3]) << 1
		for shift in (24, 16, 8, 0):
			out.append((word >> shift) & 0xFF)
	return out
118 | 
# linear mixing for Haraka-512/256: rearranges the 4-byte words of the four
# 16-byte blocks
def mix512(s):
	# word(b, k) is the k-th 4-byte word of block b
	def word(b, k):
		return s[b][4*k:4*k+4]

	return [word(0, 3) + word(2, 3) + word(1, 3) + word(3, 3),
			word(2, 0) + word(0, 0) + word(3, 0) + word(1, 0),
			word(2, 1) + word(0, 1) + word(3, 1) + word(1, 1),
			word(0, 2) + word(2, 2) + word(1, 2) + word(3, 2)]
125 | 
# linear mixing for Haraka-256/256: interleaves the 4-byte words of the two
# 16-byte blocks
def mix256(s):
	# word(b, k) is the k-th 4-byte word of block b
	def word(b, k):
		return s[b][4*k:4*k+4]

	return [word(0, 0) + word(1, 0) + word(0, 1) + word(1, 1),
			word(0, 2) + word(1, 2) + word(0, 3) + word(1, 3)]
130 | 
# convert a round constant (integer) to a list of 16 bytes, most significant
# byte first
def convRC(rc):
	# "%032x" formats to 32 zero-padded hex digits without the "0x" prefix or
	# the "L" suffix of Python 2 longs; slicing hex(rc)[2:-1] only worked for
	# longs and dropped the last digit otherwise
	rcstr = "%032x" % rc
	return [int(rcstr[i:i+2], 16) for i in range(0, 32, 2)]
135 | 
# Haraka-512/256: maps a 64-byte message (list of byte values) to a 32-byte
# digest (list of byte values), printing the intermediate states on the way
def haraka512256(msg):
	# split the message into four 16-byte blocks
	s = [msg[i:i+16] for i in [0,16,32,48]]

	# single-argument print(...) behaves the same on Python 2 and 3
	print("= input state =")
	printstate(s)

	# apply round functions
	for t in range(ROUNDS):
		# AES_ROUNDS AES rounds per block, each with its own round constant
		for m in range(AES_ROUNDS):
			s = [aesenc(s[i], convRC(RC[4*t*AES_ROUNDS + 4*m + i])) for i in range(4)]

		print("= round %d : after aes layer =" % t)
		printstate(s)

		# now apply mixing
		s = mix512(s)

		print("= round %d : after mix layer =" % t)
		printstate(s)

	print("= output from permutation =")
	printstate(s)

	# apply feed-forward
	s = [xor(s[i], msg[16*i:16*(i+1)]) for i in range(4)]

	print("= after feed-forward =")
	printstate(s)

	# truncation: upper halves of blocks 0 and 1, lower halves of blocks 2 and 3
	return s[0][8:] + s[1][8:] + s[2][0:8] + s[3][0:8]
171 | 
# Haraka-256/256: maps the first 32 bytes of msg (list of byte values) to a
# 32-byte digest (list of byte values), printing the intermediate states
def haraka256256(msg):
	# split the message into two 16-byte blocks
	s = [msg[i:i+16] for i in [0,16]]

	# single-argument print(...) behaves the same on Python 2 and 3
	print("= input state =")
	printstate(s)

	# apply round functions
	for t in range(ROUNDS):
		# AES_ROUNDS AES rounds per block, each with its own round constant
		for m in range(AES_ROUNDS):
			s = [aesenc(s[i], convRC(RC[2*t*AES_ROUNDS + 2*m + i])) for i in range(2)]

		print("= round %d : after aes layer =" % t)
		printstate(s)

		# now apply mixing
		s = mix256(s)

		print("= round %d : after mix layer =" % t)
		printstate(s)

	print("= output from permutation =")
	printstate(s)

	# apply feed-forward
	s = [xor(s[i], msg[16*i:16*(i+1)]) for i in range(2)]

	print("= after feed-forward =")
	printstate(s)

	# no truncation here: both blocks are returned in full
	return list(itertools.chain(*s))
208 | 
209 | 
# set some message bytes (0, 1, ..., 63)
m = [i for i in range(64)]

# print input; single-argument print(...) behaves the same on Python 2 and 3
print("= input bytes =")
print(ps(m) + "\n")

# call Haraka-512/256
digest = haraka512256(m)

# print digest
print("= haraka-512/256 output bytes =")
print(ps(digest) + "\n")

# call Haraka-256/256 (only the first 32 bytes of m are used)
digest = haraka256256(m)

# print digest
print("= haraka-256/256 output bytes =")
print(ps(digest) + "\n")


--------------------------------------------------------------------------------
/supercop/crypto_sign/measure.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include "randombytes.h"
 3 | #include "cpucycles.h"
 4 | #include "crypto_sign.h"
 5 | #include "permute.h"
 6 | 
 7 | extern void printentry(long long,const char *,long long *,long long);
 8 | extern unsigned char *alignedcalloc(unsigned long long);
 9 | extern const char *primitiveimplementation;
10 | extern const char *implementationversion;
11 | extern const char *sizenames[];
12 | extern const long long sizes[];
13 | extern void allocate(void);
14 | extern void measure(void);
15 | 
16 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION;
17 | const char *implementationversion = crypto_sign_VERSION;
18 | const char *sizenames[] = { "outputbytes", "publickeybytes", "secretkeybytes", 0 };
19 | const long long sizes[] = { crypto_sign_BYTES, crypto_sign_PUBLICKEYBYTES, crypto_sign_SECRETKEYBYTES };
20 | 
21 | #define MAXTEST_BYTES 100000
22 | 
23 | static unsigned char *pk;
24 | static unsigned char *sk;
25 | static unsigned char *m; unsigned long long mlen;
26 | static unsigned char *sm; unsigned long long smlen;
27 | static unsigned char *t; unsigned long long tlen;
28 | 
/*
 * Pre-allocation hook: calls RAND_status() only when RAND_R_PRNG_NOT_SEEDED
 * is defined at compile time; otherwise the function body is empty.
 */
void preallocate(void)
{
#ifdef RAND_R_PRNG_NOT_SEEDED
  RAND_status();
#endif
}
35 | 
36 | void allocate(void)
37 | {
38 |   pk = alignedcalloc(crypto_sign_PUBLICKEYBYTES);
39 |   sk = alignedcalloc(crypto_sign_SECRETKEYBYTES);
40 |   m = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
41 |   sm = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
42 |   t = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
43 | }
44 | 
45 | #define TIMINGS 20
46 | static long long cycles[TIMINGS + 1];
47 | static long long bytes[TIMINGS + 1];
48 | 
49 | void measure(void)
50 | {
51 | 	int i;
52 | 	int loop;
53 | 	
54 | 	for (loop = 0;loop < LOOPS;++loop) {
55 | 		for (i = 0;i <= TIMINGS;++i) {
56 | 			cycles[i] = cpucycles();
57 | 			crypto_sign_keypair(pk,sk);
58 | 		}
59 | 		for (i = 0;i < TIMINGS;++i)
60 | 			cycles[i] = cycles[i + 1] - cycles[i];
61 | 		printentry(-1,"keypair_cycles",cycles,TIMINGS);
62 | 
63 | 		for (mlen = 0;mlen <= MAXTEST_BYTES;mlen += 1 + mlen / 4) {
64 | 			randombytes(m,mlen);
65 | 
66 | 			for (i = 0;i <= TIMINGS;++i) {
67 | 				cycles[i] = cpucycles();
68 | 				bytes[i] = crypto_sign(sm,&smlen,m,mlen,sk);
69 | 				if (bytes[i] == 0)
70 | 					bytes[i] = smlen;
71 | 			}
72 | 			for (i = 0;i < TIMINGS;++i)
73 | 				cycles[i] = cycles[i + 1] - cycles[i];
74 | 			printentry(mlen,"cycles",cycles,TIMINGS);
75 | 			printentry(mlen,"bytes",bytes,TIMINGS);
76 | 
77 | 			for (i = 0;i <= TIMINGS;++i) {
78 | 				cycles[i] = cpucycles();
79 | 				bytes[i] = crypto_sign_open(t,&tlen,sm,smlen,pk);
80 | 				if (bytes[i] == 0) bytes[i] = tlen;
81 | 			}
82 | 			for (i = 0;i < TIMINGS;++i)
83 | 				cycles[i] = cycles[i + 1] - cycles[i];
84 | 			printentry(mlen,"open_cycles",cycles,TIMINGS);
85 | 			printentry(mlen,"open_bytes",bytes,TIMINGS);
86 | 		}
87 | 	}
88 | }
89 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/measure.c~:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include "randombytes.h"
 3 | #include "cpucycles.h"
 4 | #include "crypto_sign.h"
 5 | 
 6 | extern void printentry(long long,const char *,long long *,long long);
 7 | extern unsigned char *alignedcalloc(unsigned long long);
 8 | extern const char *primitiveimplementation;
 9 | extern const char *implementationversion;
10 | extern const char *sizenames[];
11 | extern const long long sizes[];
12 | extern void allocate(void);
13 | extern void measure(void);
14 | 
15 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION;
16 | const char *implementationversion = crypto_sign_VERSION;
17 | const char *sizenames[] = { "outputbytes", "publickeybytes", "secretkeybytes", 0 };
18 | const long long sizes[] = { crypto_sign_BYTES, crypto_sign_PUBLICKEYBYTES, crypto_sign_SECRETKEYBYTES };
19 | 
20 | #define MAXTEST_BYTES 100000
21 | 
22 | static unsigned char *pk;
23 | static unsigned char *sk;
24 | static unsigned char *m; unsigned long long mlen;
25 | static unsigned char *sm; unsigned long long smlen;
26 | static unsigned char *t; unsigned long long tlen;
27 | 
/*
 * Pre-allocation hook: calls RAND_status() only when RAND_R_PRNG_NOT_SEEDED
 * is defined at compile time; otherwise the function body is empty.
 */
void preallocate(void)
{
#ifdef RAND_R_PRNG_NOT_SEEDED
  RAND_status();
#endif
}
34 | 
35 | void allocate(void)
36 | {
37 |   pk = alignedcalloc(crypto_sign_PUBLICKEYBYTES);
38 |   sk = alignedcalloc(crypto_sign_SECRETKEYBYTES);
39 |   m = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
40 |   sm = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
41 |   t = alignedcalloc(MAXTEST_BYTES + crypto_sign_BYTES);
42 | }
43 | 
44 | #define TIMINGS 1
45 | static long long cycles[TIMINGS + 1];
46 | static long long bytes[TIMINGS + 1];
47 | 
48 | void measure(void)
49 | {
50 | 	int i;
51 | 	int loop;
52 | 
53 | 	for (loop = 0;loop < LOOPS;++loop) {
54 | 		for (i = 0;i <= TIMINGS;++i) {
55 | 			cycles[i] = cpucycles();
56 | 			crypto_sign_keypair(pk,sk);
57 | 		}
58 | 		for (i = 0;i < TIMINGS;++i)
59 | 			cycles[i] = cycles[i + 1] - cycles[i];
60 | 		printentry(-1,"keypair_cycles",cycles,TIMINGS);
61 | 
62 | 		for (mlen = 0;mlen <= MAXTEST_BYTES;mlen += 1 + mlen / 4) {
63 | 			randombytes(m,mlen);
64 | 
65 | 			for (i = 0;i <= TIMINGS;++i) {
66 | 				cycles[i] = cpucycles();
67 | 				bytes[i] = crypto_sign(sm,&smlen,m,mlen,sk);
68 | 				if (bytes[i] == 0)
69 | 					bytes[i] = smlen;
70 | 			}
71 | 			for (i = 0;i < TIMINGS;++i)
72 | 				cycles[i] = cycles[i + 1] - cycles[i];
73 | 			printentry(mlen,"cycles",cycles,TIMINGS);
74 | 			printentry(mlen,"bytes",bytes,TIMINGS);
75 | 
76 | 			for (i = 0;i <= TIMINGS;++i) {
77 | 				cycles[i] = cpucycles();
78 | 				bytes[i] = crypto_sign_open(t,&tlen,sm,smlen,pk);
79 | 				if (bytes[i] == 0) bytes[i] = tlen;
80 | 			}
81 | 			for (i = 0;i < TIMINGS;++i)
82 | 				cycles[i] = cycles[i + 1] - cycles[i];
83 | 			printentry(mlen,"open_cycles",cycles,TIMINGS);
84 | 			printentry(mlen,"open_bytes",bytes,TIMINGS);
85 | 		}
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/api.h:
--------------------------------------------------------------------------------
1 | #include "params.h"
2 | 
/* All sizes are derived from constants that are not defined in this header
   (presumably they come from params.h -- confirm). */

/* Secret key size: SEED_BYTES, plus the public key size minus one hash, plus
   SK_RAND_SEED_BYTES. CRYPTO_PUBLICKEYBYTES is defined on the next line; that
   is fine because macros are expanded at the point of use. */
#define CRYPTO_SECRETKEYBYTES (SEED_BYTES + CRYPTO_PUBLICKEYBYTES-HASH_BYTES + SK_RAND_SEED_BYTES)
/* Public key size: (N_MASKS + 1) values of HASH_BYTES each. */
#define CRYPTO_PUBLICKEYBYTES ((N_MASKS+1)*HASH_BYTES)
/* Signature size: MESSAGE_HASH_SEED_BYTES, TOTALTREE_HEIGHT bits rounded up to
   whole bytes, HORST_SIGBYTES, WOTS_SIGBYTES once per
   TOTALTREE_HEIGHT/SUBTREE_HEIGHT, and TOTALTREE_HEIGHT values of HASH_BYTES. */
#define CRYPTO_BYTES (MESSAGE_HASH_SEED_BYTES + (TOTALTREE_HEIGHT+7)/8 + HORST_SIGBYTES + (TOTALTREE_HEIGHT/SUBTREE_HEIGHT)*WOTS_SIGBYTES + TOTALTREE_HEIGHT*HASH_BYTES)
/* NOTE(review): presumably marks the scheme as deterministic for the build
   framework -- confirm. */
#define CRYPTO_DETERMINISTIC 1
7 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/consts.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | 
/*
 * Eight 32-bit constants, each repeated eight times in a row (64 words).
 * Read as little-endian ASCII the eight distinct words spell
 * "expand 32-byte to 64-byte state!".
 * NOTE(review): the 8-fold repetition presumably feeds an 8-way parallel
 * routine -- confirm against the user of this table.
 */
uint32_t hashc8x[64] = {
    0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865, 0x61707865,
    0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e,
    0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32,
    0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574, 0x74206574,
    0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f, 0x3436206f,
    0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d, 0x7479622d,
    0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065, 0x74732065,
    0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461, 0x21657461};

/*
 * Byte-index table: within every group of four bytes the indices are
 * {3,0,1,2}; the 16-entry pattern is stored twice (32 bytes).
 * NOTE(review): presumably a byte-shuffle mask that rotates each 32-bit word
 * by 8 bits -- confirm against the user of this table.
 */
unsigned char _rotate8[32]  = {3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,
                               3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14};

/*
 * Byte-index table: within every group of four bytes the indices are
 * {2,3,0,1}; the 16-entry pattern is stored twice (32 bytes).
 * NOTE(review): presumably a byte-shuffle mask that rotates each 32-bit word
 * by 16 bits -- confirm against the user of this table.
 */
unsigned char _rotate16[32] = {2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,
                               2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13};
18 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/haraka.c:
--------------------------------------------------------------------------------
#include "haraka.h"
#include <stdio.h>
#include <stdlib.h>
  3 | 
  4 | void load_constants() {
  5 |   rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
  6 |   rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
  7 |   rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
  8 |   rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
  9 |   rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
 10 |   rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
 11 |   rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
 12 |   rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
 13 |   rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
 14 |   rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
 15 |   rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
 16 |   rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
 17 |   rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
 18 |   rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
 19 |   rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
 20 |   rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
 21 |   rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
 22 |   rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
 23 |   rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
 24 |   rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
 25 |   rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);
 26 |   rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d);
 27 |   rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1);
 28 |   rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d);
 29 |   rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);
 30 |   rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899);
 31 |   rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c);
 32 |   rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9);
 33 |   rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);
 34 |   rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1);
 35 |   rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9);
 36 |   rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350);
 37 |   rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39);
 38 |   rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442);
 39 |   rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6);
 40 |   rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde);
 41 |   rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);
 42 |   rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235);
 43 |   rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf);
 44 |   rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1);
 45 |   #define CONSTANTSLOADED
 46 | }
 47 | 
 48 | void test_implementations() {
 49 |   unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
 50 |   unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
 51 |   unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
 52 |   unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
 53 |                                      0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
 54 |                                      0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
 55 |                                      0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};
 56 | 
 57 |  unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
 58 |                                     0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
 59 |                                     0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
 60 |                                     0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
 61 | 
 62 | 
 63 | 
 64 |   int i;
 65 | 
 66 |   // Input for testvector
 67 |   for(i = 0; i < 512; i++) {
 68 |     in[i] = i % 64;
 69 |   }
 70 | 
 71 |   load_constants();
 72 |   haraka512_8x(out512, in);
 73 | 
 74 |   // Verify output
 75 |   for(i = 0; i < 32; i++) {
 76 |     if (out512[i % 32] != testvector512[i]) {
 77 |       printf("Error: testvector incorrect.\n");
 78 |       return;
 79 |     }
 80 |   }
 81 | 
 82 |   free(in);
 83 |   free(out256);
 84 |   free(out512);
 85 | }
 86 | 
 87 | void haraka256(unsigned char *out, const unsigned char *in) {
 88 |   #ifndef CONSTANTSLOADED
 89 |   load_constants();
 90 |   #endif
 91 |   __m128i s[2], tmp;
 92 | 
 93 |   s[0] = LOAD(in);
 94 |   s[1] = LOAD(in + 16);
 95 | 
 96 |   AES2(s[0], s[1], 0);
 97 |   MIX2(s[0], s[1]);
 98 | 
 99 |   AES2(s[0], s[1], 4);
100 |   MIX2(s[0], s[1]);
101 | 
102 |   AES2(s[0], s[1], 8);
103 |   MIX2(s[0], s[1]);
104 | 
105 |   AES2(s[0], s[1], 12);
106 |   MIX2(s[0], s[1]);
107 | 
108 |   AES2(s[0], s[1], 16);
109 |   MIX2(s[0], s[1]);
110 | 
111 |   s[0] = _mm_xor_si128(s[0], LOAD(in));
112 |   s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
113 | 
114 |   STORE(out, s[0]);
115 |   STORE(out + 16, s[1]);
116 | }
117 | 
118 | void haraka256_4x(unsigned char *out, const unsigned char *in) {
119 |   #ifndef CONSTANTSLOADED
120 |   load_constants();
121 |   #endif
122 |   __m128i s[4][2], tmp;
123 | 
124 |   s[0][0] = LOAD(in);
125 |   s[0][1] = LOAD(in + 16);
126 |   s[1][0] = LOAD(in + 32);
127 |   s[1][1] = LOAD(in + 48);
128 |   s[2][0] = LOAD(in + 64);
129 |   s[2][1] = LOAD(in + 80);
130 |   s[3][0] = LOAD(in + 96);
131 |   s[3][1] = LOAD(in + 112);
132 | 
133 |   // Round 1
134 |   AES2_4x(s[0], s[1], s[2], s[3], 0);
135 | 
136 |   MIX2(s[0][0], s[0][1]);
137 |   MIX2(s[1][0], s[1][1]);
138 |   MIX2(s[2][0], s[2][1]);
139 |   MIX2(s[3][0], s[3][1]);
140 | 
141 |   // Round 2
142 |   AES2_4x(s[0], s[1], s[2], s[3], 4);
143 | 
144 |   MIX2(s[0][0], s[0][1]);
145 |   MIX2(s[1][0], s[1][1]);
146 |   MIX2(s[2][0], s[2][1]);
147 |   MIX2(s[3][0], s[3][1]);
148 | 
149 |   // Round 3
150 |   AES2_4x(s[0], s[1], s[2], s[3], 8);
151 | 
152 |   MIX2(s[0][0], s[0][1]);
153 |   MIX2(s[1][0], s[1][1]);
154 |   MIX2(s[2][0], s[2][1]);
155 |   MIX2(s[3][0], s[3][1]);
156 | 
157 |   // Round 4
158 |   AES2_4x(s[0], s[1], s[2], s[3], 12);
159 | 
160 |   MIX2(s[0][0], s[0][1]);
161 |   MIX2(s[1][0], s[1][1]);
162 |   MIX2(s[2][0], s[2][1]);
163 |   MIX2(s[3][0], s[3][1]);
164 | 
165 |   // Round 5
166 |   AES2_4x(s[0], s[1], s[2], s[3], 16);
167 | 
168 |   MIX2(s[0][0], s[0][1]);
169 |   MIX2(s[1][0], s[1][1]);
170 |   MIX2(s[2][0], s[2][1]);
171 |   MIX2(s[3][0], s[3][1]);
172 | 
173 |   // Feed Forward
174 |   s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
175 |   s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
176 |   s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
177 |   s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
178 |   s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
179 |   s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
180 |   s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
181 |   s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
182 | 
183 |   STORE(out, s[0][0]);
184 |   STORE(out + 16, s[0][1]);
185 |   STORE(out + 32, s[1][0]);
186 |   STORE(out + 48, s[1][1]);
187 |   STORE(out + 64, s[2][0]);
188 |   STORE(out + 80, s[2][1]);
189 |   STORE(out + 96, s[3][0]);
190 |   STORE(out + 112, s[3][1]);
191 | }
192 | 
// Haraka-256 on eight independent inputs: reads 8 x 32 bytes from `in` and
// writes 8 x 32 bytes to `out`.
//
// Implemented as two 4-way passes over consecutive 128-byte halves. A single
// interleaved 8-way pass can be written the same way as haraka256_4x by
// holding all eight states (s[8][2]) and using AES2_8x for the AES step.
void haraka256_8x(unsigned char *out, const unsigned char *in) {
  #ifndef CONSTANTSLOADED
  load_constants();
  #endif
  int half;

  for (half = 0; half < 2; half++) {
    haraka256_4x(out + 128 * half, in + 128 * half);
  }
}
317 | 
318 | void haraka512(unsigned char *out, const unsigned char *in) {
319 |   #ifndef CONSTANTSLOADED
320 |   load_constants();
321 |   #endif
322 |   u128 s[4], tmp;
323 | 
324 |   s[0] = LOAD(in);
325 |   s[1] = LOAD(in + 16);
326 |   s[2] = LOAD(in + 32);
327 |   s[3] = LOAD(in + 48);
328 | 
329 |   AES4(s[0], s[1], s[2], s[3], 0);
330 |   MIX4(s[0], s[1], s[2], s[3]);
331 | 
332 |   AES4(s[0], s[1], s[2], s[3], 8);
333 |   MIX4(s[0], s[1], s[2], s[3]);
334 | 
335 |   AES4(s[0], s[1], s[2], s[3], 16);
336 |   MIX4(s[0], s[1], s[2], s[3]);
337 | 
338 |   AES4(s[0], s[1], s[2], s[3], 24);
339 |   MIX4(s[0], s[1], s[2], s[3]);
340 | 
341 |   AES4(s[0], s[1], s[2], s[3], 32);
342 |   MIX4(s[0], s[1], s[2], s[3]);
343 | 
344 |   s[0] = _mm_xor_si128(s[0], LOAD(in));
345 |   s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
346 |   s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
347 |   s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
348 | 
349 |   TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
350 | }
351 | 
352 | void haraka512_4x(unsigned char *out, const unsigned char *in) {
353 |   #ifndef CONSTANTSLOADED
354 |   load_constants();
355 |   #endif
356 |   u128 s[4][4], tmp;
357 | 
358 |   s[0][0] = LOAD(in);
359 |   s[0][1] = LOAD(in + 16);
360 |   s[0][2] = LOAD(in + 32);
361 |   s[0][3] = LOAD(in + 48);
362 |   s[1][0] = LOAD(in + 64);
363 |   s[1][1] = LOAD(in + 80);
364 |   s[1][2] = LOAD(in + 96);
365 |   s[1][3] = LOAD(in + 112);
366 |   s[2][0] = LOAD(in + 128);
367 |   s[2][1] = LOAD(in + 144);
368 |   s[2][2] = LOAD(in + 160);
369 |   s[2][3] = LOAD(in + 176);
370 |   s[3][0] = LOAD(in + 192);
371 |   s[3][1] = LOAD(in + 208);
372 |   s[3][2] = LOAD(in + 224);
373 |   s[3][3] = LOAD(in + 240);
374 | 
375 |   AES4_4x(s[0], s[1], s[2], s[3], 0);
376 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
377 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
378 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
379 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
380 | 
381 |   AES4_4x(s[0], s[1], s[2], s[3], 8);
382 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
383 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
384 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
385 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
386 | 
387 |   AES4_4x(s[0], s[1], s[2], s[3], 16);
388 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
389 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
390 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
391 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
392 | 
393 |   AES4_4x(s[0], s[1], s[2], s[3], 24);
394 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
395 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
396 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
397 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
398 | 
399 |   AES4_4x(s[0], s[1], s[2], s[3], 32);
400 |   MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
401 |   MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
402 |   MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
403 |   MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
404 | 
405 | 
406 |   s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
407 |   s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
408 |   s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
409 |   s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
410 |   s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
411 |   s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
412 |   s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
413 |   s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
414 |   s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
415 |   s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
416 |   s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
417 |   s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
418 |   s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
419 |   s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
420 |   s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
421 |   s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));
422 | 
423 |   TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
424 |   TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
425 |   TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
426 |   TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
427 | }
428 | 
// Haraka-512 on eight independent inputs: reads 8 x 64 bytes from `in` and
// writes 8 x 32 bytes to `out`.
//
// Implemented as two 4-way passes; this split is the faster choice on Skylake.
// A single interleaved 8-way pass (all eight states in s[8][4], AES4_8x for
// the AES step, otherwise structured like haraka512_4x) is faster on Haswell.
void haraka512_8x(unsigned char *out, const unsigned char *in) {
  #ifndef CONSTANTSLOADED
  load_constants();
  #endif
  int half;

  // Each half covers four lanes: 4*32 output bytes and 4*64 input bytes.
  for (half = 0; half < 2; half++) {
    haraka512_4x(out + 128 * half, in + 256 * half);
  }
}
565 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/haraka.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Optimized Implementations for Haraka256 and Haraka512
 3 | */
 4 | #ifndef HARAKA_H_
 5 | #define HARAKA_H_
 6 | 
 7 | #include "immintrin.h"
 8 | 
 9 | #define NUMROUNDS 5
10 | 
11 | #define u64 unsigned long
12 | #define u128 __m128i
13 | 
14 | u128 rc[40];
15 | 
16 | #define LOAD(src) _mm_load_si128((u128 *)(src))
17 | #define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src)
18 | 
/* AES step of one Haraka-256 round: two _mm_aesenc_si128 rounds on each of
   the two state blocks, using round constants rc[rci] .. rc[rci + 3].
   All macros in this family expand to plain statement sequences and must be
   used as full statements inside a braced block. `rci` is parenthesized so
   that an expression argument keeps its meaning. */
#define AES2(s0, s1, rci) \
  s0 = _mm_aesenc_si128(s0, rc[(rci)]); \
  s1 = _mm_aesenc_si128(s1, rc[(rci) + 1]); \
  s0 = _mm_aesenc_si128(s0, rc[(rci) + 2]); \
  s1 = _mm_aesenc_si128(s1, rc[(rci) + 3]);

/* AES2 applied to four states; each sN is an array of two blocks. */
#define AES2_4x(s0, s1, s2, s3, rci) \
  AES2(s0[0], s0[1], rci); \
  AES2(s1[0], s1[1], rci); \
  AES2(s2[0], s2[1], rci); \
  AES2(s3[0], s3[1], rci);

/* AES2 applied to eight states. */
#define AES2_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
  AES2_4x(s0, s1, s2, s3, rci); \
  AES2_4x(s4, s5, s6, s7, rci);

/* AES step of one Haraka-512 round: two _mm_aesenc_si128 rounds on each of
   the four state blocks, using round constants rc[rci] .. rc[rci + 7].
   The last line deliberately has no trailing backslash, so the definition
   ends here regardless of what follows it. */
#define AES4(s0, s1, s2, s3, rci) \
  s0 = _mm_aesenc_si128(s0, rc[(rci)]); \
  s1 = _mm_aesenc_si128(s1, rc[(rci) + 1]); \
  s2 = _mm_aesenc_si128(s2, rc[(rci) + 2]); \
  s3 = _mm_aesenc_si128(s3, rc[(rci) + 3]); \
  s0 = _mm_aesenc_si128(s0, rc[(rci) + 4]); \
  s1 = _mm_aesenc_si128(s1, rc[(rci) + 5]); \
  s2 = _mm_aesenc_si128(s2, rc[(rci) + 6]); \
  s3 = _mm_aesenc_si128(s3, rc[(rci) + 7]);

/* AES4 applied to four states; each sN is an array of four blocks. */
#define AES4_4x(s0, s1, s2, s3, rci) \
  AES4(s0[0], s0[1], s0[2], s0[3], rci); \
  AES4(s1[0], s1[1], s1[2], s1[3], rci); \
  AES4(s2[0], s2[1], s2[2], s2[3], rci); \
  AES4(s3[0], s3[1], s3[2], s3[3], rci);

/* AES4 applied to eight states. */
#define AES4_8x(s0, s1, s2, s3, s4, s5, s6, s7, rci) \
  AES4_4x(s0, s1, s2, s3, rci); \
  AES4_4x(s4, s5, s6, s7, rci);
54 | 
/* Mixing step for a two-block state: interleaves the 32-bit words of s0 and
   s1. Afterwards s0 holds the interleaved low halves and s1 the interleaved
   high halves. Requires a u128 variable named `tmp` in the calling scope;
   s0 and s1 are overwritten in place. */
#define MIX2(s0, s1) \
  tmp = _mm_unpacklo_epi32(s0, s1); \
  s1 = _mm_unpackhi_epi32(s0, s1); \
  s0 = tmp;

/* Mixing step for a four-block state: permutes the 32-bit words across
   s0..s3 with two layers of unpack operations. Requires a u128 variable named
   `tmp` in the calling scope. The registers are reused as they are consumed,
   so the statement order below must not be changed. */
#define MIX4(s0, s1, s2, s3) \
  tmp  = _mm_unpacklo_epi32(s0, s1); \
  s0 = _mm_unpackhi_epi32(s0, s1); \
  s1 = _mm_unpacklo_epi32(s2, s3); \
  s2 = _mm_unpackhi_epi32(s2, s3); \
  s3 = _mm_unpacklo_epi32(s0, s2); \
  s0 = _mm_unpackhi_epi32(s0, s2); \
  s2 = _mm_unpackhi_epi32(s1, tmp); \
  s1 = _mm_unpacklo_epi32(s1, tmp);
69 | 
/* Store the 32-byte truncated Haraka-512 output at `out`:
     out[ 0.. 7] = upper 64 bits of s0
     out[ 8..15] = upper 64 bits of s1
     out[16..23] = lower 64 bits of s2
     out[24..31] = lower 64 bits of s3
   Done with unpack + unaligned store. The earlier form
   `*(u64*)(out) = (u64*)(s0)[1]` parsed as a pointer cast of a vector element
   assigned to an integer lvalue, and depended on compiler vector subscripting
   and on `unsigned long` being 64 bits wide. */
#define TRUNCSTORE(out, s0, s1, s2, s3) \
  _mm_storeu_si128((u128 *)(out), _mm_unpackhi_epi64(s0, s1)); \
  _mm_storeu_si128((u128 *)((out) + 16), _mm_unpacklo_epi64(s2, s3));
75 | 
/* Fill the round-constant table rc[]. */
void load_constants();
/* Self-test of haraka512_8x against a built-in test vector; prints an error
   message on mismatch. */
void test_implementations();

/* Haraka-256: 32 input bytes -> 32 output bytes per lane.
   The _4x / _8x variants process 4 / 8 consecutive 32-byte inputs and write
   the outputs consecutively. */
void haraka256(unsigned char *out, const unsigned char *in);
void haraka256_4x(unsigned char *out, const unsigned char *in);
void haraka256_8x(unsigned char *out, const unsigned char *in);

/* Haraka-512: 64 input bytes -> 32 output bytes per lane.
   The _4x / _8x variants process 4 / 8 consecutive 64-byte inputs and write
   the 32-byte outputs consecutively. */
void haraka512(unsigned char *out, const unsigned char *in);
void haraka512_4x(unsigned char *out, const unsigned char *in);
void haraka512_8x(unsigned char *out, const unsigned char *in);
86 | 
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/hash.c:
--------------------------------------------------------------------------------
  1 | #include "params.h"
  2 | #include "permute.h"
  3 | #include "immintrin.h"
  4 | #include "crypto_hash_blake256.h"
  5 | #include "crypto_hash_blake512.h"
  6 | #include "haraka.h"
  7 | 
  8 | #include <stddef.h>
  9 | #include <openssl/sha.h>
 10 | 
 11 | int varlen_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen)
 12 | {
 13 |   //SHA256(in,inlen,out);
 14 |   crypto_hash_blake256(out,in,inlen);
 15 |   return 0;
 16 | }
 17 | 
 18 | int msg_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen)
 19 | {
 20 |   //SHA512(in,inlen,out);
 21 |   crypto_hash_blake512(out,in,inlen);
 22 |   return 0;
 23 | }
 24 | 
 25 | 
// 32-character expansion constant. Nothing in this file references it.
static const char *hashc = "expand 32-byte to 64-byte state!";
 27 | 
 28 | int hash_2n_n(unsigned char *out,const unsigned char *in)
 29 | {
 30 | #if HASH_BYTES != 32
 31 | #error "Current code only supports 32-byte hashes"
 32 | #endif
 33 | 
 34 |   unsigned char x[64];
 35 |   int i;
 36 |   for(i=0;i<64;i++)
 37 |   {
 38 |     x[i] = in[i];
 39 |   }
 40 |   haraka512(out,x);
 41 | 
 42 |   return 0;
 43 | }
 44 | 
 45 | int hash_2n_n_mask(unsigned char *out,const unsigned char *in, const unsigned char *mask)
 46 | {
 47 |   unsigned char buf[2*HASH_BYTES];
 48 |   int i;
 49 |   for(i=0;i<2*HASH_BYTES;i++)
 50 |     buf[i] = in[i] ^ mask[i];
 51 |   return hash_2n_n(out, buf);
 52 | }
 53 | 
 54 | int hash_n_n(unsigned char *out,const unsigned char *in)
 55 | {
 56 | #if HASH_BYTES != 32
 57 | #error "Current code only supports 32-byte hashes"
 58 | #endif
 59 |   unsigned char x[32];
 60 |   int i;
 61 | 
 62 |   for(i=0;i<32;i++)
 63 |   {
 64 |     x[i]    = in[i];
 65 |   }
 66 |   haraka256(out,x);
 67 | 
 68 |   return 0;
 69 | }
 70 | 
 71 | int hash_n_n_mask(unsigned char *out,const unsigned char *in, const unsigned char *mask)
 72 | {
 73 |   unsigned char buf[HASH_BYTES];
 74 |   int i;
 75 |   for(i=0;i<HASH_BYTES;i++)
 76 |     buf[i] = in[i] ^ mask[i];
 77 |   return hash_n_n(out, buf);
 78 | }
 79 | 
 80 | int hash_n_n_mask_8x(unsigned char *out,const unsigned char *in,
 81 |                      const unsigned char *mask)
 82 | {
 83 |   unsigned char x[32*8];
 84 |   int i;
 85 | 
 86 |   __m256i fullmask = _mm256_loadu_si256(mask);
 87 | 
 88 |   for(i=0;i<8;i++)
 89 |   {
 90 |     _mm256_store_si256(x + 32*i, _mm256_xor_si256(_mm256_load_si256(in + 32*i), fullmask));
 91 |   }
 92 | 
 93 |   haraka256_8x(out, x);
 94 | }
 95 | 
 96 | int hash_2n_n_8x(unsigned char *out,const unsigned char *in,
 97 |       unsigned long long out_dist, unsigned long long in_dist)
 98 | {
 99 |   haraka512_8x(out, in);
100 | 
101 | }
// 8-way masked 2n-to-n hash: XORs each of eight consecutive 64-byte inputs
// with the same 64-byte mask and compresses the results with haraka512_8x,
// writing eight consecutive 32-byte digests to `out`. Always returns 0.
//
// out_dist and in_dist are accepted for interface compatibility but do not
// influence the layout: inputs are always read at a 64-byte stride.
//
// Changes from the earlier version:
//  - the scratch buffer has a fixed size of 8*64 bytes. It used to be a
//    variable-length array of in_dist*8 bytes that was nevertheless written at
//    a fixed 64-byte stride, so any in_dist below 64 overflowed it.
//  - the buffer is an array of __m256i and therefore 32-byte aligned; `in`
//    and `mask` are read with unaligned loads.
//  - pointer arguments to the intrinsics are cast to the expected type.
//  - the function now returns a value, as its `int` return type requires.
int hash_2n_n_mask_8x(unsigned char *out,const unsigned char *in,
      unsigned long long out_dist, unsigned long long in_dist,
      const unsigned char *mask)
{
  __m256i x[16];
  int i;

  const __m256i mask_a = _mm256_loadu_si256((const __m256i *)mask);
  const __m256i mask_b = _mm256_loadu_si256((const __m256i *)(mask + 32));

  (void)out_dist;
  (void)in_dist;

  for(i=0;i<8;i++)
  {
    // Two 32-byte halves per 64-byte input.
    x[2*i]     = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)(in + 64*i)), mask_a);
    x[2*i + 1] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)(in + 64*i + 32)), mask_b);
  }
  haraka512_8x(out, (const unsigned char *)x);
  return 0;
}
119 | 
120 | int loop_hash_2n_n_mask_8x(unsigned char *out,const unsigned char *in,
121 |                            unsigned long loops, const unsigned char *mask)
122 | {
123 |   int j;
124 |   for(j=0;j<8*loops;j+=8)
125 |     hash_2n_n_mask_8x(out+(j)*HASH_BYTES, in+(2*j)*HASH_BYTES,
126 |                       HASH_BYTES, 2*HASH_BYTES, mask);
127 | }
128 | 
// 8-way n-to-n hash: hashes eight consecutive 32-byte inputs into eight
// consecutive 32-byte digests with haraka256_8x. Always returns 0 (the
// function previously fell off its end without returning a value).
int hash_n_n_8x(unsigned char *out,const unsigned char *in)
{
  haraka256_8x(out, in);
  return 0;
}
133 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/hash.h:
--------------------------------------------------------------------------------
 1 | #ifndef HASH_H
 2 | #define HASH_H
 3 | 
 4 | #include "params.h"
 5 | 
/* Variable-length hashes (BLAKE-512 and BLAKE-256 in hash.c). */
int msg_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen);
int varlen_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen);
/* Fixed-length hashes: 2n -> n compresses 2*HASH_BYTES to HASH_BYTES, n -> n
   maps HASH_BYTES to HASH_BYTES. The _mask variants XOR the input with `mask`
   before hashing. */
int hash_2n_n(unsigned char *out,const unsigned char *in);
int hash_2n_n_mask(unsigned char *out,const unsigned char *in,const unsigned char *mask);
int hash_n_n(unsigned char *out,const unsigned char *in);
int hash_n_n_mask(unsigned char *out,const unsigned char *in,const unsigned char *mask);

/* 8-way variants working on eight consecutive inputs and outputs.
   NOTE(review): hash.c does not use out_dist / in_dist to compute addresses;
   the layout is fixed at consecutive blocks. */
int hash_2n_n_8x(unsigned char *out,const unsigned char *in, 
      unsigned long long out_dist, unsigned long long in_dist);
int hash_2n_n_mask_8x(unsigned char *out,const unsigned char *in, 
      unsigned long long out_dist, unsigned long long in_dist,
      const unsigned char *mask);
int hash_n_n_8x(unsigned char *out,const unsigned char *in);
/* NOTE(review): no definition of genfullchain8x appears in hash.c -- confirm
   that it is defined elsewhere. */
void genfullchain8x(unsigned char *out, const unsigned char *masks);

int hash_n_n_mask_8x(unsigned char *out,const unsigned char *in, 
                     const unsigned char *mask);

/* Runs hash_2n_n_mask_8x `loops` times over consecutive data with one mask. */
int loop_hash_2n_n_mask_8x(unsigned char *out,const unsigned char *in,
                           unsigned long loops, const unsigned char *mask);                     
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/horst.c:
--------------------------------------------------------------------------------
  1 | #include "params.h"
  2 | #include "horst.h"
  3 | #include "hash.h"
  4 | #include "prg.h"
  5 | #include <stdint.h>
  6 | #include <stdio.h>
  7 | 
  8 | static void expand_seed(unsigned char outseeds[HORST_T*HORST_SKBYTES], const unsigned char inseed[SEED_BYTES])
  9 | {
 10 |   prg(outseeds, HORST_T*HORST_SKBYTES, inseed);
 11 | }
 12 | 
 13 | int horst_sign(unsigned char *sig, unsigned char pk[HASH_BYTES], unsigned long long *sigbytes, 
 14 |                const unsigned char *m, unsigned long long mlen, 
 15 |                const unsigned char seed[SEED_BYTES], 
 16 |                const unsigned char masks[2*HORST_LOGT*HASH_BYTES], 
 17 |                const unsigned char m_hash[MSGHASH_BYTES])
 18 | {
 19 |   unsigned char sk[HORST_T*HORST_SKBYTES];
 20 |   unsigned int idx;
 21 |   int i,j,k;
 22 |   int sigpos = 0;
 23 | 
 24 |   unsigned char tree[(2*HORST_T-1)*HASH_BYTES]; /* replace by something more memory-efficient? */
 25 | 
 26 |   expand_seed(sk, seed);
 27 | 
 28 |   // Build the whole tree and save it
 29 | #if HORST_SKBYTES != HASH_BYTES
 30 | #error "Need to have HORST_SKBYTES == HASH_BYTES"
 31 | #endif
 32 | 
 33 |   // Generate pk leaves non parallel
 34 |   //for(i=0;i<HORST_T;i++)
 35 |   //  hash_n_n(tree+(HORST_T-1+i)*HASH_BYTES, sk+i*HORST_SKBYTES);
 36 | 
 37 |   // Generate pk leaves parallel
 38 |   for(i=0;i<HORST_T;i+=8)
 39 |     hash_n_n_8x(tree+(HORST_T-1+i)*HASH_BYTES, sk+i*HORST_SKBYTES);
 40 | 
 41 |   // Build tree
 42 |   // Hash from level 0 to 1
 43 |   loop_hash_2n_n_mask_8x(tree+(32767)*HASH_BYTES,tree+(65535)*HASH_BYTES,4096,masks);
 44 |   // Hash from level 1 to 2
 45 |   loop_hash_2n_n_mask_8x(tree+(16383)*HASH_BYTES,tree+(32767)*HASH_BYTES,2048,masks+2*HASH_BYTES);
 46 |   // Hash from level 2 to 3
 47 |   loop_hash_2n_n_mask_8x(tree+(8191)*HASH_BYTES,tree+(16383)*HASH_BYTES,1024,masks+4*HASH_BYTES);
 48 |   // Hash from level 3 to 4
 49 |   loop_hash_2n_n_mask_8x(tree+(4095)*HASH_BYTES,tree+(8191)*HASH_BYTES,512,masks+6*HASH_BYTES);
 50 |   // Hash from level 4 to 5
 51 |   loop_hash_2n_n_mask_8x(tree+(2047)*HASH_BYTES,tree+(4095)*HASH_BYTES,256,masks+8*HASH_BYTES);
 52 |   // Hash from level 5 to 6
 53 |   loop_hash_2n_n_mask_8x(tree+(1023)*HASH_BYTES,tree+(2047)*HASH_BYTES,128,masks+10*HASH_BYTES);
 54 |   // Hash from level 6 to 7
 55 |   loop_hash_2n_n_mask_8x(tree+(511)*HASH_BYTES,tree+(1023)*HASH_BYTES,64,masks+12*HASH_BYTES);
 56 |   // Hash from level 7 to 8
 57 |   loop_hash_2n_n_mask_8x(tree+(255)*HASH_BYTES,tree+(511)*HASH_BYTES,32,masks+14*HASH_BYTES);
 58 |   // Hash from level 8 to 9
 59 |   loop_hash_2n_n_mask_8x(tree+(127)*HASH_BYTES,tree+(255)*HASH_BYTES,16,masks+16*HASH_BYTES);
 60 |   // Hash from level 9 to 10
 61 |   loop_hash_2n_n_mask_8x(tree+(63)*HASH_BYTES,tree+(127)*HASH_BYTES,8,masks+18*HASH_BYTES);
 62 |   // Hash from level 10 to 11
 63 |   loop_hash_2n_n_mask_8x(tree+(31)*HASH_BYTES,tree+(63)*HASH_BYTES,4,masks+20*HASH_BYTES);
 64 |   // Hash from level 11 to 12
 65 |   loop_hash_2n_n_mask_8x(tree+(15)*HASH_BYTES,tree+(31)*HASH_BYTES,2,masks+22*HASH_BYTES);
 66 |   // Hash from level 12 to 13
 67 |   loop_hash_2n_n_mask_8x(tree+(7)*HASH_BYTES,tree+(15)*HASH_BYTES,1,masks+24*HASH_BYTES);
 68 |   // Hash from level 13 to 14
 69 |   for(j=0;j<4;j++)
 70 |     hash_2n_n_mask(tree+(3+j)*HASH_BYTES,tree+(7+2*j)*HASH_BYTES,masks+26*HASH_BYTES);
 71 |   // Hash from level 14 to 15
 72 |   for(j=0;j<2;j++)
 73 |       hash_2n_n_mask(tree+(1+j)*HASH_BYTES,tree+(3+2*j)*HASH_BYTES,masks+28*HASH_BYTES);
 74 |   // Hash from level 15 to 16
 75 |   hash_2n_n_mask(tree+0*HASH_BYTES,tree+1*HASH_BYTES,masks+30*HASH_BYTES);
 76 | 
 77 | 
 78 | #if HORST_K != (MSGHASH_BYTES/2)
 79 | #error "Need to have HORST_K == (MSGHASH_BYTES/2)"
 80 | #endif
 81 | 
 82 |   // First write 64 hashes from level 10 to the signature
 83 |   for(j=63*HASH_BYTES;j<127*HASH_BYTES;j++)
 84 |     sig[sigpos++] = tree[j];
 85 | 
 86 |   // Signature consists of HORST_K parts; each part of secret key and HORST_LOGT-4 auth-path hashes
 87 |   for(i=0;i<HORST_K;i++)
 88 |   {
 89 |     idx = m_hash[2*i] + (m_hash[2*i+1]<<8);
 90 | 
 91 |     for(k=0;k<HORST_SKBYTES;k++)
 92 |       sig[sigpos++] = sk[idx*HORST_SKBYTES+k];
 93 | 
 94 |     idx += (HORST_T-1);
 95 |     for(j=0;j<HORST_LOGT-6;j++)
 96 |     {
 97 |       idx = (idx&1)?idx+1:idx-1; // neighbor node
 98 |       for(k=0;k<HASH_BYTES;k++)
 99 |         sig[sigpos++] = tree[idx*HASH_BYTES+k];
100 |       idx = (idx-1)/2; // parent node
101 |     }
102 |   }
103 |  
104 |   for(i=0;i<HASH_BYTES;i++)
105 |     pk[i] = tree[i];
106 |   
107 |   *sigbytes = HORST_SIGBYTES;
108 |   return 0;
109 | }
110 | 
111 | int horst_verify(unsigned char *pk, const unsigned char *sig, const unsigned char *m, unsigned long long mlen, const unsigned char masks[2*HORST_LOGT*HASH_BYTES], const unsigned char m_hash[MSGHASH_BYTES])
112 | {
113 |   unsigned char buffer[32*HASH_BYTES];
114 |   const unsigned char *level10;
115 |   unsigned int idx;
116 |   int i,j,k;
117 | 
118 | #if HORST_K != (MSGHASH_BYTES/2)
119 | #error "Need to have HORST_K == (MSGHASH_BYTES/2)"
120 | #endif
121 | 
122 |   level10 = sig;
123 |   sig+=64*HASH_BYTES;
124 | 
125 |   for(i=0;i<HORST_K;i++)
126 |   {
127 |     idx = m_hash[2*i] + (m_hash[2*i+1]<<8);
128 | 
129 | #if HORST_SKBYTES != HASH_BYTES
130 | #error "Need to have HORST_SKBYTES == HASH_BYTES"
131 | #endif
132 | 
133 |     if(!(idx&1))
134 |     {
135 |       hash_n_n(buffer,sig);
136 |       for(k=0;k<HASH_BYTES;k++)
137 |         buffer[HASH_BYTES+k] = sig[HORST_SKBYTES+k];
138 |     }
139 |     else
140 |     {
141 |       hash_n_n(buffer+HASH_BYTES,sig);
142 |       for(k=0;k<HASH_BYTES;k++)
143 |         buffer[k] = sig[HORST_SKBYTES+k];
144 |     }
145 |     sig += HORST_SKBYTES+HASH_BYTES;
146 | 
147 |     for(j=1;j<HORST_LOGT-6;j++)
148 |     {
149 |       idx = idx>>1; // parent node
150 | 
151 |       if(!(idx&1))
152 |       {
153 |         hash_2n_n_mask(buffer,buffer,masks+2*(j-1)*HASH_BYTES);
154 |         for(k=0;k<HASH_BYTES;k++)
155 |           buffer[HASH_BYTES+k] = sig[k];
156 |       }
157 |       else
158 |       {
159 |         hash_2n_n_mask(buffer+HASH_BYTES,buffer,masks+2*(j-1)*HASH_BYTES);
160 |         for(k=0;k<HASH_BYTES;k++)
161 |           buffer[k] = sig[k];
162 |       }
163 |       sig += HASH_BYTES;
164 |     }
165 | 
166 |     idx = idx>>1; // parent node
167 |     hash_2n_n_mask(buffer,buffer,masks+2*(HORST_LOGT-7)*HASH_BYTES);
168 | 
169 |     for(k=0;k<HASH_BYTES;k++)
170 |       if(level10[idx*HASH_BYTES+k] != buffer[k]) 
171 |         goto fail;
172 |   }
173 | 
174 |   // Compute root from level10
175 |   for(j=0;j<32;j++)
176 |     hash_2n_n_mask(buffer+j*HASH_BYTES, level10+2*j*HASH_BYTES, masks+2*(HORST_LOGT-6)*HASH_BYTES);
177 |   // Hash from level 11 to 12
178 |   for(j=0;j<16;j++)
179 |     hash_2n_n_mask(buffer+j*HASH_BYTES,buffer+2*j*HASH_BYTES,masks+2*(HORST_LOGT-5)*HASH_BYTES);
180 |   // Hash from level 12 to 13
181 |   for(j=0;j<8;j++)
182 |     hash_2n_n_mask(buffer+j*HASH_BYTES,buffer+2*j*HASH_BYTES,masks+2*(HORST_LOGT-4)*HASH_BYTES);
183 |   // Hash from level 13 to 14
184 |   for(j=0;j<4;j++)
185 |     hash_2n_n_mask(buffer+j*HASH_BYTES,buffer+2*j*HASH_BYTES,masks+2*(HORST_LOGT-3)*HASH_BYTES);
186 |   // Hash from level 14 to 15
187 |   for(j=0;j<2;j++)
188 |     hash_2n_n_mask(buffer+j*HASH_BYTES,buffer+2*j*HASH_BYTES,masks+2*(HORST_LOGT-2)*HASH_BYTES);
189 |   // Hash from level 15 to 16
190 |   hash_2n_n_mask(pk, buffer, masks+2*(HORST_LOGT-1)*HASH_BYTES);
191 | 
192 |   return 0;
193 | 
194 | 
195 | fail:
196 |   for(k=0;k<HASH_BYTES;k++)
197 |     pk[k] = 0;
198 |   return -1;
199 | }
200 | 
201 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/horst.h:
--------------------------------------------------------------------------------
 1 | #ifndef HORST_H
 2 | #define HORST_H
 3 | 
 4 | #include "params.h"
 5 | 
 6 | int horst_sign(unsigned char *sig, unsigned char pk[HASH_BYTES], unsigned long long *sigbytes, const unsigned char *m, unsigned long long mlen, const unsigned char seed[SEED_BYTES], const unsigned char masks[2*HORST_LOGT*HASH_BYTES], const unsigned char m_hash[MSGHASH_BYTES]);
 7 | 
 8 | int horst_verify(unsigned char *pk, const unsigned char *sig, const unsigned char *m, unsigned long long mlen, const unsigned char masks[2*HORST_LOGT*HASH_BYTES], const unsigned char m_hash[MSGHASH_BYTES]);
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/horst.log:
--------------------------------------------------------------------------------
 1 | ==24131== Memcheck, a memory error detector
 2 | ==24131== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
 3 | ==24131== Using Valgrind-3.9.0 and LibVEX; rerun with -h for copyright info
 4 | ==24131== Command: ./test/test_horst
 5 | ==24131== Parent PID: 12092
 6 | ==24131== 
 7 | ==24131== Conditional jump or move depends on uninitialised value(s)
 8 | ==24131==    at 0x40177C1: index (strchr.S:40)
 9 | ==24131==    by 0x400740D: expand_dynamic_string_token (dl-load.c:425)
10 | ==24131==    by 0x400759B: fillin_rpath (dl-load.c:495)
11 | ==24131==    by 0x4007D29: _dl_init_paths (dl-load.c:872)
12 | ==24131==    by 0x4002BC9: dl_main (rtld.c:1349)
13 | ==24131==    by 0x4015334: _dl_sysdep_start (dl-sysdep.c:249)
14 | ==24131==    by 0x4004A35: _dl_start (rtld.c:332)
15 | ==24131==    by 0x4001197: ??? (in /lib/x86_64-linux-gnu/ld-2.19.so)
16 | ==24131== 
17 | ==24131== Conditional jump or move depends on uninitialised value(s)
18 | ==24131==    at 0x4017834: index (strchr.S:77)
19 | ==24131==    by 0x400740D: expand_dynamic_string_token (dl-load.c:425)
20 | ==24131==    by 0x400803D: _dl_map_object (dl-load.c:2538)
21 | ==24131==    by 0x400137D: map_doit (rtld.c:627)
22 | ==24131==    by 0x400E8E3: _dl_catch_error (dl-error.c:187)
23 | ==24131==    by 0x4000B2E: do_preload (rtld.c:816)
24 | ==24131==    by 0x4004147: dl_main (rtld.c:1635)
25 | ==24131==    by 0x4015334: _dl_sysdep_start (dl-sysdep.c:249)
26 | ==24131==    by 0x4004A35: _dl_start (rtld.c:332)
27 | ==24131==    by 0x4001197: ??? (in /lib/x86_64-linux-gnu/ld-2.19.so)
28 | ==24131== 
29 | ==24131== 
30 | ==24131== HEAP SUMMARY:
31 | ==24131==     in use at exit: 0 bytes in 0 blocks
32 | ==24131==   total heap usage: 0 allocs, 0 frees, 0 bytes allocated
33 | ==24131== 
34 | ==24131== All heap blocks were freed -- no leaks are possible
35 | ==24131== 
36 | ==24131== For counts of detected and suppressed errors, rerun with: -v
37 | ==24131== Use --track-origins=yes to see where uninitialised values come from
38 | ==24131== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 0 from 0)
39 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/implementors:
--------------------------------------------------------------------------------
1 | Daniel J. Bernstein
2 | Daira Hopwood
3 | Andreas Hülsing
4 | Tanja Lange
5 | Ruben Niederhagen
6 | Louiza Papachristodoulou
7 | Peter Schwabe
8 | Zooko Wilcox O'Hearn
9 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/params.h:
--------------------------------------------------------------------------------
 1 | #define SUBTREE_HEIGHT 5
 2 | #define TOTALTREE_HEIGHT 60
 3 | #define N_LEVELS (TOTALTREE_HEIGHT/SUBTREE_HEIGHT)
 4 | #define SEED_BYTES 32
 5 | #define WOTS_LOGW 4
 6 | 
 7 | #define SK_RAND_SEED_BYTES 32
 8 | #define MESSAGE_HASH_SEED_BYTES 32
 9 | 
10 | #define HORST_LOGT 16
11 | #define HORST_T (1<<HORST_LOGT)
12 | #define HORST_K 32
13 | #define HORST_SKBYTES 32
14 | #define HORST_SIGBYTES (64*HASH_BYTES+(((HORST_LOGT-6)*HASH_BYTES)+HORST_SKBYTES)*HORST_K)
15 | 
16 | #define WOTS_W (1 << WOTS_LOGW)
17 | #define WOTS_L1 ((256+WOTS_LOGW-1)/WOTS_LOGW)
18 | //#define WOTS_L 133  // for WOTS_W == 4
19 | //#define WOTS_L 90  // for WOTS_W == 8
20 | #define WOTS_L 67  // for WOTS_W == 16
21 | #define WOTS_LOG_L 7  // for WOTS_W == 16
22 | #define WOTS_SIGBYTES (WOTS_L*HASH_BYTES)
23 | 
24 | #define HASH_BYTES 32 // Has to be log(HORST_T)*HORST_K/8
25 | #define MSGHASH_BYTES 64
26 | 
27 | #define N_MASKS (2*(HORST_LOGT)) /* has to be the max of  (2*(SUBTREE_HEIGHT+WOTS_LOGL)) and (WOTS_W-1) and 2*HORST_LOGT */
28 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/permute.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include "wmmintrin.h"
 3 | #include "emmintrin.h"
 4 | #include "smmintrin.h"
 5 | 
 6 | #define CHACHA_ROUNDS 12
 7 | 
 8 | #define U32V(x) \
 9 |   ((x) & 0xffffffff)
10 | 
11 | #define ROTL32(x,c) \
12 |   ((((x) << c) | ((x) >> (32-c))) & 0xffffffff)
13 | 
14 | #define ROTATE(v,c) (ROTL32(v,c))
15 | #define XOR(v,w) ((v) ^ (w))
16 | #define PLUS(v,w) (U32V((v) + (w)))
17 | #define PLUSONE(v) (PLUS((v),1))
18 | 
19 | #define QUARTERROUND(a,b,c,d) \
20 |   x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
21 |   x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
22 |   x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
23 |   x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
24 | 
25 | 
26 | void chacha_permute(unsigned char out[64],const unsigned char in [64])
27 | {
28 |   uint32_t x[16];
29 |   int i;
30 | 
31 |   for (i = 0;i < 16;i++)
32 |   {
33 |     x[i] = in[4*i+3];
34 |     x[i] <<= 8;
35 |     x[i] |= in[4*i+2];
36 |     x[i] <<= 8;
37 |     x[i] |= in[4*i+1];
38 |     x[i] <<= 8;
39 |     x[i] |= in[4*i+0];
40 |   }
41 | 
42 |   for (i = CHACHA_ROUNDS;i > 0;i -= 2)
43 |   {
44 |     QUARTERROUND( 0, 4, 8,12)
45 |     QUARTERROUND( 1, 5, 9,13)
46 |     QUARTERROUND( 2, 6,10,14)
47 |     QUARTERROUND( 3, 7,11,15)
48 |     QUARTERROUND( 0, 5,10,15)
49 |     QUARTERROUND( 1, 6,11,12)
50 |     QUARTERROUND( 2, 7, 8,13)
51 |     QUARTERROUND( 3, 4, 9,14)
52 |   }
53 | 
54 | //  for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]); // XXX: Bad idea if we later xor the input to the state?
55 |   for (i = 0;i < 16;++i)
56 |   {
57 |     out[4*i]   =  x[i] & 0xff;
58 |     out[4*i+1] = (x[i] >>  8) & 0xff;
59 |     out[4*i+2] = (x[i] >> 16) & 0xff;
60 |     out[4*i+3] = (x[i] >> 24) & 0xff;
61 |   }
62 | }
63 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/permute.h:
--------------------------------------------------------------------------------
 1 | #ifndef PERMUTE_H
 2 | #define PERMUTE_H
 3 | 
 4 | void chacha_permute(unsigned char output[64],const unsigned char input [64]);
 5 | 
 6 | void load_rc();
 7 | void haraka512256(unsigned char out[32], const unsigned char in[64]);
 8 | void haraka256256(unsigned char out[32], const unsigned char in[32]);
 9 | void haraka512256_8x(unsigned char out[32*8], const unsigned char in[64*8]);
10 | void haraka256256_8x(unsigned char out[32*8], const unsigned char in[32*8]);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/prg.c:
--------------------------------------------------------------------------------
 1 | #include "crypto_stream_chacha12.h"
 2 | #include "params.h"
 3 | #include "prg.h"
 4 | 
 5 | static unsigned char nonce[crypto_stream_chacha12_NONCEBYTES] = {0};
 6 | 
 7 | #if crypto_stream_chacha12_KEYBYTES != SEED_BYTES
 8 |   #error "SEED_BYTES needs to match CRYPTO_STREAM_KEYBYTES for this implementation"
 9 | #endif
10 | 
11 | void prg(unsigned char *r, unsigned long long rlen, const unsigned char key[SEED_BYTES])
12 | {
13 |   crypto_stream_chacha12(r,rlen,nonce,key);
14 | }
15 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/prg.h:
--------------------------------------------------------------------------------
1 | #ifndef PRG_H
2 | #define PRG_H
3 | 
4 | #include "params.h"
5 | 
6 | void prg(unsigned char *r, unsigned long long rlen, const unsigned char key[SEED_BYTES]);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/settings.h:
--------------------------------------------------------------------------------
 1 | #ifndef SETTINGS_H
 2 | #define SETTINGS_H
 3 | 
 4 | #ifndef MPAR
 5 | #define MPAR (1)
 6 | #endif
 7 | 
 8 | #ifndef ROUNDS
 9 | #define ROUNDS (4)
10 | #endif
11 | 
12 | #ifndef AES_PER_ROUND
13 | #define AES_PER_ROUND (2)
14 | #endif
15 | 
16 | #ifndef MIX_PER_ROUND
17 | #define MIX_PER_ROUND (1)
18 | #endif
19 | 
20 | #define DEBUG (0)
21 | 
22 | #define MIX_METHOD (0)	// 0 : blend
23 | 						// 1 : shuffle + xor
24 | 						// 2 : AESQ method
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/sign.c:
--------------------------------------------------------------------------------
  1 | #include "crypto_sign.h"
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | 
  5 | #include "api.h"
  6 | #include "randombytes.h"
  7 | #include "zerobytes.h"
  8 | #include "params.h"
  9 | #include "wots.h"
 10 | #include "horst.h"
 11 | #include "hash.h"
 12 | #include "crypto_hash_blake512.h"
 13 | #include "permute.h"
 14 | 
 15 | #define BIGINT_BYTES ((TOTALTREE_HEIGHT-SUBTREE_HEIGHT+7)/8)
 16 | 
 17 | #if (TOTALTREE_HEIGHT-SUBTREE_HEIGHT) > 64
 18 | #error "TOTALTREE_HEIGHT-SUBTREE_HEIGHT must be at most 64"
 19 | #endif
 20 | 
 21 | typedef struct{
 22 |   int level;
 23 |   unsigned long long subtree;
 24 |   int subleaf;
 25 | } leafaddr;
 26 | 
 27 | 
 28 | static void get_seed(unsigned char seed[SEED_BYTES], const unsigned char *sk, const leafaddr *a)
 29 | {
 30 | #if (N_LEVELS > 15) && (N_LEVELS < 8)
 31 | #error "Need to have 8 <= N_LEVELS <= 15"
 32 | #endif
 33 | 
 34 | #if SUBTREE_HEIGHT != 5
 35 | #error "Need to have SUBTREE_HEIGHT == 5"
 36 | #endif
 37 | 
 38 | #if TOTALTREE_HEIGHT != 60
 39 | #error "Need to have TOTALTREE_HEIGHT == 60"
 40 | #endif
 41 |   unsigned char buffer[SEED_BYTES+8];
 42 |   unsigned long long t;
 43 |   int i;
 44 | 
 45 |   for(i=0;i<SEED_BYTES;i++)
 46 |     buffer[i] = sk[i];
 47 | 
 48 |   //4 bits to encode level
 49 |   t  = a->level;
 50 |   //55 bits to encode subtree
 51 |   t |= a->subtree << 4;
 52 |   //5 bits to encode leaf
 53 |   t |= (unsigned long long)a->subleaf << 59;
 54 | 
 55 |   for(i=0;i<8;i++)
 56 |     buffer[SEED_BYTES+i] = (t >> 8*i) & 0xff;
 57 | 
 58 | #if SEED_BYTES != HASH_BYTES
 59 | #error "Need to have SEED_BYTES == HASH_BYTES"
 60 | #endif
 61 |   varlen_hash(seed,buffer,SEED_BYTES+8);
 62 | }
 63 | 
 64 | 
 65 | /*static void l_tree(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks)
 66 | {
 67 |   int l = WOTS_L;
 68 |   int i,j = 0;
 69 |   for(i=0;i<WOTS_LOG_L;i++)
 70 |   {
 71 |     for(j=0 ;j < (l>>1);j++)
 72 |       hash_2n_n_mask(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, masks+i*2*HASH_BYTES);
 73 | 
 74 |     if(l&1)
 75 |     {
 76 |       memcpy(wots_pk+(l>>1)*HASH_BYTES,wots_pk+(l-1)*HASH_BYTES, HASH_BYTES);
 77 |       l=(l>>1)+1;
 78 |     }
 79 |     else
 80 |       l=(l>>1);
 81 |   }
 82 |   memcpy(leaf,wots_pk,HASH_BYTES);
 83 | }
 84 | */
 85 | static void l_tree(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks)
 86 | {
 87 |   int l = WOTS_L;
 88 |   int i,j = 0;
 89 |   for(i=0;i<WOTS_LOG_L;i++)
 90 |   {
 91 |     j = 0;
 92 |     for(;j+8 < (l>>1);j+=8)
 93 |       hash_2n_n_mask_8x(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, HASH_BYTES, 2*HASH_BYTES, masks+i*2*HASH_BYTES);
 94 | 
 95 |     for(;j < (l>>1);j++)
 96 |       hash_2n_n_mask(wots_pk+j*HASH_BYTES,wots_pk+j*2*HASH_BYTES, masks+i*2*HASH_BYTES);
 97 | 
 98 |     if(l&1)
 99 |     {
100 |       memcpy(wots_pk+(l>>1)*HASH_BYTES,wots_pk+(l-1)*HASH_BYTES, HASH_BYTES);
101 |       l=(l>>1)+1;
102 |     }
103 |     else
104 |       l=(l>>1);
105 |   }
106 |   memcpy(leaf,wots_pk,HASH_BYTES);
107 | }
108 | 
109 | static void l_tree_8x(unsigned char *leaf, unsigned char *wots_pk, const unsigned char *masks)
110 | {
111 |   int l = WOTS_L;
112 |   int i,j,k = 0;
113 | 
114 |   for(i=0;i<WOTS_LOG_L;i++)
115 |   {
116 |     for(j = 0;j < (l>>1);j++)
117 |       hash_2n_n_mask_8x(wots_pk+j*HASH_BYTES, wots_pk+j*2*HASH_BYTES,
118 |           WOTS_L*HASH_BYTES, WOTS_L*HASH_BYTES,
119 |           masks+i*2*HASH_BYTES);
120 |     if(l&1)
121 |     {
122 |       for (k = 0; k < 8; k++)
123 |         memcpy(wots_pk+(l>>1)*HASH_BYTES + k*WOTS_L*HASH_BYTES,
124 |             wots_pk+(l -1)*HASH_BYTES + k*WOTS_L*HASH_BYTES, HASH_BYTES);
125 |       l=(l>>1)+1;
126 |     } else {
127 |       l=(l>>1);
128 |     }
129 |   }
130 |   for (k = 0; k < 8; k++)
131 |     memcpy(leaf + k*HASH_BYTES, wots_pk + k*WOTS_L*HASH_BYTES, HASH_BYTES);
132 | }
133 | 
134 | static void gen_leaf_wots(unsigned char leaf[HASH_BYTES], const unsigned char *masks, const unsigned char *sk, const leafaddr *a)
135 | {
136 |   unsigned char seed[SEED_BYTES];
137 |   unsigned char pk[WOTS_L*HASH_BYTES];
138 | 
139 |   get_seed(seed, sk, a);
140 |   wots_pkgen(pk, seed, masks);
141 | 
142 |   l_tree(leaf, pk, masks);
143 | }
144 | 
145 | 
146 | static void treehash(unsigned char *node, int height, const unsigned char *sk, const leafaddr *leaf, const unsigned char *masks)
147 | {
148 | 
149 |   leafaddr a = *leaf;
150 |   int lastnode,i;
151 |   unsigned char stack[(height+1)*HASH_BYTES];
152 |   unsigned int  stacklevels[height+1];
153 |   unsigned int  stackoffset=0;
154 |   unsigned int maskoffset =0;
155 | 
156 |   lastnode = a.subleaf+(1<<height);
157 | 
158 |   for(;a.subleaf<lastnode;a.subleaf++)
159 |   {
160 |     gen_leaf_wots(stack+stackoffset*HASH_BYTES,masks,sk,&a);
161 |     stacklevels[stackoffset] = 0;
162 |     stackoffset++;
163 |     while(stackoffset>1 && stacklevels[stackoffset-1] == stacklevels[stackoffset-2])
164 |     {
165 |       //MASKS
166 |       maskoffset = 2*(stacklevels[stackoffset-1] + WOTS_LOG_L)*HASH_BYTES;
167 |       hash_2n_n_mask(stack+(stackoffset-2)*HASH_BYTES,stack+(stackoffset-2)*HASH_BYTES,
168 |           masks+maskoffset);
169 |       stacklevels[stackoffset-2]++;
170 |       stackoffset--;
171 |     }
172 |   }
173 |   for(i=0;i<HASH_BYTES;i++)
174 |     node[i] = stack[i];
175 | }
176 | 
177 | 
178 | static void validate_authpath(unsigned char root[HASH_BYTES], const unsigned char leaf[HASH_BYTES], unsigned int leafidx, const unsigned char *authpath, const unsigned char *masks, unsigned int height)
179 | {
180 |   int i,j;
181 |   unsigned char buffer[2*HASH_BYTES];
182 | 
183 |   if(leafidx&1)
184 |   {
185 |     for(j=0;j<HASH_BYTES;j++)
186 |       buffer[HASH_BYTES+j] = leaf[j];
187 |     for(j=0;j<HASH_BYTES;j++)
188 |       buffer[j] = authpath[j];
189 |   }
190 |   else
191 |   {
192 |     for(j=0;j<HASH_BYTES;j++)
193 |       buffer[j] = leaf[j];
194 |     for(j=0;j<HASH_BYTES;j++)
195 |       buffer[HASH_BYTES+j] = authpath[j];
196 |   }
197 |   authpath += HASH_BYTES;
198 | 
199 |   for(i=0;i<height-1;i++)
200 |   {
201 |     leafidx >>= 1;
202 |     if(leafidx&1)
203 |     {
204 |       hash_2n_n_mask(buffer+HASH_BYTES,buffer,masks+2*(WOTS_LOG_L+i)*HASH_BYTES);
205 |       for(j=0;j<HASH_BYTES;j++)
206 |         buffer[j] = authpath[j];
207 |     }
208 |     else
209 |     {
210 |       hash_2n_n_mask(buffer,buffer,masks+2*(WOTS_LOG_L+i)*HASH_BYTES);
211 |       for(j=0;j<HASH_BYTES;j++)
212 |         buffer[j+HASH_BYTES] = authpath[j];
213 |     }
214 |     authpath += HASH_BYTES;
215 |   }
216 |   hash_2n_n_mask(root,buffer,masks+2*(WOTS_LOG_L+height-1)*HASH_BYTES);
217 | }
218 | 
219 | 
220 | static void compute_authpath_wots(unsigned char root[HASH_BYTES], unsigned char *authpath, const leafaddr *a, const unsigned char *sk, const unsigned char *masks, unsigned int height)
221 | {
222 |   int i, idx, j;
223 |   leafaddr ta = *a;
224 | 
225 |   unsigned char tree[2*(1<<SUBTREE_HEIGHT)*HASH_BYTES];
226 |   unsigned char seed[(1<<SUBTREE_HEIGHT)*SEED_BYTES];
227 |   unsigned char pk[(1<<SUBTREE_HEIGHT)*WOTS_L*HASH_BYTES];
228 | 
229 |   // level 0
230 |   for(ta.subleaf = 0; ta.subleaf < (1<<SUBTREE_HEIGHT); ta.subleaf++)
231 |     get_seed(seed + ta.subleaf * SEED_BYTES, sk, &ta);
232 | 
233 |   for(ta.subleaf = 0; ta.subleaf < (1<<SUBTREE_HEIGHT); ta.subleaf++)
234 |     wots_pkgen(pk + ta.subleaf * WOTS_L*HASH_BYTES, seed + ta.subleaf * SEED_BYTES, masks);
235 | 
236 |   for(ta.subleaf = 0; ta.subleaf < (1<<SUBTREE_HEIGHT); ta.subleaf++)
237 |     l_tree(tree + (1<<SUBTREE_HEIGHT)*HASH_BYTES + ta.subleaf * HASH_BYTES,
238 |         pk  + ta.subleaf * WOTS_L*HASH_BYTES, masks);
239 | 
240 |   int level = 0;
241 | 
242 |   // tree
243 |   for (i = (1<<SUBTREE_HEIGHT); i > 0; i>>=1)
244 |   {
245 |     for (j = 0; j < i; j+=2)
246 |       hash_2n_n_mask(tree + (i>>1)*HASH_BYTES + (j>>1) * HASH_BYTES,
247 |           tree + i*HASH_BYTES + j * HASH_BYTES,
248 |           masks+2*(WOTS_LOG_L + level)*HASH_BYTES);
249 | 
250 |     level++;
251 |   }
252 | 
253 | 
254 |   idx = a->subleaf;
255 | 
256 |   // copy authpath
257 |   for(i=0;i<height;i++)
258 |     memcpy(authpath + i*HASH_BYTES, tree + ((1<<SUBTREE_HEIGHT)>>i)*HASH_BYTES + ((idx >> i) ^ 1) * HASH_BYTES, HASH_BYTES);
259 | 
260 |   // copy root
261 |   memcpy(root, tree+HASH_BYTES, HASH_BYTES);
262 | }
263 | 
264 | 
265 | /*
266 |  * Format pk: [|N_MASKS*HASH_BYTES| Bitmasks || root]
267 |  */
268 | int crypto_sign_keypair(unsigned char *pk, unsigned char *sk)
269 | {
270 |   leafaddr a;
271 | 
272 |   randombytes(sk,CRYPTO_SECRETKEYBYTES);
273 |   memcpy(pk,sk+SEED_BYTES,N_MASKS*HASH_BYTES);
274 | 
275 |   // Initialization of top-subtree address
276 |   a.level   = N_LEVELS - 1;
277 |   a.subtree = 0;
278 |   a.subleaf = 0;
279 | 
280 |   // Construct top subtree
281 |   treehash(pk+(N_MASKS*HASH_BYTES), SUBTREE_HEIGHT, sk, &a, pk);
282 |   return 0;
283 | }
284 | 
285 | 
/*
 * Sign message m (mlen bytes) with secret key sk and write the signed
 * message to sm; *smlen receives its length.
 *
 * Signature bytes written to sm, in order:
 *   R (MESSAGE_HASH_SEED_BYTES) || leafidx ((TOTALTREE_HEIGHT+7)/8 bytes,
 *   little-endian) || HORST signature || N_LEVELS x (WOTS signature ||
 *   SUBTREE_HEIGHT authentication-path nodes)
 * The message itself is placed at sm + CRYPTO_BYTES by the first copy loop.
 * NOTE(review): this layout only lines up if CRYPTO_BYTES (api.h) equals the
 * sum of the signature parts above -- confirm.
 *
 * Secret-key layout as used here: the first SEED_BYTES are the key handed to
 * get_seed(), the bitmasks follow at sk+SEED_BYTES, and the last
 * SK_RAND_SEED_BYTES are the secret seed for the deterministic randomness.
 *
 * The statement order below matters: the tail of sm is used as scratch space
 * and m may overlap sm.
 *
 * Returns 0.
 */
int crypto_sign(unsigned char *sm,unsigned long long *smlen, const unsigned char *m,unsigned long long mlen, const unsigned char *sk)
{
  leafaddr a;
  unsigned long long i;
  unsigned long long leafidx;
  unsigned char R[MESSAGE_HASH_SEED_BYTES];
  unsigned char m_h[MSGHASH_BYTES];
  unsigned long long rnd[8];
  unsigned long long horst_sigbytes;
  unsigned char root[HASH_BYTES];
  unsigned char seed[SEED_BYTES];
  unsigned char masks[N_MASKS*HASH_BYTES];
  unsigned char *pk;
  unsigned char tsk[CRYPTO_SECRETKEYBYTES];

  // Work on a private copy of the secret key; it is wiped before returning
  for(i=0;i<CRYPTO_SECRETKEYBYTES;i++)
    tsk[i] = sk[i];

  // create leafidx deterministically
  {
    // shift scratch upwards so we can reuse msg later
    // (scratch + SK_RAND_SEED_BYTES == sm + CRYPTO_BYTES, the final message position)
    unsigned char* scratch = sm + CRYPTO_BYTES - SK_RAND_SEED_BYTES;

    // Copy message to scratch backwards to handle m = sm overlap
    for(i=mlen;i>0;i--)
      scratch[SK_RAND_SEED_BYTES+i-1] = m[i-1];
    // Copy secret random seed to scratch
    memcpy(scratch, tsk + CRYPTO_SECRETKEYBYTES - SK_RAND_SEED_BYTES, SK_RAND_SEED_BYTES);

    // rnd = BLAKE-512(secret random seed || message)
    crypto_hash_blake512((unsigned char*)rnd, scratch, SK_RAND_SEED_BYTES + mlen); //XXX: Why Blake 512?

    // wipe sk
    zerobytes(scratch,SK_RAND_SEED_BYTES);

#if TOTALTREE_HEIGHT != 60
#error "Implemented for TOTALTREE_HEIGHT == 60!"
#endif

    // keep the low 60 bits of the first word as the leaf index
    leafidx = rnd[0] & 0xfffffffffffffff;

#if MESSAGE_HASH_SEED_BYTES != 32
#error "Implemented for MESSAGE_HASH_SEED_BYTES == 32!"
#endif
    // R is taken from the hash output starting at its third 64-bit word
    memcpy(R, &rnd[2], MESSAGE_HASH_SEED_BYTES);

    // prepare msg_hash: lay out R || pk directly in front of the message
    scratch = sm + CRYPTO_BYTES - MESSAGE_HASH_SEED_BYTES - CRYPTO_PUBLICKEYBYTES;

    // cpy R
    memcpy(scratch, R, MESSAGE_HASH_SEED_BYTES);

    // construct and cpy pk
    // (this inner `a` shadows the function-level `a`, which is set further down)
    leafaddr a;
    a.level = N_LEVELS - 1;
    a.subtree = 0;
    a.subleaf=0;

    pk = scratch + MESSAGE_HASH_SEED_BYTES;

    memcpy(pk, tsk+SEED_BYTES, N_MASKS*HASH_BYTES);

    // recompute the root of the top subtree right behind the masks
    treehash(pk+(N_MASKS*HASH_BYTES), SUBTREE_HEIGHT, tsk, &a, pk);

    // message already on the right spot

    msg_hash(m_h, scratch, mlen + MESSAGE_HASH_SEED_BYTES + CRYPTO_PUBLICKEYBYTES);
  }

  a.level   = N_LEVELS; // Use unique value $d$ for HORST address.
  a.subleaf = leafidx & ((1<<SUBTREE_HEIGHT)-1);
  a.subtree = leafidx >> SUBTREE_HEIGHT;

  *smlen = 0;

  // Signature part 1: R
  for(i=0; i<MESSAGE_HASH_SEED_BYTES; i++)
    sm[i] = R[i];

  sm += MESSAGE_HASH_SEED_BYTES;
  *smlen += MESSAGE_HASH_SEED_BYTES;

  // Signature part 2: leaf index, least significant byte first
  memcpy(masks, tsk+SEED_BYTES,N_MASKS*HASH_BYTES);
  for(i=0;i<(TOTALTREE_HEIGHT+7)/8;i++)
    sm[i] = (leafidx >> 8*i) & 0xff;

  sm += (TOTALTREE_HEIGHT+7)/8;
  *smlen += (TOTALTREE_HEIGHT+7)/8;

  // Signature part 3: HORST signature on the message digest; root receives
  // the HORST public key
  get_seed(seed, tsk, &a);
  horst_sign(sm, root, &horst_sigbytes, m, mlen, seed, masks, m_h);

  sm += horst_sigbytes;
  *smlen += horst_sigbytes;

  // Signature part 4: per level, a WOTS signature on the current root followed
  // by the authentication path; compute_authpath_wots() replaces root with
  // the root of this level's subtree, which the next level then signs
  for(i=0;i<N_LEVELS;i++)
  {
    a.level = i;

    get_seed(seed, tsk, &a); //XXX: Don't use the same address as for horst_sign here!
    wots_sign(sm, root, seed, masks);
    sm += WOTS_SIGBYTES;
    *smlen += WOTS_SIGBYTES;

    compute_authpath_wots(root,sm,&a,tsk,masks,SUBTREE_HEIGHT);
    sm += SUBTREE_HEIGHT*HASH_BYTES;
    *smlen += SUBTREE_HEIGHT*HASH_BYTES;

    // move one level up: the low bits of the subtree index become the leaf index
    a.subleaf = a.subtree & ((1<<SUBTREE_HEIGHT)-1);
    a.subtree >>= SUBTREE_HEIGHT;
  }

  zerobytes(tsk, CRYPTO_SECRETKEYBYTES);

  // the message was already copied behind the signature at the very beginning
  *smlen += mlen;

  return 0;
}
402 | 
403 | 
404 | 
405 | int crypto_sign_open(unsigned char *m,unsigned long long *mlen, const unsigned char *sm,unsigned long long smlen, const unsigned char *pk)
406 | {
407 |   unsigned long long i;
408 |   unsigned long long leafidx=0;
409 |   unsigned char wots_pk[WOTS_L*HASH_BYTES];
410 |   unsigned char pkhash[HASH_BYTES];
411 |   unsigned char root[HASH_BYTES];
412 |   unsigned char sig[CRYPTO_BYTES];
413 |   unsigned char *sigp;
414 |   unsigned char tpk[CRYPTO_PUBLICKEYBYTES];
415 | 
416 |   if(smlen < CRYPTO_BYTES)
417 |     return -1;
418 | 
419 |   unsigned char m_h[MSGHASH_BYTES];
420 | 
421 |   for(i=0;i<CRYPTO_PUBLICKEYBYTES;i++)
422 |     tpk[i] = pk[i];
423 | 
424 |   // construct message hash
425 |   {
426 |     unsigned char R[MESSAGE_HASH_SEED_BYTES];
427 | 
428 |     for(i=0; i<MESSAGE_HASH_SEED_BYTES; i++)
429 |       R[i] = sm[i];
430 | 
431 |     int len = smlen - CRYPTO_BYTES;
432 | 
433 |     unsigned char *scratch = m;
434 | 
435 |     memcpy(sig, sm, CRYPTO_BYTES);
436 | 
437 |     memcpy(scratch + MESSAGE_HASH_SEED_BYTES + CRYPTO_PUBLICKEYBYTES, sm + CRYPTO_BYTES, len);
438 | 
439 |     // cpy R
440 |     memcpy(scratch, R, MESSAGE_HASH_SEED_BYTES);
441 | 
442 |     // cpy pub key
443 |     memcpy(scratch + MESSAGE_HASH_SEED_BYTES, tpk, CRYPTO_PUBLICKEYBYTES);
444 | 
445 |     msg_hash(m_h, scratch, len + MESSAGE_HASH_SEED_BYTES + CRYPTO_PUBLICKEYBYTES);
446 |   }
447 |   sigp = &sig[0];
448 | 
449 |   sigp += MESSAGE_HASH_SEED_BYTES;
450 |   smlen -= MESSAGE_HASH_SEED_BYTES;
451 | 
452 | 
453 |   for(i=0;i<(TOTALTREE_HEIGHT+7)/8;i++)
454 |     leafidx ^= (((unsigned long long)sigp[i]) << 8*i);
455 | 
456 | 
457 |   horst_verify(root, sigp+(TOTALTREE_HEIGHT+7)/8,
458 |       sigp+CRYPTO_BYTES-MESSAGE_HASH_SEED_BYTES, smlen-CRYPTO_BYTES-MESSAGE_HASH_SEED_BYTES, tpk, m_h);
459 | 
460 |   sigp += (TOTALTREE_HEIGHT+7)/8;
461 |   smlen -= (TOTALTREE_HEIGHT+7)/8;
462 | 
463 |   sigp += HORST_SIGBYTES;
464 |   smlen -= HORST_SIGBYTES;
465 | 
466 |   for(i=0;i<N_LEVELS;i++)
467 |   {
468 |     wots_verify(wots_pk, sigp, root, tpk);
469 | 
470 |     sigp += WOTS_SIGBYTES;
471 |     smlen -= WOTS_SIGBYTES;
472 | 
473 |     l_tree(pkhash, wots_pk,tpk);
474 |     validate_authpath(root, pkhash, leafidx & 0x1f, sigp, tpk, SUBTREE_HEIGHT);
475 |     leafidx >>= 5;
476 | 
477 |     sigp += SUBTREE_HEIGHT*HASH_BYTES;
478 |     smlen -= SUBTREE_HEIGHT*HASH_BYTES;
479 |   }
480 | 
481 |   for(i=0;i<HASH_BYTES;i++)
482 |     if(root[i] != tpk[i+N_MASKS*HASH_BYTES])
483 |       goto fail;
484 | 
485 |   *mlen = smlen;
486 |   for(i=0;i<*mlen;i++)
487 |     m[i] = m[i+MESSAGE_HASH_SEED_BYTES+CRYPTO_PUBLICKEYBYTES];
488 | 
489 |   return 0;
490 | 
491 | 
492 | fail:
493 |   *mlen = smlen;
494 |   for(i=0;i<*mlen;i++)
495 |     m[i] = 0;
496 |   *mlen = -1;
497 |   return -1;
498 | }
499 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/wots.c:
--------------------------------------------------------------------------------
  1 | #include "params.h"
  2 | #include "prg.h"
  3 | #include "hash.h"
  4 | 
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | 
  9 | 
 10 | static void expand_seed(unsigned char outseeds[WOTS_L*HASH_BYTES], const unsigned char inseed[SEED_BYTES])
 11 | {
 12 |   prg(outseeds, WOTS_L*HASH_BYTES, inseed);
 13 | }
 14 | 
 15 | static void gen_chain(unsigned char out[HASH_BYTES], const unsigned char seed[HASH_BYTES], const unsigned char *masks, int chainlen)
 16 | {
 17 |   int i,j;
 18 |   for(j=0;j<HASH_BYTES;j++) 
 19 |     out[j] = seed[j];
 20 | 
 21 |   for(i=0;i<chainlen;i++)
 22 |     hash_n_n_mask(out,out,masks+(i*HASH_BYTES));
 23 | }
 24 | 
 25 | static void gen_chain_8x(unsigned char out[8*HASH_BYTES], const unsigned char seed[8*HASH_BYTES], const unsigned char *masks, int chainlen)
 26 | {
 27 |   int i,j;
 28 | 
 29 |   for(i=0;i<chainlen;i++)
 30 |   {
 31 |     hash_n_n_mask_8x(out,out,masks+(i*HASH_BYTES));
 32 |   }
 33 | }
 34 | 
 35 | static void gen_fullchains(unsigned char x[WOTS_L*HASH_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES])
 36 | {
 37 |   int i;
 38 |   for(i=0;i<64;i+=8)
 39 |     gen_chain_8x(x+i*HASH_BYTES, x+i*HASH_BYTES, masks, WOTS_W-1);
 40 | 
 41 |   for(i=64;i<WOTS_L;i++)
 42 |     gen_chain(x+i*HASH_BYTES, x+i*HASH_BYTES, masks, WOTS_W-1); 
 43 | }
 44 | 
 45 | static void gen_partialchains(unsigned char out[WOTS_L*HASH_BYTES], const unsigned char in[WOTS_L*HASH_BYTES], const unsigned char *masks, int chainlens[WOTS_L])
 46 | {
 47 |   int i,j;
 48 | 
 49 |   for(i=0;i<WOTS_L;i++)
 50 |   {
 51 |     for(j=0;j<HASH_BYTES;j++)
 52 |       out[i*HASH_BYTES+j] = in[i*HASH_BYTES+j];
 53 |     for(j=0;j<chainlens[i];j++)
 54 |     {
 55 |       //vec_xor(out+i*HASH_BYTES,out+i*HASH_BYTES,masks+j*HASH_BYTES,HASH_BYTES);
 56 |       hash_n_n_mask(out+i*HASH_BYTES,out+i*HASH_BYTES,masks+j*HASH_BYTES);
 57 |     }
 58 |   }
 59 | }
 60 | 
 61 | void wots_pkgen(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES])
 62 | {
 63 |   expand_seed(pk, sk);
 64 |   gen_fullchains(pk , masks); 
 65 | }
 66 | 
 67 | 
 68 | void wots_sign(unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES])
 69 | {
 70 |   int basew[WOTS_L],i,c=0;
 71 | 
 72 | #if WOTS_W != 16
 73 | #error "Implementation specialized for WOTS_W == 16"
 74 | #endif
 75 | 
 76 |   for(i=0;i<WOTS_L1;i+=2)
 77 |   {
 78 |     basew[i]   = msg[i/2] & 0xf;
 79 |     basew[i+1] = msg[i/2] >> 4;
 80 |     c += WOTS_W - 1 - basew[i];
 81 |     c += WOTS_W - 1 - basew[i+1];
 82 |   }
 83 | 
 84 |   for( ;i<WOTS_L;i++)
 85 |   {
 86 |     basew[i] = c & 0xf;
 87 |     c >>= 4;
 88 |   }
 89 |   
 90 |   expand_seed(sig, sk);
 91 |   /*
 92 |   for(i=0;i<WOTS_L;i++)
 93 |     gen_chain(sig+i*HASH_BYTES, sig+i*HASH_BYTES, masks, basew[i]);
 94 |   */
 95 |   gen_partialchains(sig,sig,masks,basew);
 96 | }
 97 | 
 98 | void wots_verify(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES])
 99 | {
100 |   int basew[WOTS_L],i,c=0,j, k, l;
101 | 
102 |   unsigned char tmp[8*HASH_BYTES];
103 | 
104 | #if WOTS_W != 16
105 | #error "Implementation specialized for WOTS_W == 16"
106 | #endif
107 | 
108 |   for(i=0;i<WOTS_L1;i+=2)
109 |   {
110 |     basew[i]   = msg[i/2] & 0xf;
111 |     basew[i+1] = msg[i/2] >> 4;
112 |     c += WOTS_W - 1 - basew[i];
113 |     c += WOTS_W - 1 - basew[i+1];
114 |   }
115 | 
116 |   for( ;i<WOTS_L;i++)
117 |   {
118 |     basew[i] = c & 0xf;
119 |     c >>= 4;
120 |   }
121 | 
122 |   // as much as possible 8 times parallel
123 |   for(i=0; (i+8) < WOTS_L; i+=8)
124 |   {
125 |     memcpy(tmp, sig+i*HASH_BYTES, 8*HASH_BYTES);
126 | 
127 |     int cnt = (1 << 8) - 1;
128 | 
129 |     // remove basew == 0 cases
130 |     for (k = 0; k < 8; k++)
131 |       if (0 == WOTS_W-1-basew[i+k])
132 |       {
133 |         memcpy(pk+(i+k)*HASH_BYTES, tmp+k*HASH_BYTES, HASH_BYTES);
134 |         cnt ^= (1 << k);
135 |       }
136 | 
137 |     for(j=0; (j < WOTS_W) & (cnt > 0); )
138 |     {
139 |       for (k = 0; k < 8; k++)
140 |         for (l = 0; l < 32; l++)
141 |           tmp[l + k*HASH_BYTES] ^= (masks+(basew[i+k]*HASH_BYTES)+(j*HASH_BYTES))[l];
142 | 
143 |       hash_n_n_8x(tmp, tmp);
144 | 
145 |       j++;
146 | 
147 |       for (k = 0; k < 8; k++)
148 |         if (j == WOTS_W-1-basew[i+k])// | (j == WOTS_W))
149 |         {
150 |           memcpy(pk+(i+k)*HASH_BYTES, tmp+k*HASH_BYTES, HASH_BYTES);
151 |           cnt ^= (1 << k);
152 |         }
153 |     }
154 |   }
155 | 
156 |   // rest one by one...
157 |   for(; (i) < WOTS_L; i+=1)
158 |   {
159 |     memcpy(tmp, sig+i*HASH_BYTES, HASH_BYTES);
160 | 
161 |     for(j=0; j < WOTS_W-1-basew[i]; j++)
162 |       hash_n_n_mask(tmp, tmp, masks+(basew[i]*HASH_BYTES)+(j*HASH_BYTES));
163 | 
164 |     memcpy(pk+i*HASH_BYTES, tmp, HASH_BYTES);
165 |   }
166 | 
167 | }
168 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/wots.h:
--------------------------------------------------------------------------------
 1 | #ifndef WOTS_H
 2 | #define WOTS_H
 3 | 
 4 | #include "params.h"
 5 | 
 6 | void wots_pkgen(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]);
 7 | 
 8 | void wots_sign(unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char sk[SEED_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]);
 9 | 
10 | void wots_verify(unsigned char pk[WOTS_L*HASH_BYTES], const unsigned char sig[WOTS_L*HASH_BYTES], const unsigned char msg[HASH_BYTES], const unsigned char masks[(WOTS_W-1)*HASH_BYTES]);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/zerobytes.c:
--------------------------------------------------------------------------------
 1 | #include "zerobytes.h"
 2 | 
 3 | unsigned char *zerobytes(unsigned char *r,unsigned long long n)
 4 | { 
 5 |   volatile unsigned char *p=r; 
 6 |   while (n--) 
 7 |     *(p++) = 0; 
 8 |   return r; 
 9 | }
10 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/sphincs256haraka/aesni/zerobytes.h:
--------------------------------------------------------------------------------
1 | #ifndef ZEROBYTES_H
2 | #define ZEROBYTES_H
3 | 
4 | unsigned char *zerobytes(unsigned char *r,unsigned long long n);
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/supercop/crypto_sign/try.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * crypto_sign/try.c version 20140423
  3 |  * D. J. Bernstein
  4 |  * Public domain.
  5 |  * Auto-generated by trygen.py; do not edit.
  6 |  */
  7 | 
  8 | #include "crypto_sign.h"
  9 | #include "try.h"
 10 | #include "randombytes.h"
 11 | 
 12 | const char *primitiveimplementation = crypto_sign_IMPLEMENTATION;
 13 | 
 14 | #define TUNE_BYTES 1536
 15 | #ifdef SMALL
 16 | #define MAXTEST_BYTES 128
 17 | #else
 18 | #define MAXTEST_BYTES 4096
 19 | #endif
 20 | #ifdef SMALL
 21 | #define LOOPS 8
 22 | #else
 23 | #define LOOPS 64
 24 | #endif
 25 | 
 26 | static unsigned char *p;
 27 | static unsigned char *s;
 28 | static unsigned char *m;
 29 | static unsigned char *c;
 30 | static unsigned char *t;
 31 | static unsigned char *p2;
 32 | static unsigned char *s2;
 33 | static unsigned char *m2;
 34 | static unsigned char *c2;
 35 | static unsigned char *t2;
 36 | #define plen crypto_sign_PUBLICKEYBYTES
 37 | #define slen crypto_sign_SECRETKEYBYTES
 38 | unsigned long long mlen;
 39 | unsigned long long clen;
 40 | unsigned long long tlen;
 41 | 
 42 | void preallocate(void)
 43 | {
 44 | #ifdef RAND_R_PRNG_NOT_SEEDED
 45 |   RAND_status();
 46 | #endif
 47 | }
 48 | 
 49 | void allocate(void)
 50 | {
 51 |   unsigned long long alloclen = 0;
 52 |   if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES;
 53 |   if (alloclen < MAXTEST_BYTES + crypto_sign_BYTES) alloclen = MAXTEST_BYTES + crypto_sign_BYTES;
 54 |   if (alloclen < crypto_sign_PUBLICKEYBYTES) alloclen = crypto_sign_PUBLICKEYBYTES;
 55 |   if (alloclen < crypto_sign_SECRETKEYBYTES) alloclen = crypto_sign_SECRETKEYBYTES;
 56 |   p = alignedcalloc(alloclen);
 57 |   s = alignedcalloc(alloclen);
 58 |   m = alignedcalloc(alloclen);
 59 |   c = alignedcalloc(alloclen);
 60 |   t = alignedcalloc(alloclen);
 61 |   p2 = alignedcalloc(alloclen);
 62 |   s2 = alignedcalloc(alloclen);
 63 |   m2 = alignedcalloc(alloclen);
 64 |   c2 = alignedcalloc(alloclen);
 65 |   t2 = alignedcalloc(alloclen);
 66 | }
 67 | 
 68 | void predoit(void)
 69 | {
 70 |   crypto_sign_keypair(p,s);
 71 |   mlen = TUNE_BYTES;
 72 |   clen = 0;
 73 |   randombytes(m,mlen);
 74 | }
 75 | 
 76 | void doit(void)
 77 | {
 78 |   crypto_sign(c,&clen,m,mlen,s);
 79 |   crypto_sign_open(t,&tlen,c,clen,p);
 80 | }
 81 | 
/*
 * Functional self-test, repeated LOOPS times with a random message length in
 * [0, MAXTEST_BYTES]. Each iteration generates a key pair, signs, opens, and
 * checks overlap handling, determinism and rejection of modified signatures.
 * Any violation is reported through fail().
 *
 * output_prepare/output_compare, input_prepare/input_compare, double_canary,
 * checksum, myrandom and fail are harness helpers defined outside this file
 * (presumably declared in try.h - confirm). The order of calls matters
 * because they consume random values and feed a running checksum.
 */
void test(void)
{
  unsigned long long loop;
  
  for (loop = 0;loop < LOOPS;++loop) {
    mlen = myrandom() % (MAXTEST_BYTES + 1);
    
    /* key generation */
    output_prepare(p2,p,plen);
    output_prepare(s2,s,slen);
    if (crypto_sign_keypair(p,s) != 0) fail("crypto_sign_keypair returns nonzero");
    checksum(p,plen);
    checksum(s,slen);
    output_compare(p2,p,plen,"crypto_sign_keypair");
    output_compare(s2,s,slen,"crypto_sign_keypair");
    
    /* signing: output length must lie in [mlen, mlen + crypto_sign_BYTES] */
    clen = mlen + crypto_sign_BYTES;
    output_prepare(c2,c,clen);
    input_prepare(m2,m,mlen);
    memcpy(s2,s,slen);
    double_canary(s2,s,slen);
    if (crypto_sign(c,&clen,m,mlen,s) != 0) fail("crypto_sign returns nonzero");
    if (clen < mlen) fail("crypto_sign returns smaller output than input");
    if (clen > mlen + crypto_sign_BYTES) fail("crypto_sign returns more than crypto_sign_BYTES extra bytes");
    checksum(c,clen);
    output_compare(c2,c,clen,"crypto_sign");
    input_compare(m2,m,mlen,"crypto_sign");
    input_compare(s2,s,slen,"crypto_sign");
    
    /* deterministic schemes only: signing the same input again must give the same output */
#if crypto_sign_DETERMINISTIC == 1
    double_canary(c2,c,clen);
    double_canary(m2,m,mlen);
    double_canary(s2,s,slen);
    if (crypto_sign(c2,&clen,m2,mlen,s2) != 0) fail("crypto_sign returns nonzero");
    if (memcmp(c2,c,clen) != 0) fail("crypto_sign is nondeterministic");
#endif
    
    /* deterministic schemes only: output buffer aliasing the message or the secret key */
#if crypto_sign_DETERMINISTIC == 1
    double_canary(c2,c,clen);
    double_canary(m2,m,mlen);
    double_canary(s2,s,slen);
    if (crypto_sign(m2,&clen,m2,mlen,s) != 0) fail("crypto_sign with m=c overlap returns nonzero");
    if (memcmp(m2,c,clen) != 0) fail("crypto_sign does not handle m=c overlap");
    memcpy(m2,m,mlen);
    if (crypto_sign(s2,&clen,m,mlen,s2) != 0) fail("crypto_sign with s=c overlap returns nonzero");
    if (memcmp(s2,c,clen) != 0) fail("crypto_sign does not handle s=c overlap");
    memcpy(s2,s,slen);
#endif
    
    /* opening: must succeed and return exactly the original message */
    tlen = clen;
    output_prepare(t2,t,tlen);
    memcpy(c2,c,clen);
    double_canary(c2,c,clen);
    memcpy(p2,p,plen);
    double_canary(p2,p,plen);
    if (crypto_sign_open(t,&tlen,c,clen,p) != 0) fail("crypto_sign_open returns nonzero");
    if (tlen != mlen) fail("crypto_sign_open does not match mlen");
    if (memcmp(t,m,mlen) != 0) fail("crypto_sign_open does not match m");
    checksum(t,tlen);
    output_compare(t2,t,clen,"crypto_sign_open");
    input_compare(c2,c,clen,"crypto_sign_open");
    input_compare(p2,p,plen,"crypto_sign_open");
    
    /* opening the copies must give the same result */
    double_canary(t2,t,tlen);
    double_canary(c2,c,clen);
    double_canary(p2,p,plen);
    if (crypto_sign_open(t2,&tlen,c2,clen,p2) != 0) fail("crypto_sign_open returns nonzero");
    if (memcmp(t2,t,tlen) != 0) fail("crypto_sign_open is nondeterministic");
    
    /* output buffer aliasing the signed message or the public key */
    double_canary(t2,t,tlen);
    double_canary(c2,c,clen);
    double_canary(p2,p,plen);
    if (crypto_sign_open(c2,&tlen,c2,clen,p) != 0) fail("crypto_sign_open with c=t overlap returns nonzero");
    if (memcmp(c2,t,tlen) != 0) fail("crypto_sign_open does not handle c=t overlap");
    memcpy(c2,c,clen);
    if (crypto_sign_open(p2,&tlen,c,clen,p2) != 0) fail("crypto_sign_open with p=t overlap returns nonzero");
    if (memcmp(p2,t,tlen) != 0) fail("crypto_sign_open does not handle p=t overlap");
    memcpy(p2,p,plen);
    
    /* Modify one random byte of c (three times, cumulatively). Opening may
       only succeed if it still yields the original message. */
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_sign_open(t,&tlen,c,clen,p) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0))
        fail("crypto_sign_open allows trivial forgeries");
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_sign_open(t,&tlen,c,clen,p) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0))
        fail("crypto_sign_open allows trivial forgeries");
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_sign_open(t,&tlen,c,clen,p) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0))
        fail("crypto_sign_open allows trivial forgeries");
  }
}
174 | 


--------------------------------------------------------------------------------