├── .gitattributes
├── .gitignore
├── Makefile
├── README.md
├── app
├── .keep
├── extensions
│ ├── .keep
│ └── poly1305
│ │ ├── impl.c
│ │ ├── poly1305.S
│ │ ├── poly1305_armv6-32.inc
│ │ ├── poly1305_avx-32.inc
│ │ ├── poly1305_avx-64.inc
│ │ ├── poly1305_avx2-32.inc
│ │ ├── poly1305_avx2-64.inc
│ │ ├── poly1305_constants_x86.inc
│ │ ├── poly1305_neon-32.inc
│ │ ├── poly1305_ref-32.inc
│ │ ├── poly1305_ref-64.inc
│ │ ├── poly1305_ref-8.inc
│ │ ├── poly1305_sse2-32.inc
│ │ ├── poly1305_sse2-64.inc
│ │ ├── poly1305_x86-32.inc
│ │ └── poly1305_x86-64.inc
├── include
│ ├── .keep
│ └── poly1305.h
├── project.def
└── project.ver
├── configure
├── framework
├── bench.c
├── driver
│ ├── arm
│ │ ├── cpucycles_impl.inc
│ │ ├── cpuid_flags.inc
│ │ ├── cpuid_impl.inc
│ │ ├── cpuid_impl_linux.inc
│ │ ├── cpuid_impl_msvc.inc
│ │ ├── cpuid_impl_netbsd.inc
│ │ └── gcc.inc
│ ├── cpucycles.c
│ ├── cpuid.c
│ ├── gcc_driver.inc
│ ├── generic
│ │ ├── cpucycles_impl.inc
│ │ ├── cpuid_flags.inc
│ │ └── cpuid_impl.inc
│ ├── x86
│ │ ├── cpucycles_impl.inc
│ │ ├── cpuid_flags.inc
│ │ ├── cpuid_impl.inc
│ │ ├── driver.S
│ │ ├── gcc.inc
│ │ └── yasm.inc
│ └── yasm_driver.inc
├── fuzz.c
├── include
│ ├── bench.h
│ ├── cpucycles.h
│ ├── cpuid.h
│ └── fuzz.h
├── main_shared.c
└── main_util.c
├── genvs.php
└── sources
├── crypto_onetimeauth_poly1305_ref_auth.c
├── crypto_onetimeauth_poly1305_x86_auth.s
├── poly1305-donna-x64-avx2-incremental-source.c
├── poly1305-donna-x64-sse2-incremental-source.c
├── poly1305-donna-x86-avx2-incremental-source.c
└── poly1305-donna-x86-sse2-incremental-source.c
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, in case people don't have core.autocrlf set.
2 | * text=auto
3 |
4 | # Explicitly declare text files you want to always be normalized and converted
5 | # to native line endings on checkout.
6 | *.c text
7 | *.h text
8 |
9 | # Declare files that will always have CRLF line endings on checkout.
10 | *.sln text eol=crlf
11 |
12 | # Denote all files that are truly binary and should not be modified.
13 | *.png binary
14 | *.jpg binary
15 |
16 | # Included assembler files, must be LF
17 | *.inc text eol=lf
18 |
19 | # configure must be LF
20 | configure text eol=lf
21 |
22 | # project files must be LF
23 | project.def text eol=lf
24 | project.ver text eol=lf
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | asmopt.mak
2 | bin/*
3 | build/*
4 | build_util/*
5 | config.log
6 | framework/include/asmopt.h
7 | framework/include/asmopt_internal.h
8 | framework/include/util_implementations.h
9 | example
10 | example-util
11 | !example/
12 | vs201*/ipch/*
13 | vs201*/*.sdf
14 | vs201*/*.suo
15 | vs201*/*.user
16 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(wildcard asmopt.mak),)
2 | $(error Run ./configure first)
3 | endif
4 |
5 | include asmopt.mak
6 |
7 | ##########################
8 | # set up variables
9 | #
10 |
11 | BASEDIR = .
12 | BINDIR = bin
13 | BUILDDIR = build
14 | BUILDDIRUTIL = build_util
15 | INCLUDE = $(addprefix -I$(BASEDIR)/,$(appdir)/extensions $(appdir)/include framework/include framework/driver framework/driver/$(ARCH))
16 | CINCLUDE = $(INCLUDE)
17 | ASMINCLUDE = $(INCLUDE)
18 |
19 | # yasm doesn't need includes passed to the assembler
20 | ifneq ($(AS),yasm)
21 | COMMA := ,
22 | ASMINCLUDE += $(addprefix -Wa$(COMMA),$(INCLUDE))
23 | endif
24 |
25 | ###########################
26 | # define recursive wildcard: $(call rwildcard, basepath, globs)
27 | #
28 | rwildcard = $(foreach d, $(wildcard $(1)*), $(call rwildcard, $(d)/, $(2)) $(filter $(subst *, %, $(2)), $(d)))
29 | # source lists; the globbed ones use := so the wildcard/rwildcard scans run once, here, instead of on every expansion
30 | SRCDRIVER := $(wildcard framework/driver/*.c)
31 | SRCEXT := $(call rwildcard, $(appdir)/extensions/, *.c)
32 | SRCASM :=
33 | SRCMAIN = $(appdir)/main.c
34 | SRCUTIL = framework/main_util.c framework/bench.c framework/fuzz.c
35 | SRCSHARED = framework/main_shared.c
36 |
37 |
38 | # do we have an assembler? (SRCASM stays empty otherwise)
39 | ifeq ($(HAVEAS),yes)
40 |
41 | # grab all the assembler files
42 | SRCASM := $(call rwildcard, $(appdir)/extensions/, *.S)
43 |
44 | # add asm for the appropriate arch
45 | SRCASM += $(call rwildcard, $(addsuffix $(ARCH),framework/driver/), *.S)
46 |
47 | endif
48 |
49 | ##########################
50 | # expand all source file paths in to object files in $(BUILDDIR)/$(BUILDDIRUTIL)
51 | #
52 | OBJDRIVER = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCDRIVER))
53 | OBJEXT = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCEXT))
54 | OBJASM = $(patsubst %.S, $(BUILDDIR)/%.o, $(SRCASM))
55 | OBJMAIN = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCMAIN))
56 | OBJUTIL = $(patsubst %.c, $(BUILDDIRUTIL)/%.o, $(SRCUTIL))
57 | OBJEXTUTIL = $(patsubst %.c, $(BUILDDIRUTIL)/%.o, $(SRCEXT))
58 | OBJSHARED = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCSHARED))
59 |
60 | ##########################
61 | # non-file targets
62 | #
63 | .PHONY: all
64 | .PHONY: default
65 | .PHONY: makebin
66 | .PHONY: exe
67 | .PHONY: lib
68 | .PHONY: shared
69 | .PHONY: util
70 |
71 | .PHONY: install-shared
72 | .PHONY: install-generic
73 | .PHONY: install-lib
74 | .PHONY: uninstall
75 |
76 | .PHONY: clean
77 | .PHONY: distclean
78 |
79 |
80 | all: default
81 |
82 | default: lib
83 |
84 | makebin:
85 | @mkdir -p $(BINDIR)
86 |
87 | exe: makebin $(BINDIR)/$(PROJECTNAME)$(EXE)
88 | @echo built [$(BINDIR)/$(PROJECTNAME)$(EXE)]
89 |
90 | install-generic:
91 | $(INSTALL) -d $(includedir)/lib$(PROJECTNAME)
92 | $(INSTALL) -d $(libdir)
93 | $(INSTALL) -m 644 $(appdir)/include/*.h $(includedir)/lib$(PROJECTNAME)
94 |
95 | lib: makebin $(BINDIR)/$(PROJECTNAME)$(STATICLIB)
96 | @echo built [$(BINDIR)/$(PROJECTNAME)$(STATICLIB)]
97 |
98 | install-lib: lib install-generic
99 | $(INSTALL) -m 644 $(BINDIR)/$(PROJECTNAME)$(STATICLIB) $(libdir)
100 | $(if $(RANLIB), $(RANLIB) $(libdir)/$(PROJECTNAME)$(STATICLIB))
101 |
102 | util: makebin $(BINDIR)/$(PROJECTNAME)-util$(EXE)
103 | @echo built [$(BINDIR)/$(PROJECTNAME)-util$(EXE)]
104 |
105 | ifeq ($(HAVESHARED),yes)
106 | shared: makebin $(BINDIR)/$(SONAME)
107 | @echo built [$(BINDIR)/$(SONAME)]
108 |
109 | install-shared: shared install-generic
110 | ifneq ($(SOIMPORT),)
111 | $(INSTALL) -d $(bindir)
112 | $(INSTALL) -m 755 $(BINDIR)/$(SONAME) $(bindir)
113 | $(INSTALL) -m 644 $(BINDIR)/$(SOIMPORT) $(libdir)
114 | else ifneq ($(SONAME),)
115 | $(INSTALL) -m 755 $(BINDIR)/$(SONAME) $(libdir)
116 | ln -f -s $(libdir)/$(SONAME) $(libdir)/lib$(PROJECTNAME).$(SOSUFFIX)
117 | endif
118 | else
119 | shared:
120 | @echo project must be /configured with --pic
121 |
122 | install-shared:
123 | @echo project must be /configured with --pic
124 | endif # HAVESHARED
125 | # removes what install-generic, install-lib and install-shared copy; every path mirrors the matching install recipe
126 | uninstall:
127 | 	rm -rf $(includedir)/lib$(PROJECTNAME)
128 | 	rm -f $(libdir)/$(PROJECTNAME)$(STATICLIB)
129 | ifneq ($(SOIMPORT),)
130 | 	rm -f $(bindir)/$(SONAME) $(libdir)/$(SOIMPORT)
131 | else ifneq ($(SONAME),)
132 | 	rm -f $(libdir)/$(SONAME) $(libdir)/lib$(PROJECTNAME).$(SOSUFFIX)
133 | endif
134 |
135 | clean:
136 | @echo cleaning project [$(PROJECTNAME)]
137 | @rm -rf $(BUILDDIR)/*
138 | @rm -rf $(BUILDDIRUTIL)/*
139 | @rm -rf $(BINDIR)/*
140 | # clean, then drop asmopt.mak (the file the ./configure check at the top looks for) and config.log; -f so a missing file is not an error
141 | distclean: clean
142 | 	@rm -f asmopt.mak
143 | 	@rm -f config.log
144 |
145 | ##########################
146 | # build rules for files
147 | #
148 |
149 | # use $(BASEOBJ) in build rules to grab the base path/name of the object file, without an extension
150 | BASEOBJ = $(BUILDDIR)/$*
151 | BASEOBJUTIL = $(BUILDDIRUTIL)/$*
152 |
153 | # building .S (assembler) files: assemble, capture the dependency list in $(BASEOBJ).temp, copy it to $(BASEOBJ).P, then append each prerequisite line as an empty "deps :" rule
154 | $(BUILDDIR)/%.o: %.S
155 | @mkdir -p $(dir $@)
156 | # yasm needs one pass to compile, and one to generate dependencies
157 | ifeq ($(AS),yasm)
158 | $(AS) $(ASFLAGS) $(ASMINCLUDE) -o $@ $<
159 | @$(AS) $(ASFLAGS) $(ASMINCLUDE) -o $@ -M $< >$(BASEOBJ).temp
160 | else
161 | $(AS) $(ASFLAGS) $(ASMINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJ).temp -D BUILDING_ASM -c -o $(BASEOBJ).o $<
162 | endif
163 | @cp $(BASEOBJ).temp $(BASEOBJ).P
164 | @sed \
165 | -e 's/^[^:]*: *//' \
166 | -e 's/ *\\$$//' \
167 | -e '/^$$/ d' \
168 | -e 's/$$/ :/' \
169 | < $(BASEOBJ).temp >> $(BASEOBJ).P
170 | @rm -f $(BASEOBJ).temp
171 |
172 | # building .c (C) files: compile with the dependency list going to $(BASEOBJ).temp (presumably via $(DEPMM)/$(DEPMF) from asmopt.mak), copy it to $(BASEOBJ).P, then append each prerequisite line as an empty "deps :" rule
173 | $(BUILDDIR)/%.o: %.c
174 | @mkdir -p $(dir $@)
175 | $(CC) $(CFLAGS) $(CINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJ).temp -c -o $(BASEOBJ).o $<
176 | @cp $(BASEOBJ).temp $(BASEOBJ).P
177 | @sed \
178 | -e 's/#.*//' \
179 | -e 's/^[^:]*: *//' \
180 | -e 's/ *\\$$//' \
181 | -e '/^$$/ d' \
182 | -e 's/$$/ :/' \
183 | < $(BASEOBJ).temp >> $(BASEOBJ).P
184 | @rm -f $(BASEOBJ).temp
185 |
186 | # building .c (C) files for fuzzing/benching: same steps as the $(BUILDDIR) .c rule, but objects go under $(BUILDDIRUTIL) and are compiled with -DUTILITIES
187 | $(BUILDDIRUTIL)/%.o: %.c
188 | @mkdir -p $(dir $@)
189 | $(CC) $(CFLAGS) $(CINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJUTIL).temp -DUTILITIES -c -o $(BASEOBJUTIL).o $<
190 | @cp $(BASEOBJUTIL).temp $(BASEOBJUTIL).P
191 | @sed \
192 | -e 's/#.*//' \
193 | -e 's/^[^:]*: *//' \
194 | -e 's/ *\\$$//' \
195 | -e '/^$$/ d' \
196 | -e 's/$$/ :/' \
197 | < $(BASEOBJUTIL).temp >> $(BASEOBJUTIL).P
198 | @rm -f $(BASEOBJUTIL).temp
199 |
200 |
201 | ##########################
202 | # include all auto-generated dependencies
203 | #
204 |
205 | -include $(OBJDRIVER:%.o=%.P)
206 | -include $(OBJEXT:%.o=%.P)
207 | -include $(OBJASM:%.o=%.P)
208 | -include $(OBJMAIN:%.o=%.P)
209 | -include $(OBJUTIL:%.o=%.P)
210 | -include $(OBJEXTUTIL:%.o=%.P)
211 | -include $(OBJSHARED:%.o=%.P)
212 |
213 | ##########################
214 | # final build targets
215 | #
216 | $(BINDIR)/$(PROJECTNAME)$(EXE): $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJMAIN)
217 | $(CC) $(CFLAGS) -o $@ $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJMAIN)
218 | # static library: remove the previous archive ($@, i.e. the one under $(BINDIR)) so the archiver starts from nothing, then index it when a RANLIB is configured
219 | $(BINDIR)/$(PROJECTNAME)$(STATICLIB): $(OBJDRIVER) $(OBJEXT) $(OBJASM)
220 | 	rm -f $@
221 | 	$(AR)$@ $(OBJDRIVER) $(OBJEXT) $(OBJASM)
222 | 	$(if $(RANLIB), $(RANLIB) $@)
223 |
224 | $(BINDIR)/$(PROJECTNAME)-util$(EXE): $(OBJDRIVER) $(OBJEXTUTIL) $(OBJASM) $(OBJUTIL)
225 | $(CC) $(CFLAGS) -o $@ $(OBJDRIVER) $(OBJEXTUTIL) $(OBJASM) $(OBJUTIL)
226 |
227 | ifeq ($(HAVESHARED),yes)
228 | $(BINDIR)/$(SONAME): $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJSHARED)
229 | $(LD)$@ $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJSHARED) $(SOFLAGS) $(LDFLAGS)
230 | endif
231 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ABOUT #
2 |
3 | This is a portable, performant implementation of [Poly1305](http://cr.yp.to/mac.html), a "secret-key message-authentication code suitable for a wide variety of applications".
4 |
5 | All assembler is PIC safe.
6 |
7 | # INITIALIZING #
8 |
9 | The library can be initialized, i.e. the most optimized implementation that passes internal tests will be automatically selected, in two ways, **neither of which is thread safe**:
10 |
11 | 1. `int poly1305_startup(void);` explicitly initializes the library, and returns a non-zero value if no suitable implementation is found that passes internal tests
12 |
13 | 2. Do nothing and use the library like normal. It will auto-initialize itself when needed, and hard exit if no suitable implementation is found.
14 |
15 | # CALLING #
16 |
17 | Common assumptions:
18 |
19 | * When using the incremental functions, the `poly1305_state` struct is assumed to be word aligned, if necessary, for the system in use.
20 |
21 | ## ONE SHOT ##
22 |
23 | `in` is assumed to be word aligned. Incremental support has no alignment requirements, but will obviously slow down if non word-aligned pointers are passed.
24 |
25 | `void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key);`
26 |
27 | Creates an authenticator in `mac` under the key `key` with `inlen` bytes from `in`.
28 |
29 | ## INCREMENTAL ##
30 |
31 | Incremental `in` buffers are *not* required to be word aligned. Unaligned buffers will require copying to aligned buffers however, which will obviously incur a speed penalty.
32 |
33 | `void poly1305_init(poly1305_state *S, const poly1305_key *key)`
34 |
35 | Initializes `S` with the key `key`.
36 |
37 | `void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint)`
38 |
39 | Initializes `S` with the key `key`, and the hint that no more than `bytes_hint` bytes will be authenticated. If more than `bytes_hint` bytes are passed, in total, the result _may_ be undefined.
40 |
41 | `void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen)`
42 |
43 | Updates the state `S` with `inlen` bytes from `in`.
44 |
45 | `void poly1305_finish(poly1305_state *S, unsigned char *mac)`
46 |
47 | Performs any finalization on `S` and stores the resulting authenticator into `mac`.
48 |
49 | # Examples #
50 |
51 | ## AUTHENTICATING DATA WITH ONE CALL ##
52 |
53 | size_t bytes = ...;
54 | unsigned char data[...] = {...};
55 | poly1305_key key = {{...}};
56 | unsigned char mac[16];
57 |
58 | poly1305_auth(mac, data, bytes, &key);
59 |
60 | ## HASHING INCREMENTALLY ##
61 |
62 | Hashing incrementally, i.e. with multiple calls to update the state.
63 |
64 | size_t bytes = ...;
65 | unsigned char data[...] = {...};
66 | poly1305_key key = {{...}};
67 | unsigned char mac[16];
68 | poly1305_state state;
69 | size_t i;
70 |
71 | poly1305_init(&state, &key);
72 | /* add one byte at a time, extremely inefficient */
73 | for (i = 0; i < bytes; i++) {
74 | poly1305_update(&state, data + i, 1);
75 | }
76 | poly1305_finish(&state, mac);
77 |
78 |
79 | # VERSIONS #
80 |
81 | ## Reference ##
82 |
83 | There are 3 reference versions, specialized for increasingly capable systems from 8 bit-ish only operations (with the world's most inefficient portable carries, you really don't want to use this unless nothing else runs) to 64 bit.
84 |
85 | * Generic 8-bit-ish: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-8.inc)
86 | * Generic 32-bit with 64-bit compiler support: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-32.inc)
87 | * Generic 64-bit: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-64.inc)
88 |
89 | ## x86 (32 bit) ##
90 |
91 | * 386 compatible: [poly1305\_x86](app/extensions/poly1305/poly1305_x86-32.inc)
92 | * SSE2: [poly1305\_sse2](app/extensions/poly1305/poly1305_sse2-32.inc)
93 | * AVX: [poly1305\_avx](app/extensions/poly1305/poly1305_avx-32.inc)
94 | * AVX2: [poly1305\_avx2](app/extensions/poly1305/poly1305_avx2-32.inc)
95 |
96 | The 386 compatible version is a modified version of djb's floating point public domain implementation.
97 |
98 | SSE2, AVX, and AVX2 versions of the one-shot version `poly1305_auth` will revert to the 386 compatible version if the number of bytes is below a certain threshold.
99 |
100 | ## x86-64 ##
101 |
102 | * x86-64 compatible: [poly1305\_x86](app/extensions/poly1305/poly1305_x86-64.inc)
103 | * SSE2: [poly1305\_sse2](app/extensions/poly1305/poly1305_sse2-64.inc)
104 | * AVX: [poly1305\_avx](app/extensions/poly1305/poly1305_avx-64.inc)
105 | * AVX2: [poly1305\_avx2](app/extensions/poly1305/poly1305_avx2-64.inc)
106 |
107 | SSE2, AVX, and AVX2 versions of the one-shot version `poly1305_auth` will revert to the x86-64 compatible version if the number of bytes is below a certain threshold.
108 |
109 | The x86-64 compatible version is _only_ included for short messages. It is thoroughly beaten by SIMD versions above 64-128 bytes.
110 |
111 | ## ARM ##
112 |
113 | * ARMv6: [poly1305\_armv6](app/extensions/poly1305/poly1305_armv6-32.inc)
114 | * NEON: [poly1305\_neon](app/extensions/poly1305/poly1305_neon-32.inc)
115 |
116 | NEON versions of the one-shot version `poly1305_auth` will revert to the ARMv6 version if the number of bytes is below a certain threshold.
117 |
118 |
119 |
120 | # BUILDING #
121 |
122 | See [asm-opt#configuring](https://github.com/floodyberry/asm-opt#configuring) for full configure options.
123 |
124 | If you would like to use Yasm with a gcc-compatible compiler, pass `--yasm` to configure.
125 |
126 | The Visual Studio projects are generated assuming Yasm is available. You will need to have [Yasm.exe](http://yasm.tortall.net/Download.html) somewhere in your path to build them.
127 |
128 | ## STATIC LIBRARY ##
129 |
130 | ./configure
131 | make lib
132 |
133 | and `make install-lib` OR copy `bin/poly1305.lib` and `app/include/poly1305.h` to your desired location.
134 |
135 | ## SHARED LIBRARY ##
136 |
137 | ./configure --pic
138 | make shared
139 | make install-shared
140 |
141 | ## UTILITIES / TESTING ##
142 |
143 | ./configure
144 | make util
145 | bin/poly1305-util [bench|fuzz]
146 |
147 | ### BENCHMARK / TESTING ###
148 |
149 | Benchmarking will implicitly test every available version. If any fail, it will exit with an error indicating which versions did not pass. Features tested include:
150 |
151 | * One-shot and Incremental authentication
152 | * Results above 2^130 - 5 are properly normalized
153 | * All potential block sizes in the underlying implementation are triggered
154 |
155 | ### FUZZING ###
156 |
157 | Fuzzing tests every available implementation for the current CPU against the reference implementation. Features tested are:
158 |
159 | * One-shot and Incremental authentication
160 |
161 | # BENCHMARKS #
162 |
163 | Only the top 3 benchmarks per mode will be shown. Anything past 3 or so is pretty irrelevant to the current architecture.
164 |
165 | ## [E5200](http://ark.intel.com/products/37212/) ##
166 |
167 |
168 | Implementation 1 byte 64 bytes 576 bytes 8192 bytes
169 |
170 | SSE2-64 158 4.70 2.22 1.53
171 | SSE2-32 275 7.42 2.54 1.80
172 | x86-64 158 4.74 3.44 3.30
173 | x86-32 275 7.08 3.74 3.33
174 |
175 |
176 |
177 |
178 | ## [i7-4770K](http://ark.intel.com/products/75123) ##
179 |
180 | Timings are with Turbo Boost and Hyperthreading, so their accuracy is not concrete.
181 | For reference, OpenSSL and Crypto++ give ~0.8cpb for AES-128-CTR and ~1.1cpb for AES-256-CTR, ~7.4cpb for SHA-512, and ~4.5cpb for MD5.
182 |
183 |
184 | Implementation 1 byte 64 bytes 576 bytes 8192 bytes
185 |
186 | AVX2-64 110 3.22 0.96 0.60
187 | AVX2-32 223 4.37 1.15 0.67
188 | AVX-64 110 3.22 1.39 1.06
189 | AVX-32 223 4.37 1.51 1.04
190 | SSE2-64 110 3.22 1.43 1.12
191 | SSE2-32 223 4.33 1.55 1.10
192 |
193 |
194 |
195 | ## AMD FX-8120 ##
196 |
197 | Timings are with Turbo on, so accuracy is not concrete. I'm not sure how to adjust for it either,
198 | and depending on clock speed (3.1ghz vs 4.0ghz), OpenSSL gives between 0.73cpb - 0.94cpb for AES-128-CTR,
199 | 1.03cpb - 1.33cpb for AES-256-CTR, 10.96cpb - 14.1cpb for SHA-512, and 4.7cpb - 5.16cpb for MD5.
200 |
201 |
202 | Implementation 1 byte 64 bytes 576 bytes 8192 bytes
203 |
204 | AVX-64 175 5.27 1.35 0.80
205 | SSE2-64 175 5.36 1.47 0.88
206 | AVX-32 319 5.72 1.85 1.19
207 | SSE2-32 320 5.78 1.94 1.31
208 | x86-32 313 8.00 3.62 2.99
209 | x86-64 175 5.30 4.03 3.83
210 |
211 |
212 |
213 | ## ZedBoard (Cortex-A9) ##
214 |
215 | I don't have access to the cycle counter yet, so cycles are computed by taking the microseconds times the clock speed (666mhz) divided by 1 million. For comparison, on long messages, OpenSSL 1.0.0e gives 52.3 cpb for aes-128-cbc (woof), ~123cpb for SHA-512 (really woof), and ~9.6cpb for MD5.
216 |
217 |
218 | Implementation 1 byte 64 bytes 576 bytes 8192 bytes
219 |
220 | Neon-32 290 9.53 3.33 2.26
221 | ARMv6-32 290 9.53 6.99 6.73
222 |
223 |
224 |
225 |
226 | # LICENSE #
227 |
228 | Public Domain, or MIT
--------------------------------------------------------------------------------
/app/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/.keep
--------------------------------------------------------------------------------
/app/extensions/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/extensions/.keep
--------------------------------------------------------------------------------
/app/extensions/poly1305/impl.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include "cpuid.h"
5 | #include "poly1305.h"
6 |
7 | typedef struct poly1305_state_internal_t { /* layout the incremental API casts the caller's poly1305_state to */
8 | unsigned char opaque[192]; /* implementation-private state, sized for the largest one (AVX2) */
9 | size_t leftover, block_size; /* bytes currently held in buffer; block size reported by the implementation at init */
10 | unsigned char buffer[64]; /* holds a partial block between updates; sized for the largest blocksize (AVX2) */
11 | } poly1305_state_internal;
12 |
13 | typedef struct poly1305_impl_t { /* one selectable implementation: required cpu flags, a name, and its entry points */
14 | unsigned long cpu_flags; /* CPUID_* value given to POLY1305_IMPL for this entry */
15 | const char *desc; /* short name, e.g. "avx2" or "generic/64" */
16 |
17 | size_t (*block_size)(void); /* block size in bytes; the incremental wrappers buffer input up to this size */
18 | void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint); /* set up state for key; poly1305_init passes bytes_hint = 0 */
19 | void (*blocks)(void *state, const unsigned char *in, size_t inlen); /* absorb inlen bytes; the wrappers only pass whole blocks */
20 | void (*finish_ext)(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); /* absorb the remaining buffered bytes and write the result to mac */
21 | void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key); /* one-shot MAC over in */
22 | } poly1305_impl_t;
23 | /* declares the five entry points of implementation `ext` (poly1305_block_size_<ext>, _init_ext_, _blocks_, _finish_ext_, _auth_) */
24 | #define POLY1305_DECLARE(ext) \
25 | size_t poly1305_block_size_##ext(void); \
26 | void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \
27 | void poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \
28 | void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \
29 | void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key);
30 | /* expands to a poly1305_impl_t initializer for implementation `ext`, listing the members in struct order */
31 | #define POLY1305_IMPL(cpuflags, desc, ext) \
32 | {(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext}
33 |
34 | #if defined(ARCH_X86)
35 | /* 32 bit only implementations */
36 | #if defined(CPU_32BITS)
37 | #endif
38 |
39 | /* 64 bit only implementations */
40 | #if defined(CPU_64BITS)
41 | #endif
42 |
43 | /* both 32 and 64 bits */
44 | POLY1305_DECLARE(x86)
45 | #define POLY1305_X86 POLY1305_IMPL(CPUID_X86, "x86", x86)
46 |
47 | #if defined(HAVE_SSE2)
48 | POLY1305_DECLARE(sse2)
49 | #define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2)
50 | #endif
51 |
52 | #if defined(HAVE_AVX)
53 | POLY1305_DECLARE(avx)
54 | #define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx)
55 | #endif
56 |
57 | #if defined(HAVE_AVX2)
58 | POLY1305_DECLARE(avx2)
59 | #define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2)
60 | #endif
61 | #endif
62 |
63 | #if defined(ARCH_ARM)
64 | #if defined(HAVE_ARMv6)
65 | POLY1305_DECLARE(armv6)
66 | #define POLY1305_ARMv6 POLY1305_IMPL(CPUID_ARMv6, "armv6", armv6)
67 | #endif
68 |
69 | #if defined(HAVE_NEON)
70 | POLY1305_DECLARE(neon)
71 | #define POLY1305_NEON POLY1305_IMPL(CPUID_NEON, "neon", neon)
72 | #endif
73 | #endif
74 |
75 | /* the "always runs" version */
76 | #if defined(HAVE_INT64) && defined(HAVE_INT128)
77 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/64", ref)
78 | #include "poly1305/poly1305_ref-64.inc"
79 | #elif defined(HAVE_INT32) && defined(HAVE_INT64)
80 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/32", ref)
81 | #include "poly1305/poly1305_ref-32.inc"
82 | #else
83 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/8", ref)
84 | #include "poly1305/poly1305_ref-8.inc"
85 | #endif
86 |
87 | /* list implementations from most optimized to least, with generic as the last entry; an entry is present only when its POLY1305_* macro was defined above */
88 | static const poly1305_impl_t poly1305_list[] = {
89 | /* x86 */
90 | #if defined(POLY1305_AVX2)
91 | POLY1305_AVX2,
92 | #endif
93 | #if defined(POLY1305_AVX)
94 | POLY1305_AVX,
95 | #endif
96 | #if defined(POLY1305_SSE2)
97 | POLY1305_SSE2,
98 | #endif
99 | #if defined(POLY1305_X86)
100 | POLY1305_X86,
101 | #endif
102 |
103 | /* arm */
104 | #if defined(POLY1305_NEON)
105 | POLY1305_NEON,
106 | #endif
107 | #if defined(POLY1305_ARMv6)
108 | POLY1305_ARMv6,
109 | #endif
110 |
111 | POLY1305_GENERIC
112 | };
113 | /* poly1305_opt is the implementation every public call goes through; it starts at the "bootup" stubs, which call poly1305_startup() on first use */
114 | POLY1305_DECLARE(bootup)
115 |
116 | static const poly1305_impl_t poly1305_bootup_impl = POLY1305_IMPL(CPUID_GENERIC, "bootup", bootup);
117 | static const poly1305_impl_t *poly1305_opt = &poly1305_bootup_impl;
118 |
119 | /* 1 when p sits on a sizeof(size_t) boundary, 0 otherwise */
120 | static int
121 | poly1305_is_aligned(const void *p) {
122 | 	return !((size_t)p & (sizeof(size_t) - 1));
123 | }
124 |
125 | /* processes inlen bytes (full blocks only), handling input alignment */
126 | static void
127 | poly1305_consume(poly1305_state_internal *state, const unsigned char *in, size_t inlen) {
128 | 	/* the size_t member forces word alignment, which a bare unsigned char array does not guarantee */
129 | 	union { size_t align; unsigned char bytes[1024]; } bounce;
130 |
131 | 	/* it's ok to call with 0 bytes */
132 | 	if (!inlen)
133 | 		return;
134 |
135 | 	/* word-aligned input goes to the implementation untouched */
136 | 	if (poly1305_is_aligned(in)) {
137 | 		poly1305_opt->blocks(state->opaque, in, inlen);
138 | 		return;
139 | 	}
140 |
141 | 	/* otherwise copy it through the aligned bounce buffer, at most sizeof(bounce.bytes) bytes per call; */
142 | 	/* 1024 is a multiple of 16, 32 and 64, so each chunk is still a whole number of blocks of those sizes */
143 | 	while (inlen) {
144 | 		const size_t chunk = (inlen > sizeof(bounce.bytes)) ? sizeof(bounce.bytes) : inlen;
145 | 		memcpy(bounce.bytes, in, chunk);
146 | 		poly1305_opt->blocks(state->opaque, bounce.bytes, chunk);
147 | 		in += chunk;
148 | 		inlen -= chunk;
149 | 	}
150 | }
151 |
152 | /* starts a new incremental MAC in S under key, passing 0 as the implementation's bytes_hint */
153 | LIB_PUBLIC void
154 | poly1305_init(poly1305_state *S, const poly1305_key *key) {
155 | 	poly1305_state_internal *st = (poly1305_state_internal *)S;
156 | 	poly1305_opt->init_ext(st->opaque, key, 0);
157 | 	st->block_size = poly1305_opt->block_size();
158 | 	st->leftover = 0;
159 | }
160 | /* starts a new incremental MAC in S under key, forwarding bytes_hint to the implementation */
161 | LIB_PUBLIC void
162 | poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint) {
163 | 	poly1305_state_internal *st = (poly1305_state_internal *)S;
164 | 	poly1305_opt->init_ext(st->opaque, key, bytes_hint);
165 | 	st->block_size = poly1305_opt->block_size();
166 | 	st->leftover = 0;
167 | }
168 | /* feeds inlen bytes from in into S: completes a buffered partial block, processes whole blocks, buffers the rest */
169 | LIB_PUBLIC void
170 | poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen) {
171 | 	poly1305_state_internal *st = (poly1305_state_internal *)S;
172 |
173 | 	/* top up a partially filled buffer before touching new blocks */
174 | 	if (st->leftover != 0) {
175 | 		const size_t room = st->block_size - st->leftover;
176 | 		const size_t take = (inlen < room) ? inlen : room;
177 | 		memcpy(st->buffer + st->leftover, in, take);
178 | 		st->leftover += take;
179 | 		in += take;
180 | 		inlen -= take;
181 | 		/* still short of a whole block: wait for more input */
182 | 		if (st->leftover < st->block_size)
183 | 			return;
184 | 		poly1305_opt->blocks(st->opaque, st->buffer, st->block_size);
185 | 		st->leftover = 0;
186 | 	}
187 |
188 | 	/* hand the largest block_size-multiple prefix of the input to poly1305_consume */
189 | 	if (inlen >= st->block_size) {
190 | 		const size_t whole = inlen & ~(st->block_size - 1);
191 | 		poly1305_consume(st, in, whole);
192 | 		inlen -= whole;
193 | 		in += whole;
194 | 	}
195 |
196 | 	/* keep whatever is left for the next update or for finish */
197 | 	if (inlen != 0) {
198 | 		memcpy(st->buffer + st->leftover, in, inlen);
199 | 		st->leftover += inlen;
200 | 	}
201 | }
202 | /* ends the incremental MAC: passes the buffered leftover bytes to the implementation's finish_ext, which writes mac */
203 | LIB_PUBLIC void
204 | poly1305_finish(poly1305_state *S, unsigned char *mac) {
205 | 	poly1305_state_internal *st = (poly1305_state_internal *)S;
206 | 	poly1305_opt->finish_ext(st->opaque, st->buffer, st->leftover, mac);
207 | }
208 | /* one-shot MAC: forwards all four arguments to the auth function of the implementation poly1305_opt points at */
209 | LIB_PUBLIC void
210 | poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key) {
211 | poly1305_opt->auth(mac, in, inlen, key);
212 | }
213 |
214 | /* computes the mac one-shot, then incrementally (single update, then split updates); returns 0 if all agree, else zeroes mac and returns 1 */
215 | static int
216 | poly1305_auth_test(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key) {
217 | 	static const size_t steps[3] = {64, 32, 16};
218 | 	poly1305_state st;
219 | 	unsigned char check[16];
220 | 	size_t native = poly1305_opt->block_size();
221 | 	size_t k;
222 |
223 | 	/* reference value: one-shot */
224 | 	poly1305_auth(mac, in, inlen, key);
225 |
226 | 	/* incremental, whole message in one update */
227 | 	poly1305_init_ext(&st, key, inlen);
228 | 	poly1305_update(&st, in, inlen);
229 | 	poly1305_finish(&st, check);
230 | 	if (memcmp(mac, check, 16) != 0) {
231 | 		memset(mac, 0, 16);
232 | 		return 1;
233 | 	}
234 |
235 | 	/* incremental, split updates: the native block size first to prime the state, */
236 | 	/* then a 64, a 32 and a 16 byte piece (SSE2/AVX/AVX2 can support up to a 64 byte block size), then the tail */
237 | 	poly1305_init(&st, key);
238 | 	if (inlen >= native) { poly1305_update(&st, in, native); in += native; inlen -= native; }
239 | 	for (k = 0; k < 3; k++) {
240 | 		if (inlen < steps[k])
241 | 			continue;
242 | 		poly1305_update(&st, in, steps[k]);
243 | 		in += steps[k];
244 | 		inlen -= steps[k];
245 | 	}
246 | 	if (inlen > 0)
247 | 		poly1305_update(&st, in, inlen);
248 | 	poly1305_finish(&st, check);
249 |
250 | 	/* success only when the split run agrees as well */
251 | 	if (memcmp(mac, check, 16) == 0)
252 | 		return 0;
253 | 	memset(mac, 0, 16);
254 | 	return 1;
255 | }
256 | /* makes impl (a poly1305_impl_t) the active implementation and runs the known-answer tests through it; returns 0 if all pass, non-zero otherwise */
257 | static int
258 | poly1305_test_impl(const void *impl) {
259 | 	/* example from nacl */
260 | 	static const poly1305_key nacl_key = {{
261 | 		0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
262 | 		0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
263 | 		0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
264 | 		0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
265 | 	}};
266 |
267 | 	static const unsigned char nacl_msg[131] = {
268 | 		0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
269 | 		0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
270 | 		0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
271 | 		0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
272 | 		0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
273 | 		0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
274 | 		0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
275 | 		0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
276 | 		0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
277 | 		0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
278 | 		0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
279 | 		0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
280 | 		0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
281 | 		0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
282 | 		0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
283 | 		0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
284 | 		0xe3,0x55,0xa5
285 | 	};
286 |
287 | 	static const unsigned char nacl_mac[16] = {
288 | 		0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
289 | 		0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9
290 | 	};
291 |
292 | 	/* generates a final value of (2^130 - 2), which must be reduced modulo 2^130 - 5 to 3 */
293 | 	static const poly1305_key wrap_key = {{
294 | 		0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
295 | 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
296 | 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
297 | 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
298 | 	}};
299 |
300 | 	static const unsigned char wrap_msg[16] = {
301 | 		0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
302 | 		0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
303 | 	};
304 |
305 | 	static const unsigned char wrap_mac[16] = {
306 | 		0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
307 | 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
308 | 	};
309 |
310 | 	/*
311 | 	auth the auths of [msg:,key:0,0..,pad:ff,ff...], [msg:1,key:1,1..,pad:ff,ff...],
312 | 	[msg:2,2,key:2,2..,pad:ff,ff...] with the following key
313 | 	*/
314 | 	static const poly1305_key total_key = {{
315 | 		0x01,0x02,0x03,0x04,0x05,0x06,0x07,
316 | 		0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,
317 | 		0xff,0xff,0xff,0xff,0xff,0xff,0xff,
318 | 		0xff,0xff,0xff,0xff,0xff,0xff,0xff
319 | 	}};
320 |
321 | 	static const unsigned char total_mac[16] = {
322 | 		0xc6,0x9d,0xc3,0xb9,0x75,0xee,0x5f,0x6b,
323 | 		0x28,0x99,0x57,0x94,0x41,0x27,0xd7,0x5e,
324 | 	};
325 |
326 | 	poly1305_state state_total;
327 | 	poly1305_key key;
328 | 	unsigned char msg[256];
329 | 	unsigned char mac[16];
330 | 	size_t i, j;
331 | 	int result = 0;
332 |
333 | 	poly1305_opt = (const poly1305_impl_t *)impl; /* keep the const qualifier; every public call below now runs through impl */
334 |
335 | 	result |= poly1305_auth_test(mac, nacl_msg, sizeof(nacl_msg), &nacl_key);
336 | 	result |= memcmp(nacl_mac, mac, sizeof(nacl_mac));
337 |
338 | 	result |= poly1305_auth_test(mac, wrap_msg, sizeof(wrap_msg), &wrap_key);
339 | 	result |= memcmp(wrap_mac, mac, sizeof(wrap_mac));
340 |
341 | 	poly1305_init(&state_total, &total_key);
342 | 	for (i = 0; i < 256; i++) {
343 | 		/* set key and message to 'i,i,i..', pad to 'ff,ff,ff..'; i < 256, so the casts lose nothing */
344 | 		for (j = 0; j < 16; j++) key.b[j] = (unsigned char)i;
345 | 		for (j = 0; j < 16; j++) key.b[j+16] = 0xff;
346 | 		for (j = 0; j < i; j++) msg[j] = (unsigned char)i;
347 | 		result |= poly1305_auth_test(mac, msg, i, &key);
348 | 		poly1305_update(&state_total, mac, 16);
349 | 	}
350 | 	poly1305_finish(&state_total, mac);
351 | 	result |= memcmp(total_mac, mac, sizeof(total_mac));
352 |
353 | 	return result;
354 | }
355 |
356 | /* Ask cpu_select() for an entry of poly1305_list (poly1305_test_impl is passed as the test callback) */
357 | /* and install it in poly1305_opt. Returns 0 when an entry was returned, 1 otherwise. */
358 | LIB_PUBLIC int
359 | poly1305_startup(void) {
360 |     const void *selected = LOCAL_PREFIX(cpu_select)(poly1305_list, sizeof(poly1305_impl_t), poly1305_test_impl);
361 |     if (!selected)
362 |         return 1;
363 |     poly1305_opt = (const poly1305_impl_t *)selected;
364 |     return 0;
365 | }
366 |
367 | /* Bootstrap stub for block_size: runs poly1305_startup() and, on success, */
368 | /* forwards to the installed implementation; on failure it reports and exits. */
369 | size_t
370 | poly1305_block_size_bootup(void) {
371 |     if (poly1305_startup() != 0) {
372 |         fprintf(stderr, "poly1305 failed to startup\n");
373 |         exit(1);
374 |     }
375 |     /* startup succeeded, so poly1305_opt was set by poly1305_startup() */
376 |     return poly1305_opt->block_size();
377 | }
378 |
379 | /* Bootstrap stub for init_ext: run poly1305_startup(), then forward the call; exits on failure. */
380 | void
381 | poly1305_init_ext_bootup(void *state, const poly1305_key *key, size_t bytes_hint) {
382 |     if (poly1305_startup() != 0) {
383 |         fprintf(stderr, "poly1305 failed to startup\n");
384 |         exit(1);
385 |     }
386 |     poly1305_opt->init_ext(state, key, bytes_hint);
387 | }
388 |
389 | /* Bootstrap stub for blocks: run poly1305_startup(), then forward the call; exits on failure. */
390 | void
391 | poly1305_blocks_bootup(void *state, const unsigned char *in, size_t inlen) {
392 |     if (poly1305_startup() != 0) {
393 |         fprintf(stderr, "poly1305 failed to startup\n");
394 |         exit(1);
395 |     }
396 |     poly1305_opt->blocks(state, in, inlen);
397 | }
398 |
399 | /* Bootstrap stub for finish_ext: run poly1305_startup(), then forward the call; exits on failure. */
400 | void
401 | poly1305_finish_ext_bootup(void *state, const unsigned char *in, size_t remaining, unsigned char *mac) {
402 |     if (poly1305_startup() != 0) {
403 |         fprintf(stderr, "poly1305 failed to startup\n");
404 |         exit(1);
405 |     }
406 |     poly1305_opt->finish_ext(state, in, remaining, mac);
407 | }
408 |
409 | /* Bootstrap stub for auth: run poly1305_startup(), then forward the call; exits on failure. */
410 | void
411 | poly1305_auth_bootup(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key) {
412 |     if (poly1305_startup() != 0) {
413 |         fprintf(stderr, "poly1305 failed to startup\n");
414 |         exit(1);
415 |     }
416 |     poly1305_opt->auth(mac, in, inlen, key);
417 | }
418 |
419 | #if defined(UTILITIES)
420 |
421 | #include <stdio.h>
422 | #include <string.h>
423 | #include "fuzz.h"
424 | #include "bench.h"
425 |
426 | static const fuzz_variable_t fuzz_inputs[] = { /* layout of the input buffer handed to poly1305_fuzz_impl */
427 | {"key", FUZZ_ARRAY, 32}, /* first 32 bytes: read as a poly1305_key by poly1305_fuzz_impl */
428 | {"input", FUZZ_RANDOM_LENGTH_ARRAY0, 256}, /* message following the key; its length is taken from random_sizes[0] (256 is presumably the maximum - confirm in fuzz.h) */
429 | {0, FUZZ_DONE, 0} /* end-of-list marker */
430 | };
431 |
432 | static const fuzz_variable_t fuzz_outputs[] = { /* layout of the output buffer written by poly1305_fuzz_impl */
433 | {"auth", FUZZ_ARRAY, 16}, /* the authenticator produced by poly1305_auth_test */
434 | {0, FUZZ_DONE, 0} /* end-of-list marker */
435 | };
436 |
437 |
438 | /* Fuzz callback: 'in' holds a 32-byte key followed by the message, whose length is */
439 | /* random_sizes[0]; the tag computed with implementation 'impl' is written to 'out'. */
440 | static void
441 | poly1305_fuzz_impl(const void *impl, const unsigned char *in, const size_t *random_sizes, unsigned char *out) {
442 |     const unsigned char *message = in + 32;
443 |     const size_t message_len = random_sizes[0];
444 |     poly1305_opt = (const poly1305_impl_t *)impl;
445 |     poly1305_auth_test(out, message, message_len, (const poly1305_key *)in);
446 | }
447 |
448 | /* Fuzz entry point: set up the fuzz framework, then pass it the implementation list, I/O layouts and callback. */
449 | void poly1305_fuzz(void) {
450 |     const size_t impl_size = sizeof(poly1305_impl_t);
451 |     fuzz_init();
452 |     fuzz(poly1305_list, impl_size, fuzz_inputs, fuzz_outputs, poly1305_fuzz_impl);
453 | }
454 |
455 |
456 |
457 | static unsigned char *bench_arr = NULL; /* message buffer; assigned from bench_get_buffer() in poly1305_bench */
458 | static unsigned char bench_mac[16]; /* receives the tag written by each benchmarked call */
459 | static poly1305_key bench_key = {{0}}; /* benchmark key; filled with 0xff bytes in poly1305_bench */
460 | static size_t bench_len = 0; /* message length of the current benchmark run */
461 |
462 | static void /* benchmark callback: install 'impl' in poly1305_opt, then authenticate bench_len bytes of bench_arr */
463 | poly1305_bench_impl(const void *impl) {
464 | poly1305_opt = (const poly1305_impl_t *)impl;
465 | poly1305_auth(bench_mac, bench_arr, bench_len, &bench_key); /* presumably dispatches through poly1305_opt - poly1305_auth is defined outside this chunk, confirm */
466 | }
467 |
468 | /* Benchmark the entries of poly1305_list at several message lengths, with key and message bytes all 0xff. */
469 | void
470 | poly1305_bench(void) {
471 |     static const size_t sizes[] = {1, 64, 128, 576, 8192, 0}; /* byte counts; the trailing 0 ends the walk */
472 |     const size_t *cur;
473 |     bench_arr = bench_get_buffer();
474 |     memset(bench_arr, 0xff, 8192);
475 |     memset(&bench_key, 0xff, sizeof(bench_key));
476 |     for (cur = sizes; *cur != 0; cur++) {
477 |         bench_len = *cur;
478 |         bench(poly1305_list, sizeof(poly1305_impl_t), poly1305_test_impl, poly1305_bench_impl, bench_len, "byte");
479 |     }
480 | }
481 |
482 | #endif /* defined(UTILITIES) */
483 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305.S:
--------------------------------------------------------------------------------
1 | #if defined(__GNUC__) /* GNU-compatible toolchain: pull in the gcc flavour of the driver include */
2 | #include "gcc_driver.inc"
3 | #else /* otherwise the yasm flavour of the driver include is used */
4 | ;.if 0
5 | %include "yasm_driver.inc"
6 | ;.endif
7 | #endif /* the INCLUDE_IF_* macros used below are not defined in this file - presumably they come from the driver include and pull in each file only when that variant is enabled (confirm) */
8 |
9 | INCLUDE_IF_AVX2_64BIT "poly1305/poly1305_avx2-64.inc"
10 | INCLUDE_IF_AVX_64BIT "poly1305/poly1305_avx-64.inc"
11 | INCLUDE_IF_SSE2_64BIT "poly1305/poly1305_sse2-64.inc"
12 | INCLUDE_IF_X86_64BIT "poly1305/poly1305_x86-64.inc"
13 |
14 | INCLUDE_IF_AVX2_32BIT "poly1305/poly1305_avx2-32.inc"
15 | INCLUDE_IF_AVX_32BIT "poly1305/poly1305_avx-32.inc"
16 | INCLUDE_IF_SSE2_32BIT "poly1305/poly1305_sse2-32.inc"
17 | INCLUDE_IF_X86_32BIT "poly1305/poly1305_x86-32.inc"
18 |
19 | #if defined(HAVE_ARMv6) /* ARMv6 implementation, only when HAVE_ARMv6 is defined */
20 | #include "poly1305/poly1305_armv6-32.inc"
21 | #endif
22 |
23 | #if (defined(CPU_32BITS) && defined(HAVE_NEON)) /* NEON implementation, only when both CPU_32BITS and HAVE_NEON are defined */
24 | #include "poly1305/poly1305_neon-32.inc"
25 | #endif
26 |
27 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_armv6-32.inc:
--------------------------------------------------------------------------------
1 | SECTION_TEXT
2 |
3 | .arch armv5
4 |
5 | GLOBAL_HIDDEN_FN poly1305_block_size_armv6
6 | mov r0, #16 /* takes no input; returns the constant 16 in r0 */
7 | bx lr
8 | FN_END poly1305_block_size_armv6
9 |
10 | .p2align 2 /* align the word table below to 4 bytes */
11 | poly1305_init_constants_armv6: /* five masks ANDed with the 26-bit key limbs in poly1305_init_ext_armv6; together they equal the 128-bit clamp 0x0ffffffc0ffffffc0ffffffc0fffffff split at 26-bit boundaries */
12 | .long 0x3ffffff
13 | .long 0x3ffff03
14 | .long 0x3ffc0ff
15 | .long 0x3f03fff
16 | .long 0x00fffff
17 |
18 | GLOBAL_HIDDEN_FN poly1305_init_ext_armv6
19 | poly1305_init_ext_armv6_local: /* init: r0 = state, r1 = 32-byte key. Writes masked r limbs at state+0, zero h at state+20, the pad at state+40 and a zero flag word at state+56. r2 is overwritten without being read (presumably the bytes_hint argument - confirm) */
20 | stmfd sp!, {r4-r11, lr}
21 | ldmia r1!, {r2-r5} /* first 16 key bytes as four words */
22 | ldr r7, =poly1305_init_constants_armv6
23 | mov r8, r2 /* r8-r12: the 128-bit value cut at 26-bit boundaries (upper bits removed by the masks below) */
24 | mov r9, r2, lsr #26
25 | mov r10, r3, lsr #20
26 | mov r11, r4, lsr #14
27 | mov r12, r5, lsr #8
28 | orr r9, r9, r3, lsl #6
29 | orr r10, r10, r4, lsl #12
30 | orr r11, r11, r5, lsl #18
31 | ldmia r7, {r2-r6} /* the five limb masks */
32 | and r2, r2, r8
33 | and r3, r3, r9
34 | and r4, r4, r10
35 | and r5, r5, r11
36 | and r6, r6, r12
37 | stmia r0!, {r2-r6} /* state+0..19: masked r limbs */
38 | eor r2, r2, r2 /* zero r2-r6 */
39 | eor r3, r3, r3
40 | eor r4, r4, r4
41 | eor r5, r5, r5
42 | eor r6, r6, r6
43 | stmia r0!, {r2-r6} /* state+20..39: accumulator h = 0 */
44 | ldmia r1!, {r2-r5} /* last 16 key bytes (pad) */
45 | stmia r0, {r2-r6} /* state+40..55: pad; state+56: r6, still zero from above */
46 | ldmfd sp!, {r4-r11, lr}
47 | bx lr
48 | FN_END poly1305_init_ext_armv6
49 |
50 | .ltorg
51 |
52 | GLOBAL_HIDDEN_FN poly1305_blocks_armv6
53 | poly1305_blocks_armv6_local: /* absorb full 16-byte blocks: r0 = state, r1 = input, r2 = byte count. Per block: h = (h + block) * r with high limbs folded back times 5, on 26-bit limbs */
54 | stmfd sp!, {r4-r11, lr}
55 | sub sp, sp, #128 /* scratch frame */
56 | str r0, [sp, #36] /* [sp,#36] = state pointer */
57 | str r1, [sp, #40] /* [sp,#40] = input pointer */
58 | str r2, [sp, #44] /* [sp,#44] = bytes remaining */
59 | mov r14, r1
60 | mov r12, r2
61 | ldr r8, [r0, #56] /* flag word at state+56 (poly1305_finish_ext_armv6 stores 1 there before its padded block) */
62 | tst r8, r8
63 | eor r6, r6, r6
64 | moveq r6, #(1 << 24) /* flag clear: bit 24 of the top limb, i.e. 2^128, is added to every block */
65 | str r6, [sp, #32] /* [sp,#32] = value ORed into the top message limb */
66 | add r10, sp, #64
67 | ldmia r0, {r0-r9} /* r0-r4 = r limbs (state+0), r5-r9 = h limbs (state+20) */
68 | stmia r10, {r0-r4} /* keep the r limbs at [sp,#64] */
69 | cmp r12, #16
70 | blo poly1305_blocks_armv6_done /* fewer than 16 bytes: write h back unchanged */
71 | poly1305_blocks_armv6_mainloop:
72 | ldmia r14!, {r0-r3} /* next 16 input bytes as four words; NOTE(review): word loads from the caller's pointer - confirm alignment expectations */
73 | mov r10, r0, lsr #26 /* cut the block at 26-bit boundaries into r0, r10, r11, r12, r4 */
74 | mov r11, r1, lsr #20
75 | mov r12, r2, lsr #14
76 | str r14, [sp, #40] /* save the advanced input pointer */
77 | mov r4, r3, lsr #8
78 | orr r10, r10, r1, lsl #6
79 | orr r11, r11, r2, lsl #12
80 | orr r12, r12, r3, lsl #18
81 | and r0, r0, #0x3ffffff
82 | and r10, r10, #0x3ffffff
83 | ldr r3, [sp, #32]
84 | and r11, r11, #0x3ffffff
85 | and r12, r12, #0x3ffffff
86 | add r5, r5, r0 /* h += block, limb by limb */
87 | add r6, r6, r10
88 | orr r4, r4, r3 /* apply the high-bit value to the top limb */
89 | add r7, r7, r11
90 | add r14, sp, #64
91 | add r8, r8, r12
92 | add r9, r9, r4
93 | ldmia r14, {r0-r4} /* reload r limbs r0..r4 */
94 | umull r10,r11,r5,r4 /* r11:r10 accumulates d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */
95 | umull r12,r14,r5,r3 /* r14:r12 accumulates d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*(5*r4) */
96 | umlal r10,r11,r6,r3
97 | umlal r12,r14,r6,r2
98 | umlal r10,r11,r7,r2
99 | umlal r12,r14,r7,r1
100 | add r4,r4,r4,lsl #2 /* r4 = 5 * r4 */
101 | add r3,r3,r3,lsl #2 /* r3 = 5 * r3 */
102 | umlal r10,r11,r8,r1
103 | umlal r12,r14,r8,r0
104 | umlal r10,r11,r9,r0
105 | umlal r12,r14,r9,r4
106 | str r10, [sp, #24] /* d4 -> [sp,#24..31] */
107 | str r11, [sp, #28]
108 | str r12, [sp, #16] /* d3 -> [sp,#16..23] */
109 | str r14, [sp, #20]
110 | umull r10,r11,r5,r2 /* r11:r10 accumulates d2 */
111 | umull r12,r14,r5,r1 /* r14:r12 accumulates d1 */
112 | umlal r10,r11,r6,r1
113 | umlal r12,r14,r6,r0
114 | umlal r10,r11,r7,r0
115 | umlal r12,r14,r7,r4
116 | add r2,r2,r2,lsl #2 /* r2 = 5 * r2 */
117 | add r1,r1,r1,lsl #2 /* r1 = 5 * r1 */
118 | umlal r10,r11,r8,r4
119 | umlal r12,r14,r8,r3
120 | umlal r10,r11,r9,r3
121 | umlal r12,r14,r9,r2
122 | str r10, [sp, #8] /* d2 -> [sp,#8..15] */
123 | str r11, [sp, #12]
124 | str r12, [sp, #0] /* d1 -> [sp,#0..7] */
125 | str r14, [sp, #4]
126 | umull r10,r11,r5,r0 /* r11:r10 = d0 = h0*r0 + 5*(h1*r4 + h2*r3 + h3*r2 + h4*r1) */
127 | umlal r10,r11,r6,r4
128 | umlal r10,r11,r7,r3
129 | umlal r10,r11,r8,r2
130 | umlal r10,r11,r9,r1
131 | ldmia sp, {r0-r7} /* r1:r0 = d1, r3:r2 = d2, r5:r4 = d3, r7:r6 = d4 */
132 | lsr r12, r10, #26 /* carry propagation: two chains (d0->d1->d2->d3 and d3->d4->d0) interleaved */
133 | lsr r14, r4, #26
134 | orr r12, r12, r11, lsl #6
135 | orr r14, r14, r5, lsl #6
136 | and r10, r10, #0x3ffffff
137 | and r4, r4, #0x3ffffff
138 | adds r0, r0, r12
139 | adc r1, r1, #0
140 | adds r6, r6, r14
141 | adc r7, r7, #0
142 | lsr r12, r0, #26
143 | lsr r14, r6, #26
144 | orr r12, r12, r1, lsl #6
145 | orr r14, r14, r7, lsl #6
146 | and r0, r0, #0x3ffffff
147 | and r6, r6, #0x3ffffff
148 | add r14, r14, r14, lsl #2 /* carry out of the top limb re-enters limb 0 multiplied by 5 */
149 | adds r2, r2, r12
150 | adc r3, r3, #0
151 | add r10, r10, r14
152 | lsr r12, r2, #26
153 | lsr r14, r10, #26
154 | orr r12, r12, r3, lsl #6
155 | and r5, r10, #0x3ffffff /* r5 = new h0 */
156 | and r7, r2, #0x3ffffff /* r7 = new h2 */
157 | add r4, r4, r12
158 | add r0, r0, r14
159 | lsr r12, r4, #26
160 | and r8, r4, #0x3ffffff /* r8 = new h3 */
161 | add r9, r6, r12 /* r9 = new h4 */
162 | ldr r12, [sp, #44] /* bytes remaining before this block */
163 | ldr r14, [sp, #40] /* input pointer for the next block */
164 | mov r6, r0 /* r6 = new h1 */
165 | cmp r12, #32 /* flags from this compare decide the branch; the sub below does not set flags */
166 | sub r12, r12, #16
167 | str r12, [sp, #44]
168 | bhs poly1305_blocks_armv6_mainloop /* loop while at least one more full block was left */
169 | poly1305_blocks_armv6_done:
170 | ldr r12, [sp, #36] /* state pointer */
171 | str r5, [r12, #20] /* write h limbs back to state+20..39 */
172 | str r6, [r12, #24]
173 | str r7, [r12, #28]
174 | str r8, [r12, #32]
175 | str r9, [r12, #36]
176 | add sp, sp, #128
177 | ldmfd sp!, {r4-r11, lr}
178 | bx lr
179 | FN_END poly1305_blocks_armv6
180 |
181 | GLOBAL_HIDDEN_FN poly1305_finish_ext_armv6
182 | poly1305_finish_ext_armv6_local: /* finish: r0 = state, r1 = leftover input, r2 = leftover byte count, r3 = 16-byte mac output. Absorbs a padded last block if r2 is non-zero, reduces h, adds the pad, stores the mac and zeroes 64 bytes of state */
183 | stmfd sp!, {r4-r11, lr}
184 | sub sp, sp, #16 /* 16-byte buffer for the padded last block */
185 | mov r5, r0 /* r5 = state */
186 | mov r6, r1
187 | mov r7, r2
188 | mov r8, r3 /* r8 = mac pointer */
189 | ands r2, r2, r2
190 | beq poly1305_finish_ext_armv6_noremaining /* nothing left over: go straight to the final reduction */
191 | eor r0, r0
192 | mov r9, sp
193 | str r0, [sp, #0] /* zero the buffer */
194 | str r0, [sp, #4]
195 | str r0, [sp, #8]
196 | str r0, [sp, #12]
197 | tst r2, #8 /* copy the leftover bytes in chunks of 8, 4, 2, 1 selected by the bits of the count */
198 | beq poly1305_finish_ext_armv6_skip8
199 | ldmia r1!, {r10-r11} /* NOTE(review): word loads from the caller's pointer - confirm alignment expectations */
200 | stmia r9!, {r10-r11}
201 | poly1305_finish_ext_armv6_skip8:
202 | tst r2, #4
203 | beq poly1305_finish_ext_armv6_skip4
204 | ldr r10, [r1], #4
205 | str r10, [r9], #4
206 | poly1305_finish_ext_armv6_skip4:
207 | tst r2, #2
208 | beq poly1305_finish_ext_armv6_skip2
209 | ldrh r10, [r1], #2
210 | strh r10, [r9], #2
211 | poly1305_finish_ext_armv6_skip2:
212 | tst r2, #1
213 | beq poly1305_finish_ext_armv6_skip1
214 | ldrb r10, [r1], #1
215 | strb r10, [r9], #1
216 | poly1305_finish_ext_armv6_skip1:
217 | mov r11, #1
218 | strb r11, [r9] /* append a single 1 byte after the copied data */
219 | str r11, [r5, #56] /* set the flag at state+56 so the blocks routine adds no 2^128 bit */
220 | mov r0, r5
221 | mov r1, sp
222 | mov r2, #16
223 | bl poly1305_blocks_armv6_local /* absorb the padded block */
224 | poly1305_finish_ext_armv6_noremaining:
225 | ldr r0, [r5, #20] /* r0-r4 = h limbs */
226 | ldr r1, [r5, #24]
227 | ldr r2, [r5, #28]
228 | ldr r3, [r5, #32]
229 | ldr r4, [r5, #36]
230 | mov r12, r4, lsr #26 /* carry pass: top-limb overflow times 5 into limb 0, then ripple upwards */
231 | and r4, r4, #0x3ffffff
232 | add r12, r12, r12, lsl #2
233 | add r0, r0, r12
234 | mov r12, r0, lsr #26
235 | and r0, r0, #0x3ffffff
236 | add r1, r1, r12
237 | mov r12, r1, lsr #26
238 | and r1, r1, #0x3ffffff
239 | add r2, r2, r12
240 | mov r12, r2, lsr #26
241 | and r2, r2, #0x3ffffff
242 | add r3, r3, r12
243 | mov r12, r3, lsr #26
244 | and r3, r3, #0x3ffffff
245 | add r4, r4, r12
246 | add r6, r0, #5 /* g = h + 5, limbs in r6, r7, r10, r11, r14 */
247 | mov r12, r6, lsr #26
248 | and r6, r6, #0x3ffffff
249 | add r7, r1, r12
250 | mov r12, r7, lsr #26
251 | and r7, r7, #0x3ffffff
252 | add r10, r2, r12
253 | mov r12, r10, lsr #26
254 | and r10, r10, #0x3ffffff
255 | add r11, r3, r12
256 | mov r12, #-(1 << 26) /* subtract 2^26 from the top limb, i.e. 2^130 from g */
257 | add r12, r12, r11, lsr #26
258 | and r11, r11, #0x3ffffff
259 | add r14, r4, r12
260 | mov r12, r14, lsr #31 /* sign bit of the top limb: 1 if g went negative */
261 | sub r12, #1 /* r12 = all ones if g is non-negative, else 0 */
262 | and r6, r6, r12 /* keep g under the mask ... */
263 | and r7, r7, r12
264 | and r10, r10, r12
265 | and r11, r11, r12
266 | and r14, r14, r12
267 | mvn r12, r12
268 | and r0, r0, r12 /* ... or h under the inverted mask (branch-free select) */
269 | and r1, r1, r12
270 | and r2, r2, r12
271 | and r3, r3, r12
272 | and r4, r4, r12
273 | orr r0, r0, r6
274 | orr r1, r1, r7
275 | orr r2, r2, r10
276 | orr r3, r3, r11
277 | orr r4, r4, r14
278 | orr r0, r0, r1, lsl #26 /* repack the five 26-bit limbs into four 32-bit words r0-r3 */
279 | lsr r1, r1, #6
280 | orr r1, r1, r2, lsl #20
281 | lsr r2, r2, #12
282 | orr r2, r2, r3, lsl #14
283 | lsr r3, r3, #18
284 | orr r3, r3, r4, lsl #8
285 | ldr r6, [r5, #40] /* pad words from state+40..55 */
286 | ldr r7, [r5, #44]
287 | ldr r10, [r5, #48]
288 | ldr r11, [r5, #52]
289 | adds r0, r0, r6 /* 128-bit add of the pad; the final carry is discarded */
290 | adcs r1, r1, r7
291 | adcs r2, r2, r10
292 | adcs r3, r3, r11
293 | stmia r8, {r0-r3} /* store the 16-byte mac; NOTE(review): word store to the caller's pointer - confirm alignment expectations */
294 | mov r12, r5
295 | eor r0, r0, r0 /* zero r0-r7, then overwrite 64 bytes of state with zeros */
296 | eor r1, r1, r1
297 | eor r2, r2, r2
298 | eor r3, r3, r3
299 | eor r4, r4, r4
300 | eor r5, r5, r5
301 | eor r6, r6, r6
302 | eor r7, r7, r7
303 | stmia r12!, {r0-r7}
304 | stmia r12, {r0-r7}
305 | add sp, sp, #16
306 | ldmfd sp!, {r4-r11, lr}
307 | bx lr
308 | FN_END poly1305_finish_ext_armv6
309 |
310 | GLOBAL_HIDDEN_FN poly1305_auth_armv6
311 | poly1305_auth_armv6_local: /* one-shot auth: r0 = mac output, r1 = input, r2 = input length, r3 = key. Runs init, blocks and finish on a state kept on the stack */
312 | stmfd sp!, {r4-r8, lr}
313 | mov r8, sp /* r8 = original sp, restored before returning */
314 | and sp, sp, #(~63) /* round sp down to a 64-byte boundary */
315 | sub sp, sp, #64 /* 64-byte state at sp */
316 | mov r4, r0 /* r4 = mac pointer */
317 | mov r5, r1 /* r5 = input pointer */
318 | mov r6, r2 /* r6 = bytes left */
319 | mov r7, r3 /* r7 = key pointer */
320 | mov r0, sp
321 | mov r1, r7
322 | bl poly1305_init_ext_armv6_local
323 | ands r2, r6, #(~15) /* r2 = length rounded down to a multiple of 16 */
324 | beq poly1305_auth_armv6_noblocks /* no full block: skip the blocks call */
325 | mov r0, sp
326 | mov r1, r5
327 | add r5, r5, r2 /* advance past the full blocks */
328 | sub r6, r6, r2 /* r6 = leftover byte count, 0..15 */
329 | bl poly1305_blocks_armv6_local
330 | poly1305_auth_armv6_noblocks:
331 | mov r0, sp
332 | mov r1, r5
333 | mov r2, r6
334 | mov r3, r4
335 | bl poly1305_finish_ext_armv6_local /* absorbs the leftover bytes and writes the mac */
336 | mov sp, r8
337 | ldmfd sp!, {r4-r8, lr}
338 | bx lr
339 | FN_END poly1305_auth_armv6
340 |
341 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_avx-64.inc:
--------------------------------------------------------------------------------
1 | SECTION_TEXT
2 |
3 | GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0
4 | movl $32, %eax /* takes no input; returns the constant 32 in eax */
5 | ret
6 | FN_END poly1305_block_size_avx
7 |
8 | GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1
9 | poly1305_init_ext_avx_local: /* init: rdi = state, rsi = 32-byte key, rdx = bytes hint. Stores clamped r as 26-bit words at state+40, the pad at state+104, and depending on the hint r^2 at state+60 and r^4 at state+80 */
10 | pushq %r15
11 | pushq %r14
12 | pushq %r13
13 | pushq %r12
14 | pushq %rbp
15 | pushq %rbx
16 | movq %rdi, %rbp /* rbp keeps the state pointer; rdi is reused as scratch in the squaring code */
17 | testq %rdx, %rdx
18 | movq $-1, %rax
19 | cmovne %rdx, %rax /* rax = hint, or all ones when the hint is 0 */
20 | movq %rax, -16(%rsp) /* hint kept in scratch space below the stack pointer */
21 | vpxor %xmm0, %xmm0, %xmm0
22 | vmovdqu %xmm0, (%rdi) /* zero state bytes 0..47 */
23 | vmovdqu %xmm0, 16(%rdi)
24 | vmovdqu %xmm0, 32(%rdi)
25 | movq (%rsi), %r9 /* r9, r8 = first 16 key bytes */
26 | movq 8(%rsi), %r8
27 | movabsq $17575274610687, %r10 /* 0xffc0fffffff: mask for the low limb (bits 0..43) */
28 | andq %r9, %r10
29 | shrq $44, %r9
30 | movq %r8, %rax
31 | salq $20, %rax
32 | orq %rax, %r9
33 | movabsq $17592181915647, %rax /* 0xfffffc0ffff: mask for the middle limb (bits 44..87) */
34 | andq %rax, %r9
35 | shrq $24, %r8
36 | movabsq $68719475727, %rax /* 0xffffffc0f: mask for the top limb (bits 88 and up) */
37 | andq %rax, %r8
38 | leaq 40(%rdi), %r15
39 | movl %r10d, %eax /* from here: repack the limbs r10, r9, r8 as five 26-bit words at state+40..59 */
40 | andl $67108863, %eax
41 | movl %eax, 40(%rdi)
42 | movl %r9d, %edx
43 | sall $18, %edx
44 | movq %r10, %rax
45 | shrq $26, %rax
46 | orl %edx, %eax
47 | andl $67108863, %eax
48 | movl %eax, 44(%rdi)
49 | movq %r9, %rax
50 | shrq $8, %rax
51 | andl $67108863, %eax
52 | movl %eax, 48(%rdi)
53 | movq %r9, %rax
54 | shrq $34, %rax
55 | movl %r8d, %edx
56 | sall $10, %edx
57 | orl %edx, %eax
58 | andl $67108863, %eax
59 | movl %eax, 52(%rdi)
60 | movq %r8, %rax
61 | shrq $16, %rax
62 | movl %eax, 56(%rdi)
63 | movq 16(%rsi), %rax /* copy the last 16 key bytes (pad) to state+104..119 */
64 | movq %rax, 104(%rdi)
65 | movq 24(%rsi), %rax
66 | movq %rax, 112(%rdi)
67 | movl $0, %ebx /* rbx = pass counter, 0 then 1 */
68 | .L7:
69 | testq %rbx, %rbx
70 | jne .L4
71 | leaq 60(%rbp), %r15 /* pass 0 writes its result (r squared) at state+60 */
72 | cmpq $16, -16(%rsp)
73 | ja .L6 /* done only when hint > 16 (unsigned) */
74 | jmp .L5
75 | .L4:
76 | cmpq $1, %rbx
77 | jne .L6
78 | leaq 80(%rbp), %r15 /* pass 1 squares again (r to the 4th) and writes at state+80 */
79 | cmpq $95, -16(%rsp)
80 | jbe .L5 /* skipped when hint <= 95 (unsigned) */
81 | .L6: /* square the value held in r10 (low), r9, r8 (top), folding overflow back times 5 */
82 | leaq (%r8,%r8,4), %rsi
83 | salq $2, %rsi /* rsi = 20 * top limb */
84 | leaq (%r9,%r9), %rdi
85 | movq %rdi, %rax
86 | mulq %rsi
87 | movq %rax, %r13
88 | movq %rdx, %r14
89 | movq %r10, %rax
90 | mulq %r10
91 | addq %r13, %rax /* rdx:rax = d0 = low*low + 2*mid*(20*top) */
92 | adcq %r14, %rdx
93 | movabsq $17592186044415, %rcx /* 2^44 - 1 */
94 | movq %rax, -72(%rsp)
95 | movq %rdx, -64(%rsp)
96 | andq -72(%rsp), %rcx /* rcx = low 44 bits of d0 */
97 | leaq (%r10,%r10), %r11
98 | movq %r11, %rax
99 | mulq %r9
100 | movq %rax, %r11
101 | movq %rdx, %r12
102 | movq %rsi, %rax
103 | mulq %r8
104 | movq %rax, %r13
105 | movq %rdx, %r14
106 | addq %r11, %r13 /* r14:r13 = d1 = 2*low*mid + 20*top*top */
107 | adcq %r12, %r14
108 | movq -72(%rsp), %rax
109 | movq -64(%rsp), %rdx
110 | shrdq $44, %rdx, %rax /* carry out of d0 */
111 | movq %rax, -56(%rsp)
112 | movq $0, -48(%rsp)
113 | addq -56(%rsp), %r13
114 | adcq -48(%rsp), %r14
115 | movabsq $17592186044415, %rsi
116 | andq %r13, %rsi /* rsi = low 44 bits of d1 */
117 | leaq (%r8,%r8), %rdi
118 | movq %rdi, %rax
119 | mulq %r10
120 | movq %rax, %r11
121 | movq %rdx, %r12
122 | movq %r9, %rax
123 | mulq %r9
124 | addq %r11, %rax /* rdx:rax = d2 = 2*top*low + mid*mid */
125 | adcq %r12, %rdx
126 | shrdq $44, %r14, %r13 /* carry out of d1 */
127 | movq %r13, -40(%rsp)
128 | movq $0, -32(%rsp)
129 | addq -40(%rsp), %rax
130 | adcq -32(%rsp), %rdx
131 | movabsq $4398046511103, %rdi /* 2^42 - 1 */
132 | andq %rax, %rdi /* rdi = low 42 bits of d2 */
133 | shrdq $42, %rdx, %rax
134 | leaq (%rax,%rax,4), %r8 /* carry out of d2 times 5 goes back into the low limb */
135 | addq %rcx, %r8
136 | movabsq $17592186044415, %r10
137 | andq %r8, %r10 /* r10 = new low limb */
138 | shrq $44, %r8
139 | addq %rsi, %r8
140 | movabsq $17592186044415, %r9
141 | andq %r8, %r9 /* r9 = new middle limb */
142 | shrq $44, %r8
143 | addq %rdi, %r8 /* r8 = new top limb */
144 | movl %r10d, %eax /* from here: store the squared value as five 26-bit words at (r15) */
145 | andl $67108863, %eax
146 | movl %eax, (%r15)
147 | movl %r9d, %edx
148 | sall $18, %edx
149 | movq %r10, %rax
150 | shrq $26, %rax
151 | orl %edx, %eax
152 | andl $67108863, %eax
153 | movl %eax, 4(%r15)
154 | movq %r9, %rax
155 | shrq $8, %rax
156 | andl $67108863, %eax
157 | movl %eax, 8(%r15)
158 | movl %r8d, %edx
159 | sall $10, %edx
160 | movq %r9, %rax
161 | shrq $34, %rax
162 | orl %edx, %eax
163 | andl $67108863, %eax
164 | movl %eax, 12(%r15)
165 | movq %r8, %rax
166 | shrq $16, %rax
167 | movl %eax, 16(%r15)
168 | addq $1, %rbx
169 | cmpq $2, %rbx
170 | jne .L7 /* at most two passes */
171 | .L5:
172 | movq $0, 120(%rbp) /* clear the qword at state+120, which poly1305_blocks_avx reads as its flags */
173 | popq %rbx
174 | popq %rbp
175 | popq %r12
176 | popq %r13
177 | popq %r14
178 | popq %r15
179 | ret
180 | FN_END poly1305_init_ext_avx
181 |
182 |
183 |
184 | GLOBAL_HIDDEN_FN poly1305_blocks_avx
185 | poly1305_blocks_avx_local:
186 | pushq %rbp
187 | movq %rsp, %rbp
188 | pushq %rbx
189 | andq $-64, %rsp
190 | subq $200, %rsp
191 | movl $(1 << 24), %eax
192 | movl $((1 << 26) - 1), %r8d
193 | movl $(5), %r9d
194 | vmovd %eax, %xmm1
195 | vmovd %r8d, %xmm0
196 | vmovd %r9d, %xmm2
197 | vpshufd $68, %xmm1, %xmm1
198 | vpshufd $68, %xmm0, %xmm0
199 | vpshufd $68, %xmm2, %xmm2
200 | vmovdqa %xmm1, 152(%rsp)
201 | vmovdqa %xmm2, 184(%rsp)
202 | movq 120(%rdi), %rax
203 | testb $4, %al
204 | je .L12
205 | vpsrldq $8, %xmm1, %xmm1
206 | vmovdqa %xmm1, 152(%rsp)
207 | .L12:
208 | testb $8, %al
209 | je .L13
210 | vpxor %xmm1, %xmm1, %xmm1
211 | vmovdqa %xmm1, 152(%rsp)
212 | .L13:
213 | testb $1, %al
214 | jne .L14
215 | vmovq (%rsi), %xmm1
216 | vpinsrq $1, 16(%rsi), %xmm1, %xmm1
217 | vmovq 8(%rsi), %xmm3
218 | vpinsrq $1, 24(%rsi), %xmm3, %xmm2
219 | vpand %xmm0, %xmm1, %xmm7
220 | vpsrlq $26, %xmm1, %xmm12
221 | vpand %xmm0, %xmm12, %xmm12
222 | vpsllq $12, %xmm2, %xmm3
223 | vpsrlq $52, %xmm1, %xmm1
224 | vpor %xmm3, %xmm1, %xmm6
225 | vpand %xmm0, %xmm6, %xmm3
226 | vpsrlq $26, %xmm6, %xmm6
227 | vpand %xmm0, %xmm6, %xmm6
228 | vpsrlq $40, %xmm2, %xmm2
229 | vpor 152(%rsp), %xmm2, %xmm2
230 | addq $32, %rsi
231 | subq $32, %rdx
232 | orq $1, %rax
233 | movq %rax, 120(%rdi)
234 | jmp .L15
235 | .L14:
236 | vmovdqu (%rdi), %xmm12
237 | vmovdqu 16(%rdi), %xmm6
238 | vmovdqu 32(%rdi), %xmm2
239 | vpshufd $80, %xmm12, %xmm7
240 | vpshufd $250, %xmm12, %xmm12
241 | vpshufd $80, %xmm6, %xmm3
242 | vpshufd $250, %xmm6, %xmm6
243 | vpshufd $80, %xmm2, %xmm2
244 | .L15:
245 | movq 120(%rdi), %rax
246 | testb $48, %al
247 | je .L16
248 | testb $16, %al
249 | je .L17
250 | vmovdqu 40(%rdi), %xmm1
251 | vmovd 56(%rdi), %xmm4
252 | vmovdqu 60(%rdi), %xmm5
253 | vpunpckldq %xmm1, %xmm5, %xmm11
254 | vpunpckhdq %xmm1, %xmm5, %xmm5
255 | vmovd 76(%rdi), %xmm1
256 | vpunpcklqdq %xmm4, %xmm1, %xmm4
257 | jmp .L18
258 | .L17:
259 | movl $(1), %r8d
260 | vmovdqu 40(%rdi), %xmm5
261 | vmovd 56(%rdi), %xmm4
262 | vmovd %r8d, %xmm1
263 | vpunpckldq %xmm1, %xmm5, %xmm11
264 | vpunpckhdq %xmm1, %xmm5, %xmm5
265 | .L18:
266 | vpshufd $80, %xmm11, %xmm1
267 | vpshufd $250, %xmm11, %xmm11
268 | vpshufd $80, %xmm5, %xmm10
269 | vpshufd $250, %xmm5, %xmm5
270 | jmp .L19
271 | .L16:
272 | vmovdqu 60(%rdi), %xmm5
273 | vpshufd $0, %xmm5, %xmm1
274 | vpshufd $85, %xmm5, %xmm11
275 | vpshufd $170, %xmm5, %xmm10
276 | vpshufd $255, %xmm5, %xmm5
277 | vmovd 76(%rdi), %xmm4
278 | vpshufd $0, %xmm4, %xmm4
279 | .L19:
280 | vmovdqa %xmm11, 136(%rsp)
281 | vpmuludq 184(%rsp), %xmm11, %xmm13
282 | vmovdqa %xmm13, 120(%rsp)
283 | vmovdqa %xmm10, 104(%rsp)
284 | vpmuludq 184(%rsp), %xmm10, %xmm13
285 | vmovdqa %xmm13, 88(%rsp)
286 | vmovdqa %xmm5, 72(%rsp)
287 | vpmuludq 184(%rsp), %xmm5, %xmm5
288 | vmovdqa %xmm5, 56(%rsp)
289 | vmovdqa %xmm4, 40(%rsp)
290 | vpmuludq 184(%rsp), %xmm4, %xmm4
291 | vmovdqa %xmm4, 24(%rsp)
292 | cmpq $63, %rdx
293 | jbe .L20
294 | vmovdqu 80(%rdi), %xmm4
295 | vpshufd $0, %xmm4, %xmm5
296 | vmovdqa %xmm5, 8(%rsp)
297 | vpshufd $85, %xmm4, %xmm5
298 | vmovdqa %xmm5, -8(%rsp)
299 | vpshufd $170, %xmm4, %xmm13
300 | vmovdqa %xmm13, -24(%rsp)
301 | vpshufd $255, %xmm4, %xmm4
302 | vmovdqa %xmm4, %xmm10
303 | vmovdqa %xmm4, -40(%rsp)
304 | vmovd 96(%rdi), %xmm4
305 | vpshufd $0, %xmm4, %xmm4
306 | vmovdqa %xmm4, %xmm8
307 | vmovdqa %xmm4, -56(%rsp)
308 | vpmuludq 184(%rsp), %xmm5, %xmm4
309 | vmovdqa %xmm4, -72(%rsp)
310 | vpmuludq 184(%rsp), %xmm13, %xmm4
311 | vmovdqa %xmm4, -88(%rsp)
312 | vpmuludq 184(%rsp), %xmm10, %xmm4
313 | vmovdqa %xmm4, -104(%rsp)
314 | vpmuludq 184(%rsp), %xmm8, %xmm4
315 | vmovdqa %xmm4, -120(%rsp)
316 | leaq 32(%rsi), %rax
317 | movq %rdx, %rcx
318 | vmovdqa %xmm1, 168(%rsp)
319 | jmp .L22
320 | .p2align 6
321 | nop
322 | nop
323 | nop
324 | nop
325 | .L22:
326 | vpmuludq -72(%rsp), %xmm2, %xmm13
327 | vmovdqa -88(%rsp), %xmm5
328 | vpmuludq %xmm5, %xmm6, %xmm4
329 | vpmuludq %xmm5, %xmm2, %xmm11
330 | vmovdqa -104(%rsp), %xmm9
331 | vpmuludq %xmm9, %xmm6, %xmm5
332 | vpmuludq %xmm9, %xmm2, %xmm10
333 | vpaddq %xmm4, %xmm13, %xmm13
334 | vpmuludq %xmm9, %xmm3, %xmm4
335 | vmovdqa -120(%rsp), %xmm8
336 | vpmuludq %xmm8, %xmm2, %xmm9
337 | vpaddq %xmm5, %xmm11, %xmm11
338 | vmovdqa %xmm8, %xmm5
339 | vpmuludq %xmm8, %xmm12, %xmm8
340 | vpmuludq %xmm5, %xmm3, %xmm14
341 | vpaddq %xmm4, %xmm13, %xmm13
342 | vpmuludq %xmm5, %xmm6, %xmm4
343 | vmovdqa 8(%rsp), %xmm15
344 | vpmuludq %xmm15, %xmm6, %xmm5
345 | vpaddq %xmm8, %xmm13, %xmm13
346 | vpmuludq %xmm15, %xmm2, %xmm8
347 | vpaddq %xmm14, %xmm11, %xmm11
348 | vpmuludq %xmm15, %xmm7, %xmm14
349 | vpaddq %xmm4, %xmm10, %xmm10
350 | vpmuludq %xmm15, %xmm12, %xmm4
351 | vpaddq %xmm5, %xmm9, %xmm9
352 | vpmuludq %xmm15, %xmm3, %xmm5
353 | vmovdqa -8(%rsp), %xmm15
354 | vpmuludq %xmm15, %xmm3, %xmm2
355 | vpaddq %xmm14, %xmm13, %xmm13
356 | vpmuludq %xmm15, %xmm6, %xmm6
357 | vpaddq %xmm4, %xmm11, %xmm11
358 | vpmuludq %xmm15, %xmm7, %xmm4
359 | vpaddq %xmm5, %xmm10, %xmm10
360 | vmovq -32(%rax), %xmm5
361 | vpinsrq $1, -16(%rax), %xmm5, %xmm5
362 | vpmuludq %xmm15, %xmm12, %xmm14
363 | vpaddq %xmm2, %xmm9, %xmm9
364 | vmovdqa -24(%rsp), %xmm2
365 | vpmuludq %xmm2, %xmm12, %xmm15
366 | vpaddq %xmm6, %xmm8, %xmm8
367 | vpmuludq %xmm2, %xmm3, %xmm3
368 | vpaddq %xmm4, %xmm11, %xmm11
369 | vmovq -24(%rax), %xmm4
370 | vpinsrq $1, -8(%rax), %xmm4, %xmm6
371 | vpmuludq %xmm2, %xmm7, %xmm4
372 | vpaddq %xmm14, %xmm10, %xmm10
373 | vmovdqa -40(%rsp), %xmm1
374 | vpmuludq %xmm1, %xmm7, %xmm14
375 | vpaddq %xmm15, %xmm9, %xmm9
376 | vpand %xmm5, %xmm0, %xmm2
377 | vpmuludq %xmm1, %xmm12, %xmm12
378 | vpaddq %xmm3, %xmm8, %xmm8
379 | vpsrlq $26, %xmm5, %xmm3
380 | vpand %xmm3, %xmm0, %xmm3
381 | vpmuludq -56(%rsp), %xmm7, %xmm7
382 | vpaddq %xmm4, %xmm10, %xmm10
383 | vpsllq $12, %xmm6, %xmm15
384 | vpsrlq $52, %xmm5, %xmm4
385 | vpor %xmm15, %xmm4, %xmm4
386 | vpaddq %xmm14, %xmm9, %xmm9
387 | vpsrlq $14, %xmm6, %xmm5
388 | vpand %xmm5, %xmm0, %xmm5
389 | vpaddq %xmm12, %xmm8, %xmm8
390 | vpand %xmm4, %xmm0, %xmm4
391 | vpaddq %xmm7, %xmm8, %xmm8
392 | vpsrlq $40, %xmm6, %xmm6
393 | vpor 152(%rsp), %xmm6, %xmm6
394 | vmovdqu (%rax), %xmm12
395 | vmovdqu 16(%rax), %xmm7
396 | vpunpckldq %xmm7, %xmm12, %xmm15
397 | vpunpckhdq %xmm7, %xmm12, %xmm7
398 | vpxor %xmm14, %xmm14, %xmm14
399 | vpunpckldq %xmm14, %xmm15, %xmm12
400 | vpunpckhdq %xmm14, %xmm15, %xmm15
401 | vpunpckldq %xmm14, %xmm7, %xmm14
402 | vpxor %xmm1, %xmm1, %xmm1
403 | vpunpckhdq %xmm1, %xmm7, %xmm7
404 | vpsllq $6, %xmm15, %xmm15
405 | vpsllq $12, %xmm14, %xmm14
406 | vpsllq $18, %xmm7, %xmm7
407 | vpaddq %xmm12, %xmm13, %xmm12
408 | vpaddq %xmm15, %xmm11, %xmm15
409 | vpaddq %xmm14, %xmm10, %xmm14
410 | vpaddq %xmm7, %xmm9, %xmm7
411 | vpaddq 152(%rsp), %xmm8, %xmm8
412 | vpmuludq 120(%rsp), %xmm6, %xmm13
413 | vmovdqa 88(%rsp), %xmm10
414 | vpmuludq %xmm10, %xmm5, %xmm9
415 | vpmuludq %xmm10, %xmm6, %xmm11
416 | vmovdqa 56(%rsp), %xmm1
417 | vpmuludq %xmm1, %xmm5, %xmm10
418 | vpaddq %xmm13, %xmm12, %xmm12
419 | vpmuludq %xmm1, %xmm6, %xmm13
420 | vpaddq %xmm9, %xmm12, %xmm12
421 | vpmuludq %xmm1, %xmm4, %xmm9
422 | vpaddq %xmm11, %xmm15, %xmm15
423 | vmovdqa 24(%rsp), %xmm1
424 | vpmuludq %xmm1, %xmm6, %xmm11
425 | vpaddq %xmm10, %xmm15, %xmm10
426 | vpmuludq %xmm1, %xmm3, %xmm15
427 | vpaddq %xmm13, %xmm14, %xmm14
428 | vpmuludq %xmm1, %xmm4, %xmm13
429 | vpaddq %xmm9, %xmm12, %xmm9
430 | vpmuludq %xmm1, %xmm5, %xmm12
431 | vpaddq %xmm11, %xmm7, %xmm7
432 | vpmuludq 168(%rsp), %xmm5, %xmm11
433 | vpaddq %xmm15, %xmm9, %xmm9
434 | vpmuludq 168(%rsp), %xmm6, %xmm6
435 | vpaddq %xmm13, %xmm10, %xmm10
436 | vpmuludq 168(%rsp), %xmm2, %xmm15
437 | vpaddq %xmm12, %xmm14, %xmm14
438 | vpmuludq 168(%rsp), %xmm3, %xmm13
439 | vpaddq %xmm11, %xmm7, %xmm11
440 | vpmuludq 168(%rsp), %xmm4, %xmm12
441 | vpaddq %xmm6, %xmm8, %xmm6
442 | vmovdqa 136(%rsp), %xmm8
443 | vpmuludq %xmm8, %xmm4, %xmm7
444 | vpaddq %xmm15, %xmm9, %xmm9
445 | vpmuludq %xmm8, %xmm5, %xmm5
446 | vpaddq %xmm13, %xmm10, %xmm10
447 | vpmuludq %xmm8, %xmm2, %xmm15
448 | vpaddq %xmm12, %xmm14, %xmm14
449 | vpmuludq %xmm8, %xmm3, %xmm8
450 | vpaddq %xmm7, %xmm11, %xmm11
451 | vmovdqa 104(%rsp), %xmm7
452 | vpmuludq %xmm7, %xmm3, %xmm13
453 | vpaddq %xmm5, %xmm6, %xmm6
454 | vpmuludq %xmm7, %xmm4, %xmm4
455 | vpaddq %xmm15, %xmm10, %xmm10
456 | vpmuludq %xmm7, %xmm2, %xmm15
457 | vpaddq %xmm8, %xmm14, %xmm14
458 | vmovdqa 72(%rsp), %xmm5
459 | vpmuludq %xmm5, %xmm2, %xmm7
460 | vpaddq %xmm13, %xmm11, %xmm11
461 | vpmuludq %xmm5, %xmm3, %xmm3
462 | vpaddq %xmm4, %xmm6, %xmm6
463 | vpmuludq 40(%rsp), %xmm2, %xmm2
464 | vpaddq %xmm15, %xmm14, %xmm14
465 | vpaddq %xmm7, %xmm11, %xmm11
466 | vpaddq %xmm3, %xmm6, %xmm6
467 | vpaddq %xmm2, %xmm6, %xmm2
468 | vpsrlq $26, %xmm9, %xmm12
469 | vpsrlq $26, %xmm11, %xmm5
470 | vpand %xmm0, %xmm9, %xmm9
471 | vpand %xmm0, %xmm11, %xmm11
472 | vpaddq %xmm12, %xmm10, %xmm10
473 | vpaddq %xmm5, %xmm2, %xmm2
474 | vpsrlq $26, %xmm10, %xmm3
475 | vpsrlq $26, %xmm2, %xmm7
476 | vpand %xmm0, %xmm10, %xmm10
477 | vpand %xmm0, %xmm2, %xmm2
478 | vpaddq %xmm3, %xmm14, %xmm3
479 | vpmuludq 184(%rsp), %xmm7, %xmm7
480 | vpaddq %xmm7, %xmm9, %xmm9
481 | vpsrlq $26, %xmm3, %xmm6
482 | vpsrlq $26, %xmm9, %xmm12
483 | vpand %xmm0, %xmm3, %xmm3
484 | vpand %xmm0, %xmm9, %xmm7
485 | vpaddq %xmm6, %xmm11, %xmm6
486 | vpaddq %xmm12, %xmm10, %xmm12
487 | vpsrlq $26, %xmm6, %xmm8
488 | vpand %xmm0, %xmm6, %xmm6
489 | vpaddq %xmm8, %xmm2, %xmm2
490 | subq $64, %rcx
491 | addq $64, %rax
492 | cmpq $63, %rcx
493 | ja .L22
494 | vmovdqa 168(%rsp), %xmm1
495 | leaq -64(%rdx), %rax
496 | andq $-64, %rax
497 | leaq 64(%rsi,%rax), %rsi
498 | andl $63, %edx
499 | .L20:
500 | cmpq $31, %rdx
501 | jbe .L23
502 | vpmuludq 120(%rsp), %xmm2, %xmm11
503 | vmovdqa 88(%rsp), %xmm4
504 | vpmuludq %xmm4, %xmm6, %xmm0
505 | vpmuludq %xmm4, %xmm2, %xmm10
506 | vmovdqa 56(%rsp), %xmm4
507 | vpmuludq %xmm4, %xmm6, %xmm8
508 | vpmuludq %xmm4, %xmm2, %xmm5
509 | vpaddq %xmm0, %xmm11, %xmm11
510 | vpmuludq %xmm4, %xmm3, %xmm0
511 | vmovdqa 24(%rsp), %xmm13
512 | vpmuludq %xmm13, %xmm2, %xmm4
513 | vpaddq %xmm8, %xmm10, %xmm10
514 | vpmuludq %xmm13, %xmm12, %xmm8
515 | vpmuludq %xmm13, %xmm3, %xmm9
516 | vpaddq %xmm0, %xmm11, %xmm11
517 | vpmuludq %xmm13, %xmm6, %xmm13
518 | vpmuludq %xmm1, %xmm6, %xmm0
519 | vpaddq %xmm8, %xmm11, %xmm8
520 | vpmuludq %xmm1, %xmm2, %xmm2
521 | vpaddq %xmm9, %xmm10, %xmm9
522 | vpmuludq %xmm1, %xmm7, %xmm11
523 | vpaddq %xmm13, %xmm5, %xmm5
524 | vpmuludq %xmm1, %xmm12, %xmm10
525 | vpaddq %xmm0, %xmm4, %xmm0
526 | vpmuludq %xmm1, %xmm3, %xmm1
527 | vmovdqa 136(%rsp), %xmm4
528 | vpmuludq %xmm4, %xmm3, %xmm14
529 | vpaddq %xmm11, %xmm8, %xmm11
530 | vpmuludq %xmm4, %xmm6, %xmm6
531 | vpaddq %xmm10, %xmm9, %xmm9
532 | vpmuludq %xmm4, %xmm7, %xmm15
533 | vpaddq %xmm1, %xmm5, %xmm5
534 | vpmuludq %xmm4, %xmm12, %xmm1
535 | vpaddq %xmm14, %xmm0, %xmm0
536 | vmovdqa 104(%rsp), %xmm4
537 | vpmuludq %xmm4, %xmm12, %xmm8
538 | vpaddq %xmm6, %xmm2, %xmm2
539 | vpmuludq %xmm4, %xmm3, %xmm3
540 | vpaddq %xmm15, %xmm9, %xmm9
541 | vpmuludq %xmm4, %xmm7, %xmm10
542 | vpaddq %xmm1, %xmm5, %xmm1
543 | vmovdqa 72(%rsp), %xmm4
544 | vpmuludq %xmm4, %xmm7, %xmm15
545 | vpaddq %xmm8, %xmm0, %xmm0
546 | vpmuludq %xmm4, %xmm12, %xmm12
547 | vpaddq %xmm3, %xmm2, %xmm2
548 | vpmuludq 40(%rsp), %xmm7, %xmm7
549 | vpaddq %xmm10, %xmm1, %xmm1
550 | vpaddq %xmm15, %xmm0, %xmm0
551 | vpaddq %xmm12, %xmm2, %xmm2
552 | vpaddq %xmm7, %xmm2, %xmm2
553 | movl $((1 << 26) - 1), %r8d
554 | testq %rsi, %rsi
555 | vmovd %r8d, %xmm15
556 | je .L24
557 | vmovdqu (%rsi), %xmm4
558 | vmovdqu 16(%rsi), %xmm3
559 | vpunpckldq %xmm3, %xmm4, %xmm5
560 | vpunpckhdq %xmm3, %xmm4, %xmm3
561 | vpxor %xmm4, %xmm4, %xmm4
562 | vpunpckldq %xmm4, %xmm5, %xmm7
563 | vpunpckhdq %xmm4, %xmm5, %xmm5
564 | vpunpckldq %xmm4, %xmm3, %xmm6
565 | vpunpckhdq %xmm4, %xmm3, %xmm3
566 | vpsllq $6, %xmm5, %xmm5
567 | vpsllq $12, %xmm6, %xmm6
568 | vpsllq $18, %xmm3, %xmm3
569 | vpaddq %xmm7, %xmm11, %xmm11
570 | vpaddq %xmm5, %xmm9, %xmm9
571 | vpaddq %xmm6, %xmm1, %xmm1
572 | vpaddq %xmm3, %xmm0, %xmm0
573 | vpaddq 152(%rsp), %xmm2, %xmm2
574 | .L24:
575 | vpshufd $68, %xmm15, %xmm15
576 | vpsrlq $26, %xmm11, %xmm12
577 | vpsrlq $26, %xmm0, %xmm3
578 | vpand %xmm15, %xmm11, %xmm11
579 | vpand %xmm15, %xmm0, %xmm6
580 | vpaddq %xmm12, %xmm9, %xmm9
581 | vpaddq %xmm3, %xmm2, %xmm2
582 | vpsrlq $26, %xmm9, %xmm3
583 | vpsrlq $26, %xmm2, %xmm7
584 | vpand %xmm15, %xmm9, %xmm9
585 | vpand %xmm15, %xmm2, %xmm2
586 | vpaddq %xmm3, %xmm1, %xmm3
587 | vpmuludq 184(%rsp), %xmm7, %xmm7
588 | vpaddq %xmm7, %xmm11, %xmm7
589 | vpsrlq $26, %xmm3, %xmm4
590 | vpsrlq $26, %xmm7, %xmm1
591 | vpand %xmm15, %xmm3, %xmm3
592 | vpand %xmm15, %xmm7, %xmm7
593 | vpaddq %xmm4, %xmm6, %xmm6
594 | vpaddq %xmm1, %xmm9, %xmm12
595 | vpsrlq $26, %xmm6, %xmm0
596 | vpand %xmm15, %xmm6, %xmm6
597 | vpaddq %xmm0, %xmm2, %xmm2
598 | .L23:
599 | testq %rsi, %rsi
600 | je .L25
601 | vpshufd $8, %xmm7, %xmm7
602 | vpshufd $8, %xmm12, %xmm12
603 | vpshufd $8, %xmm3, %xmm3
604 | vpshufd $8, %xmm6, %xmm6
605 | vpshufd $8, %xmm2, %xmm2
606 | vpunpcklqdq %xmm12, %xmm7, %xmm7
607 | vpunpcklqdq %xmm6, %xmm3, %xmm3
608 | vmovdqu %xmm7, (%rdi)
609 | vmovdqu %xmm3, 16(%rdi)
610 | vmovq %xmm2, 32(%rdi)
611 | jmp .L11
612 | .L25:
613 | vpsrldq $8, %xmm7, %xmm0
614 | vpaddq %xmm0, %xmm7, %xmm7
615 | vpsrldq $8, %xmm12, %xmm0
616 | vpaddq %xmm0, %xmm12, %xmm12
617 | vpsrldq $8, %xmm3, %xmm0
618 | vpaddq %xmm0, %xmm3, %xmm3
619 | vpsrldq $8, %xmm6, %xmm0
620 | vpaddq %xmm0, %xmm6, %xmm6
621 | vpsrldq $8, %xmm2, %xmm0
622 | vpaddq %xmm0, %xmm2, %xmm2
623 | vmovd %xmm7, %eax
624 | vmovd %xmm12, %edx
625 | movl %eax, %r9d
626 | shrl $26, %r9d
627 | addl %edx, %r9d
628 | movl %r9d, %r8d
629 | andl $67108863, %r8d
630 | vmovd %xmm3, %edx
631 | shrl $26, %r9d
632 | addl %edx, %r9d
633 | vmovd %xmm6, %edx
634 | movl %r9d, %ecx
635 | shrl $26, %ecx
636 | addl %edx, %ecx
637 | movl %ecx, %esi
638 | andl $67108863, %esi
639 | vmovd %xmm2, %r10d
640 | movl %r8d, %r11d
641 | salq $26, %r11
642 | andl $67108863, %eax
643 | orq %rax, %r11
644 | movabsq $17592186044415, %rax
645 | andq %rax, %r11
646 | andl $67108863, %r9d
647 | salq $8, %r9
648 | shrl $18, %r8d
649 | movl %r8d, %r8d
650 | orq %r8, %r9
651 | movq %rsi, %rdx
652 | salq $34, %rdx
653 | orq %rdx, %r9
654 | andq %rax, %r9
655 | shrl $26, %ecx
656 | addl %r10d, %ecx
657 | salq $16, %rcx
658 | shrl $10, %esi
659 | movl %esi, %esi
660 | orq %rsi, %rcx
661 | movabsq $4398046511103, %r10
662 | movq %rcx, %r8
663 | andq %r10, %r8
664 | shrq $42, %rcx
665 | leaq (%rcx,%rcx,4), %rdx
666 | addq %r11, %rdx
667 | movq %rdx, %rsi
668 | andq %rax, %rsi
669 | shrq $44, %rdx
670 | addq %r9, %rdx
671 | movq %rdx, %rcx
672 | andq %rax, %rcx
673 | shrq $44, %rdx
674 | addq %r8, %rdx
675 | andq %rdx, %r10
676 | shrq $42, %rdx
677 | leaq (%rsi,%rdx,4), %rsi
678 | leaq (%rsi,%rdx), %r11
679 | movq %r11, %rbx
680 | andq %rax, %rbx
681 | shrq $44, %r11
682 | addq %rcx, %r11
683 | leaq 5(%rbx), %r9
684 | movq %r9, %r8
685 | shrq $44, %r8
686 | addq %r11, %r8
687 | movabsq $-4398046511104, %rsi
688 | addq %r10, %rsi
689 | movq %r8, %rdx
690 | shrq $44, %rdx
691 | addq %rdx, %rsi
692 | movq %rsi, %rdx
693 | shrq $63, %rdx
694 | subq $1, %rdx
695 | movq %rdx, %rcx
696 | notq %rcx
697 | andq %rcx, %rbx
698 | andq %rcx, %r11
699 | andq %r10, %rcx
700 | andq %rax, %r9
701 | andq %rdx, %r9
702 | orq %r9, %rbx
703 | movq %rbx, (%rdi)
704 | andq %r8, %rax
705 | andq %rdx, %rax
706 | orq %rax, %r11
707 | movq %r11, 8(%rdi)
708 | andq %rsi, %rdx
709 | orq %rcx, %rdx
710 | movq %rdx, 16(%rdi)
711 | .L11:
712 | movq -8(%rbp), %rbx
713 | leave
714 | ret
715 | FN_END poly1305_blocks_avx
716 |
717 | GLOBAL_HIDDEN_FN poly1305_finish_ext_avx
718 | poly1305_finish_ext_avx_local: /* Finish a MAC: feed leftover bytes through poly1305_blocks_avx_local, pack the first three state words to 128 bits, add the words at 104/112(state), store 16 bytes to the 4th argument and zero 128 bytes of state. Inputs arrive in rdi, rsi, rdx, rcx; presumably (state, in, remaining, mac) as in poly1305_finish_ext_ref -- TODO confirm against the C prototype. */
719 | pushq %r12
720 | pushq %rbp
721 | pushq %rbx
722 | subq $32, %rsp /* 32-byte scratch block at (%rsp) */
723 | movq %rdi, %rbx /* rbx = state pointer for the rest of the function */
724 | movq %rdx, %rbp /* rbp = leftover byte count */
725 | movq %rcx, %r12 /* r12 = output pointer */
726 | testq %rdx, %rdx
727 | je .L30 /* nothing left over: no padded block to build */
728 | movq $0, (%rsp) /* zero the whole scratch block */
729 | movq $0, 8(%rsp)
730 | movq $0, 16(%rsp)
731 | movq $0, 24(%rsp)
732 | movq %rsp, %rax /* rax = write cursor inside the scratch block */
733 | subq %rsp, %rsi /* rsi = source - scratch, so (%rax,%rsi) is the source byte matching the cursor */
734 | testb $16, %dl /* copy 16/8/4/2/1-byte pieces selected by the bits of the count */
735 | je .L31
736 | vmovdqu (%rsp,%rsi), %xmm0
737 | vmovdqa %xmm0, (%rsp) /* aligned store: requires %rsp to be 16-byte aligned here */
738 | addq $16, %rax
739 | .L31:
740 | testb $8, %bpl
741 | je .L32
742 | movq (%rax,%rsi), %rdx
743 | movq %rdx, (%rax)
744 | addq $8, %rax
745 | .L32:
746 | testb $4, %bpl
747 | je .L33
748 | movl (%rax,%rsi), %edx
749 | movl %edx, (%rax)
750 | addq $4, %rax
751 | .L33:
752 | testb $2, %bpl
753 | je .L34
754 | movzwl (%rax,%rsi), %edx
755 | movw %dx, (%rax)
756 | addq $2, %rax
757 | .L34:
758 | testb $1, %bpl
759 | je .L35
760 | movzbl (%rax,%rsi), %edx
761 | movb %dl, (%rax)
762 | .L35:
763 | cmpq $16, %rbp
764 | je .L36 /* exactly 16 bytes: no terminator byte is written, flag value 4 is chosen at .L36 */
765 | movb $1, (%rsp,%rbp) /* write a 1 byte right after the copied bytes */
766 | movq 120(%rbx), %rdx /* rdx = current flags word at 120(state) */
767 | cmpq $16, %rbp
768 | sbbq %rax, %rax /* rax = -1 if count < 16, else 0 */
769 | andl $4, %eax
770 | addq $4, %rax /* rax = 8 if count < 16, else 4 */
771 | .L37:
772 | orq %rdx, %rax
773 | movq %rax, 120(%rbx) /* store the flags word with the new bit set */
774 | movq %rsp, %rsi
775 | movl $32, %edx
776 | movq %rbx, %rdi
777 | call poly1305_blocks_avx_local /* process the 32-byte scratch block */
778 | .L30:
779 | movq 120(%rbx), %rax
780 | testb $1, %al
781 | je .L38 /* bit 0 clear: skip the extra blocks call below */
782 | subq $1, %rbp
783 | cmpq $15, %rbp
784 | jbe .L39 /* taken when the original count was 1..16 */
785 | orq $16, %rax /* original count was 0 or above 16 */
786 | movq %rax, 120(%rbx)
787 | jmp .L40
788 | .L39:
789 | orq $32, %rax
790 | movq %rax, 120(%rbx)
791 | .L40:
792 | movl $32, %edx
793 | movl $0, %esi /* NULL input pointer for this call */
794 | movq %rbx, %rdi
795 | call poly1305_blocks_avx_local
796 | .L38:
797 | movq 8(%rbx), %rax
798 | movq %rax, %rdx
799 | salq $44, %rdx
800 | orq (%rbx), %rdx /* rdx = word0 | word1 << 44 (low 64 bits) */
801 | shrq $20, %rax
802 | movq 16(%rbx), %rcx
803 | salq $24, %rcx
804 | orq %rcx, %rax /* rax = word1 >> 20 | word2 << 24 (high 64 bits) */
805 | movq 104(%rbx), %rcx
806 | movq 112(%rbx), %rsi
807 | addq %rcx, %rdx
808 | adcq %rsi, %rax /* 128-bit add of the words at 104/112(state); the carry out is dropped */
809 | vpxor %xmm0, %xmm0, %xmm0
810 | vmovdqu %xmm0, (%rbx) /* zero 128 bytes of state, 16 bytes per store */
811 | vmovdqu %xmm0, 16(%rbx)
812 | vmovdqu %xmm0, 32(%rbx)
813 | vmovdqu %xmm0, 48(%rbx)
814 | vmovdqu %xmm0, 64(%rbx)
815 | vmovdqu %xmm0, 80(%rbx)
816 | vmovdqu %xmm0, 96(%rbx)
817 | vmovdqu %xmm0, 112(%rbx)
818 | movq %rdx, (%r12) /* write the 16-byte result */
819 | movq %rax, 8(%r12)
820 | jmp .L43
821 | .L36:
822 | movq 120(%rbx), %rdx /* count == 16: same as the path above but with flag value 4 and no terminator */
823 | movl $4, %eax
824 | jmp .L37
825 | .L43:
826 | addq $32, %rsp
827 | popq %rbx
828 | popq %rbp
829 | popq %r12
830 | ret
831 | FN_END poly1305_finish_ext_avx
832 |
833 | GLOBAL_HIDDEN_FN poly1305_auth_avx
834 | cmp $128, %rdx /* One-shot MAC. Inputs arrive in rdi, rsi, rdx, rcx; presumably (mac, in, inlen, key) as in poly1305_auth_ref -- TODO confirm. */
835 | jb poly1305_auth_x86_local /* fewer than 128 bytes: tail-jump to the x86 routine with the arguments untouched */
836 | pushq %rbp
837 | movq %rsp, %rbp
838 | pushq %r14
839 | pushq %r13
840 | pushq %r12
841 | pushq %rbx
842 | andq $-64, %rsp /* 64-byte align the stack, then reserve 128 bytes used as the state */
843 | addq $-128, %rsp
844 | movq %rdi, %r14 /* r14 = 1st argument (output pointer passed to finish) */
845 | movq %rsi, %r12 /* r12 = input cursor */
846 | movq %rdx, %rbx /* rbx = bytes remaining */
847 | movq %rsp, %rdi
848 | movq %rcx, %rsi
849 | call poly1305_init_ext_avx_local /* init(state = rsp, 4th argument); rdx still holds the full length */
850 | movq %rbx, %r13
851 | andq $-32, %r13 /* r13 = length rounded down to a multiple of 32 */
852 | je .L46
853 | movq %rsp, %rdi
854 | movq %r13, %rdx
855 | movq %r12, %rsi
856 | call poly1305_blocks_avx_local
857 | addq %r13, %r12 /* advance past the bytes just processed */
858 | subq %r13, %rbx
859 | .L46:
860 | movq %rsp, %rdi
861 | movq %r14, %rcx
862 | movq %rbx, %rdx
863 | movq %r12, %rsi
864 | call poly1305_finish_ext_avx_local /* finish(state, tail pointer, tail length, output) */
865 | leaq -32(%rbp), %rsp /* back to just below the four saved registers */
866 | popq %rbx
867 | popq %r12
868 | popq %r13
869 | popq %r14
870 | popq %rbp
871 | ret
872 | FN_END poly1305_auth_avx
873 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_constants_x86.inc:
--------------------------------------------------------------------------------
1 | SECTION_RODATA
2 |
3 | .p2align 4
4 | poly1305_constants_x86: /* table of 8-byte entries (two .long words each); the leading comment on each row is its byte offset from this label */
5 | /* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000
6 | /* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000
7 | /* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000
8 | /* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000
9 | /* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000
10 | /* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000
11 | /* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000
12 | /* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000
13 | /* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000
14 | /* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000
15 | /* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000
16 | /* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000
17 | /* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe
18 | /* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001
19 | /* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001
20 | /* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001
21 | /* 128 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003
22 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_neon-32.inc:
--------------------------------------------------------------------------------
1 | SECTION_TEXT
2 |
3 | .arm
4 | .fpu neon
5 |
6 | GLOBAL_HIDDEN_FN poly1305_block_size_neon
7 | mov r0, #32 /* returns the constant 32 in r0 */
8 | bx lr
9 | FN_END poly1305_block_size_neon
10 |
11 | .p2align 2
12 | poly1305_init_constants_neon: /* four AND masks loaded by poly1305_init_ext_neon and applied to key limbs 1..4 */
13 | .long 0x3ffff03
14 | .long 0x3ffc0ff
15 | .long 0x3f03fff
16 | .long 0x00fffff
17 |
18 | GLOBAL_HIDDEN_FN poly1305_init_ext_neon
19 | poly1305_init_ext_neon_local: /* Initialise the state at r0 from the key at r1; r2 is a byte-count hint (0 = unknown/unlimited) that bounds how many squarings the loop below performs. */
20 | stmfd sp!, {r4-r11, lr}
21 | sub sp, sp, #32 /* scratch: 64-bit partial products at sp+0..23, squaring counter at sp+24 */
22 | mov r14, r2 /* r14 = hint */
23 | ands r2, r2, r2 /* must be the flag-setting form: the moveq below tests Z for hint == 0 */
24 | moveq r14, #-1 /* hint 0 becomes 0xffffffff so no squaring is skipped */
25 | ldmia r1!, {r2-r5} /* first 16 key bytes as four words; r1 now points at key byte 16 */
26 | ldr r7, =poly1305_init_constants_neon
27 | mov r6, r2
28 | mov r8, r2, lsr #26 /* r8..r11 = the 128-bit value shifted right by 26, 52, 78, 104 bits */
29 | mov r9, r3, lsr #20
30 | mov r10, r4, lsr #14
31 | mov r11, r5, lsr #8
32 | orr r8, r8, r3, lsl #6
33 | orr r9, r9, r4, lsl #12
34 | orr r10, r10, r5, lsl #18
35 | ldmia r7, {r2-r5} /* the four limb masks */
36 | and r2, r2, r8 /* r2..r5 = masked limbs 1..4 */
37 | and r3, r3, r9
38 | and r4, r4, r10
39 | and r5, r5, r11
40 | and r6, r6, 0x3ffffff /* r6 = limb 0 */
41 | stmia r0!, {r2-r6} /* 20 bytes stored in the order limb1, limb2, limb3, limb4, limb0 */
42 | eor r8, r8, r8
43 | str r8, [sp, #24] /* squaring counter = 0 */
44 | poly1305_init_ext_neon_squareloop: /* Squares the value held in r2..r6 (limbs 1,2,3,4,0) modulo 2^130-5 and appends the result to the state; runs at most twice. */
45 | ldr r8, [sp, #24]
46 | mov r12, #16 /* threshold for the first squaring */
47 | cmp r8, #2
48 | beq poly1305_init_ext_neon_donesquaring /* two squarings already done */
49 | cmp r8, #1
50 | moveq r12, #64 /* threshold for the second squaring */
51 | cmp r14, r12
52 | bls poly1305_init_ext_neon_donesquaring /* hint <= threshold: stop here */
53 | add r8, #1
54 | str r8, [sp, #24]
55 | mov r6, r6, lsl #1 /* r6 = 2*limb0 */
56 | mov r2, r2, lsl #1 /* r2 = 2*limb1 */
57 | umull r7, r8, r3, r3
58 | umull r9, r10, r6, r4
59 | umlal r7, r8, r6, r5
60 | umlal r9, r10, r2, r3
61 | add r11, r5, r5, lsl #2 /* r11 = 5*limb4 */
62 | umlal r7, r8, r2, r4 /* r8:r7 = product column 4 */
63 | umlal r9, r10, r5, r11 /* r10:r9 = product column 3 */
64 | str r7, [sp, #16]
65 | str r8, [sp, #20]
66 | mov r2, r2, lsr #1 /* r2 back to limb1 */
67 | mov r5, r5, lsl #1 /* r5 = 2*limb4 */
68 | str r9, [sp, #8]
69 | str r10, [sp, #12]
70 | umull r7, r8, r2, r2
71 | umull r9, r10, r6, r2
72 | add r11, r3, r3, lsl #2 /* r11 = 5*limb2 */
73 | add r12, r4, r4, lsl #2 /* r12 = 5*limb3 */
74 | umlal r7, r8, r6, r3
75 | umlal r9, r10, r5, r11
76 | umlal r7, r8, r5, r12 /* r8:r7 = product column 2 */
77 | umlal r9, r10, r4, r12 /* r10:r9 = product column 1 */
78 | mov r6, r6, lsr #1 /* r6 back to limb0 */
79 | mov r3, r3, lsl #1 /* r3 = 2*limb2 */
80 | add r11, r2, r2, lsl #2 /* r11 = 5*limb1 */
81 | str r7, [sp, #0]
82 | str r8, [sp, #4]
83 | umull r7, r8, r6, r6
84 | umlal r7, r8, r3, r12
85 | umlal r7, r8, r5, r11 /* r8:r7 = product column 0 */
86 | and r6, r7, 0x3ffffff /* carry chain: keep 26 bits per column, push the rest into the next column */
87 | mov r11, r7, lsr #26
88 | orr r11, r11, r8, lsl #6
89 | ldr r7, [sp, #0]
90 | ldr r8, [sp, #4]
91 | adds r9, r9, r11
92 | adc r10, r10, #0
93 | and r2, r9, 0x3ffffff
94 | mov r11, r9, lsr #26
95 | orr r11, r11, r10, lsl #6
96 | ldr r9, [sp, #8]
97 | ldr r10, [sp, #12]
98 | adds r7, r7, r11
99 | adc r8, r8, #0
100 | and r3, r7, 0x3ffffff
101 | mov r11, r7, lsr #26
102 | orr r11, r11, r8, lsl #6
103 | ldr r7, [sp, #16]
104 | ldr r8, [sp, #20]
105 | adds r9, r9, r11
106 | adc r10, r10, #0
107 | and r4, r9, 0x3ffffff
108 | mov r11, r9, lsr #26
109 | orr r11, r11, r10, lsl #6
110 | adds r7, r7, r11
111 | adc r8, r8, #0
112 | and r5, r7, 0x3ffffff
113 | mov r11, r7, lsr #26
114 | orr r11, r11, r8, lsl #6
115 | add r11, r11, r11, lsl #2 /* the carry out of column 4 wraps into column 0 multiplied by 5 */
116 | add r6, r6, r11
117 | mov r11, r6, lsr #26
118 | and r6, r6, 0x3ffffff
119 | add r2, r2, r11
120 | stmia r0!, {r2-r6} /* append the squared value (same limb order as the first store) */
121 | b poly1305_init_ext_neon_squareloop
122 | poly1305_init_ext_neon_donesquaring: /* Skip the slots of squarings that were not performed, then write 40 zero bytes, the second 16 key bytes and a zero word. */
123 | mov r2, #2
124 | ldr r14, [sp, #24]
125 | sub r14, r2, r14 /* r14 = number of squarings skipped (0..2) */
126 | mov r3, r14, lsl #4
127 | add r3, r3, r14, lsl #2 /* r3 = 20 * r14 */
128 | add r0, r0, r3 /* step over the unused 20-byte slots; they are left unwritten */
129 | eor r2, r2, r2
130 | eor r3, r3, r3
131 | eor r4, r4, r4
132 | eor r5, r5, r5
133 | eor r6, r6, r6
134 | stmia r0!, {r2-r6} /* 2 x 20 zero bytes */
135 | stmia r0!, {r2-r6}
136 | ldmia r1!, {r2-r5} /* key bytes 16..31 */
137 | stmia r0, {r2-r6} /* four key words followed by r6 = 0 */
138 | add sp, sp, #32
139 | ldmfd sp!, {r4-r11, lr}
140 | bx lr
141 | FN_END poly1305_init_ext_neon
142 |
143 | .ltorg
144 |
145 | GLOBAL_HIDDEN_FN poly1305_blocks_neon
146 | poly1305_blocks_neon_local: /* Entry: r0 = state, r1 = input pointer (tested against zero further down), r2 = byte count. This part saves registers, builds an aligned frame, and loads the accumulator and multipliers selected by the flags word at state+116. */
147 | vmov.i32 q0, #0xffffffff
148 | vmov.i32 d4, #1
149 | vsubw.u32 q0, q0, d4
150 | vstmdb sp!, {q4,q5,q6,q7}
151 | stmfd sp!, {r4-r11, lr}
152 | mov r8, sp /* r8 = stack pointer to restore on exit */
153 | and sp, sp, #~63
154 | sub sp, sp, #192 /* 64-byte aligned frame of 192 bytes */
155 | str r0, [sp, #108] /* frame: +108 state, +112 input, +116 count, +120 saved sp */
156 | str r1, [sp, #112]
157 | str r2, [sp, #116]
158 | str r8, [sp, #120]
159 | mov r3, r0 /* rotate arguments: r0 = input, r1 = count, r2 = state */
160 | mov r0, r1
161 | mov r1, r2
162 | mov r2, r3
163 | ldr r8, [r2, #116] /* r8 = flags word */
164 | veor d15, d15, d15
165 | vorr.i32 d15, #(1 << 24) /* d15 = 1 << 24 in both 32-bit lanes */
166 | tst r8, #2
167 | beq poly1305_blocks_neon_skip_shift8
168 | vshr.u64 d15, #32 /* flag 2: keep the 1 << 24 in the low lane only */
169 | poly1305_blocks_neon_skip_shift8:
170 | tst r8, #4
171 | beq poly1305_blocks_neon_skip_shift16
172 | veor d15, d15, d15 /* flag 4: no high bit in either lane */
173 | poly1305_blocks_neon_skip_shift16:
174 | vst1.64 d15, [sp, :64] /* keep the two high-bit words at sp+0 / sp+4 for the integer code */
175 | tst r8, #1
176 | bne poly1305_blocks_neon_started
177 | vld1.64 {q0-q1}, [r0]! /* flag 1 clear: the first 32 input bytes become the accumulator */
178 | vswp d1, d2
179 | vmovn.i64 d21, q0
180 | vshrn.i64 d22, q0, #26
181 | vshrn.u64 d24, q1, #14
182 | vext.8 d0, d0, d2, #4
183 | vext.8 d1, d1, d3, #4
184 | vshr.u64 q1, q1, #32
185 | vshrn.i64 d23, q0, #20
186 | vshrn.u64 d25, q1, #8
187 | vand.i32 d21, #0x03ffffff
188 | vand.i32 q11, #0x03ffffff
189 | vand.i32 q12, #0x03ffffff
190 | orr r8, r8, #1
191 | sub r1, r1, #32
192 | str r8, [r2, #116] /* record flag 1 in the state */
193 | vorr d25, d25, d15 /* OR the high-bit words into the top limb */
194 | b poly1305_blocks_neon_setupr20
195 | poly1305_blocks_neon_started:
196 | add r9, r2, #60
197 | vldm r9, {d21-d25} /* accumulator saved at state+60 by an earlier call */
198 | poly1305_blocks_neon_setupr20:
199 | vmov.i32 d0, #5
200 | tst r8, #(8|16)
201 | beq poly1305_blocks_neon_setupr20_simple
202 | tst r8, #(8)
203 | beq poly1305_blocks_neon_setupr20_r_1
204 | mov r9, r2 /* flag 8: interleave the words at state+20 (low lanes) with the words at state+0 (high lanes) */
205 | add r10, r2, #20
206 | vld1.64 {q9}, [r9]!
207 | vld1.64 {q8}, [r10]!
208 | vld1.64 {d2}, [r9]
209 | vld1.64 {d20}, [r10]
210 | b poly1305_blocks_neon_setupr20_hard
211 | poly1305_blocks_neon_setupr20_r_1:
212 | mov r9, r2 /* flag 16 without flag 8: words at state+0 in the low lanes, high lanes 0 except one lane set to 1 via d2 */
213 | vmov.i32 d2, #1
214 | vld1.64 {q8}, [r9]!
215 | veor q9, q9, q9
216 | vshr.u64 d2, d2, #32
217 | vld1.64 {d20}, [r9]
218 | poly1305_blocks_neon_setupr20_hard:
219 | vzip.i32 q8, q9
220 | vzip.i32 d20, d2
221 | b poly1305_blocks_neon_setups20
222 | poly1305_blocks_neon_setupr20_simple:
223 | add r9, r2, #20 /* default: broadcast each of the five words at state+20 to both lanes of d16..d20 */
224 | vld1.64 {d2-d4}, [r9]
225 | vdup.32 d16, d2[0]
226 | vdup.32 d17, d2[1]
227 | vdup.32 d18, d3[0]
228 | vdup.32 d19, d3[1]
229 | vdup.32 d20, d4[0]
230 | poly1305_blocks_neon_setups20:
231 | vmul.i32 q13, q8, d0[0] /* q13/q14 = 5 * q8/q9 */
232 | vmov.i64 q15, 0x00000000ffffffff
233 | vmul.i32 q14, q9, d0[0]
234 | vshr.u64 q15, q15, #6 /* q15 = 0x3ffffff in each 64-bit lane (26-bit limb mask) */
235 | cmp r1, #64
236 | blo poly1305_blocks_neon_try32
237 | add r9, sp, #16
238 | add r10, r2, #40
239 | add r11, sp, #64
240 | str r1, [sp, #116] /* the loop keeps its byte counter in the frame because r1 is reused as a temporary */
241 | vld1.64 {d10-d12}, [r10] /* words at state+40 for the first half of each loop iteration */
242 | vmov d14, d12
243 | vmul.i32 q6, q5, d0[0] /* q6 = 5 * q5 */
244 | poly1305_blocks_neon_mainloop: /* One iteration consumes 64 input bytes. NEON multiply-accumulates are interleaved with ARM integer code that splits the next 32 bytes into 26-bit limbs; statement order is significant. */
245 | ldmia r0!, {r2-r5} /* 16 input bytes for the integer side */
246 | vmull.u32 q0, d25, d12[0] /* first half: q0..q4 = accumulator (d21..d25) times the multiplier loaded from state+40 */
247 | mov r7, r2, lsr #26
248 | vmlal.u32 q0, d24, d12[1]
249 | mov r8, r3, lsr #20
250 | ldr r6, [sp, #0] /* high-bit word for the low lane */
251 | vmlal.u32 q0, d23, d13[0]
252 | mov r9, r4, lsr #14
253 | vmlal.u32 q0, d22, d13[1]
254 | orr r6, r6, r5, lsr #8
255 | vmlal.u32 q0, d21, d14[0]
256 | orr r3, r7, r3, lsl #6
257 | vmull.u32 q1, d25, d12[1]
258 | orr r4, r8, r4, lsl #12
259 | orr r5, r9, r5, lsl #18
260 | vmlal.u32 q1, d24, d13[0]
261 | ldmia r0!, {r7-r10} /* next 16 input bytes */
262 | vmlal.u32 q1, d23, d13[1]
263 | mov r1, r7, lsr #26
264 | vmlal.u32 q1, d22, d14[0]
265 | ldr r11, [sp, #4] /* high-bit word for the high lane */
266 | mov r12, r8, lsr #20
267 | vmlal.u32 q1, d21, d10[0]
268 | mov r14, r9, lsr #14
269 | vmull.u32 q2, d25, d13[0]
270 | orr r11, r11, r10, lsr #8
271 | orr r8, r1, r8, lsl #6
272 | vmlal.u32 q2, d24, d13[1]
273 | orr r9, r12, r9, lsl #12
274 | vmlal.u32 q2, d23, d14[0]
275 | orr r10, r14, r10, lsl #18
276 | vmlal.u32 q2, d22, d10[0]
277 | mov r12, r3
278 | and r2, r2, #0x3ffffff
279 | vmlal.u32 q2, d21, d10[1]
280 | mov r14, r5
281 | vmull.u32 q3, d25, d13[1]
282 | and r3, r7, #0x3ffffff
283 | vmlal.u32 q3, d24, d14[0]
284 | and r5, r8, #0x3ffffff
285 | vmlal.u32 q3, d23, d10[0]
286 | and r7, r9, #0x3ffffff
287 | vmlal.u32 q3, d22, d10[1]
288 | and r8, r14, #0x3ffffff
289 | vmlal.u32 q3, d21, d11[0]
290 | and r9, r10, #0x3ffffff
291 | add r14, sp, #128
292 | vmull.u32 q4, d25, d14[0]
293 | mov r10, r6
294 | vmlal.u32 q4, d24, d10[0]
295 | and r6, r4, #0x3ffffff
296 | vmlal.u32 q4, d23, d10[1]
297 | and r4, r12, #0x3ffffff
298 | vmlal.u32 q4, d22, d11[0]
299 | stm r14, {r2-r11} /* ten limbs (two blocks, interleaved) spilled to sp+128 */
300 | vmlal.u32 q4, d21, d11[1]
301 | vld1.64 {d21-d24}, [r14, :256]! /* reload them as vectors; d21..d25 now hold message limbs, not the accumulator */
302 | vld1.64 {d25}, [r14, :64]
303 | ldmia r0!, {r2-r5} /* second half: same limb splitting for the following 32 bytes */
304 | vmlal.u32 q0, d25, d26 /* q0..q4 += message limbs times the multipliers in q8..q10 and their 5x copies in q13/q14 */
305 | mov r7, r2, lsr #26
306 | vmlal.u32 q0, d24, d27
307 | ldr r6, [sp, #0]
308 | mov r8, r3, lsr #20
309 | vmlal.u32 q0, d23, d28
310 | mov r9, r4, lsr #14
311 | vmlal.u32 q0, d22, d29
312 | orr r6, r6, r5, lsr #8
313 | vmlal.u32 q0, d21, d20
314 | orr r3, r7, r3, lsl #6
315 | vmlal.u32 q1, d25, d27
316 | orr r4, r8, r4, lsl #12
317 | orr r5, r9, r5, lsl #18
318 | vmlal.u32 q1, d24, d28
319 | ldmia r0!, {r7-r10}
320 | vmlal.u32 q1, d23, d29
321 | mov r1, r7, lsr #26
322 | vmlal.u32 q1, d22, d20
323 | ldr r11, [sp, #4]
324 | mov r12, r8, lsr #20
325 | vmlal.u32 q1, d21, d16
326 | mov r14, r9, lsr #14
327 | vmlal.u32 q2, d25, d28
328 | orr r11, r11, r10, lsr #8
329 | orr r8, r1, r8, lsl #6
330 | orr r9, r12, r9, lsl #12
331 | vmlal.u32 q2, d24, d29
332 | orr r10, r14, r10, lsl #18
333 | and r2, r2, #0x3ffffff
334 | mov r12, r3
335 | vmlal.u32 q2, d23, d20
336 | mov r14, r5
337 | vmlal.u32 q2, d22, d16
338 | and r3, r7, #0x3ffffff
339 | vmlal.u32 q2, d21, d17
340 | and r5, r8, #0x3ffffff
341 | vmlal.u32 q3, d25, d29
342 | and r7, r9, #0x3ffffff
343 | vmlal.u32 q3, d24, d20
344 | and r8, r14, #0x3ffffff
345 | vmlal.u32 q3, d23, d16
346 | and r9, r10, #0x3ffffff
347 | vmlal.u32 q3, d22, d17
348 | add r14, sp, #128
349 | vmlal.u32 q3, d21, d18
350 | mov r10, r6
351 | vmlal.u32 q4, d25, d20
352 | vmlal.u32 q4, d24, d16
353 | and r6, r4, #0x3ffffff
354 | vmlal.u32 q4, d23, d17
355 | and r4, r12, #0x3ffffff
356 | vmlal.u32 q4, d22, d18
357 | stm r14, {r2-r11}
358 | vmlal.u32 q4, d21, d19
359 | vld1.64 {d21-d24}, [r14, :256]!
360 | vld1.64 {d25}, [r14, :64]
361 | vaddw.u32 q0, q0, d21 /* add the second pair of blocks un-multiplied */
362 | vaddw.u32 q1, q1, d22
363 | vaddw.u32 q2, q2, d23
364 | vaddw.u32 q3, q3, d24
365 | vaddw.u32 q4, q4, d25
366 | vshr.u64 q11, q0, #26 /* carry propagation between the five 64-bit columns, masking each to 26 bits */
367 | vand q0, q0, q15
368 | vadd.i64 q1, q1, q11
369 | vshr.u64 q12, q3, #26
370 | vand q3, q3, q15
371 | vadd.i64 q4, q4, q12
372 | vshr.u64 q11, q1, #26
373 | vand q1, q1, q15
374 | vadd.i64 q2, q2, q11
375 | vshr.u64 q12, q4, #26
376 | vand q4, q4, q15
377 | vadd.i64 q0, q0, q12 /* carry out of the top column is added to column 0 five times (1x here, 4x below) */
378 | vshl.i64 q12, q12, #2
379 | ldr r1, [sp, #116] /* reload the byte counter */
380 | vadd.i64 q0, q0, q12
381 | vshr.u64 q11, q2, #26
382 | vand q2, q2, q15
383 | vadd.i64 q3, q3, q11
384 | sub r1, #64
385 | vshr.u64 q12, q0, #26
386 | vand q0, q0, q15
387 | vadd.i64 q1, q1, q12
388 | cmp r1, #64 /* flags consumed by the bhs at the bottom; nothing in between changes them */
389 | vshr.u64 q11, q3, #26
390 | vand q3, q3, q15
391 | vadd.i64 q4, q4, q11
392 | vmovn.i64 d21, q0 /* narrow the columns back into the accumulator registers d21..d25 */
393 | str r1, [sp, #116]
394 | vmovn.i64 d22, q1
395 | vmovn.i64 d23, q2
396 | vmovn.i64 d24, q3
397 | vmovn.i64 d25, q4
398 | bhs poly1305_blocks_neon_mainloop
399 | poly1305_blocks_neon_try32: /* Handles one remaining 32-byte step: accumulator times the multipliers in q8..q10/q13/q14, plus 32 input bytes when the input pointer is non-zero. */
400 | cmp r1, #32
401 | blo poly1305_blocks_neon_done
402 | tst r0, r0
403 | bne poly1305_blocks_loadm32
404 | veor q0, q0, q0 /* input pointer is zero: start the columns from 0 (multiply only) */
405 | veor q1, q1, q1
406 | veor q2, q2, q2
407 | veor q3, q3, q3
408 | veor q4, q4, q4
409 | b poly1305_blocks_continue32
410 | poly1305_blocks_loadm32:
411 | vld1.64 {q0-q1}, [r0]! /* preload the columns with the 32 input bytes, pre-shifted so the carry pass below lines them up */
412 | veor q4, q4, q4
413 | vswp d1, d2
414 | veor q3, q3, q3
415 | vtrn.32 q0, q4
416 | vtrn.32 q1, q3
417 | vshl.i64 q2, q1, #12
418 | vshl.i64 q3, q3, #18
419 | vshl.i64 q1, q4, #6
420 | vmovl.u32 q4, d15 /* top column starts from the high-bit words */
421 | poly1305_blocks_continue32:
422 | vmlal.u32 q0, d25, d26
423 | vmlal.u32 q0, d24, d27
424 | vmlal.u32 q0, d23, d28
425 | vmlal.u32 q0, d22, d29
426 | vmlal.u32 q0, d21, d20
427 | vmlal.u32 q1, d25, d27
428 | vmlal.u32 q1, d24, d28
429 | vmlal.u32 q1, d23, d29
430 | vmlal.u32 q1, d22, d20
431 | vmlal.u32 q1, d21, d16
432 | vmlal.u32 q2, d25, d28
433 | vmlal.u32 q2, d24, d29
434 | vmlal.u32 q2, d23, d20
435 | vmlal.u32 q2, d22, d16
436 | vmlal.u32 q2, d21, d17
437 | vmlal.u32 q3, d25, d29
438 | vmlal.u32 q3, d24, d20
439 | vmlal.u32 q3, d23, d16
440 | vmlal.u32 q3, d22, d17
441 | vmlal.u32 q3, d21, d18
442 | vmlal.u32 q4, d25, d20
443 | vmlal.u32 q4, d24, d16
444 | vmlal.u32 q4, d23, d17
445 | vmlal.u32 q4, d22, d18
446 | vmlal.u32 q4, d21, d19
447 | vshr.u64 q11, q0, #26 /* same carry pass as at the end of the main loop */
448 | vand q0, q0, q15
449 | vadd.i64 q1, q1, q11
450 | vshr.u64 q12, q3, #26
451 | vand q3, q3, q15
452 | vadd.i64 q4, q4, q12
453 | vshr.u64 q11, q1, #26
454 | vand q1, q1, q15
455 | vadd.i64 q2, q2, q11
456 | vshr.u64 q12, q4, #26
457 | vand q4, q4, q15
458 | vadd.i64 q0, q0, q12
459 | vshl.i64 q12, q12, #2
460 | vadd.i64 q0, q0, q12
461 | vshr.u64 q11, q2, #26
462 | vand q2, q2, q15
463 | vadd.i64 q3, q3, q11
464 | vshr.u64 q12, q0, #26
465 | vand q0, q0, q15
466 | vadd.i64 q1, q1, q12
467 | vshr.u64 q11, q3, #26
468 | vand q3, q3, q15
469 | vadd.i64 q4, q4, q11
470 | vmovn.i64 d21, q0
471 | vmovn.i64 d22, q1
472 | vmovn.i64 d23, q2
473 | vmovn.i64 d24, q3
474 | vmovn.i64 d25, q4
475 | poly1305_blocks_neon_done: /* Non-zero input pointer: save the two-lane accumulator at state+60. Zero input pointer: merge the lanes, reduce fully and store four 32-bit words at state+60. */
476 | tst r0, r0
477 | beq poly1305_blocks_neon_final
478 | ldr r2, [sp, #108] /* state pointer saved in the frame */
479 | add r2, r2, #60
480 | vst1.64 {d21}, [r2]!
481 | vst1.64 {d22-d25}, [r2]
482 | b poly1305_blocks_neon_leave
483 | poly1305_blocks_neon_final:
484 | vadd.u32 d10, d0, d1 /* add the low and high halves of each column q0..q4 */
485 | vadd.u32 d13, d2, d3
486 | vadd.u32 d11, d4, d5
487 | ldr r5, [sp, #108]
488 | vadd.u32 d14, d6, d7
489 | vadd.u32 d12, d8, d9
490 | vtrn.32 d10, d13
491 | vtrn.32 d11, d14
492 | vst1.64 {d10-d12}, [sp]
493 | ldm sp, {r0-r4} /* r0..r4 = the five limbs */
494 | mov r12, r0, lsr #26 /* first carry pass */
495 | and r0, r0, #0x3ffffff
496 | add r1, r1, r12
497 | mov r12, r1, lsr #26
498 | and r1, r1, #0x3ffffff
499 | add r2, r2, r12
500 | mov r12, r2, lsr #26
501 | and r2, r2, #0x3ffffff
502 | add r3, r3, r12
503 | mov r12, r3, lsr #26
504 | and r3, r3, #0x3ffffff
505 | add r4, r4, r12
506 | mov r12, r4, lsr #26
507 | and r4, r4, #0x3ffffff
508 | add r12, r12, r12, lsl #2 /* carry out of the top limb re-enters limb 0 multiplied by 5 */
509 | add r0, r0, r12
510 | mov r12, r0, lsr #26 /* second carry pass */
511 | and r0, r0, #0x3ffffff
512 | add r1, r1, r12
513 | mov r12, r1, lsr #26
514 | and r1, r1, #0x3ffffff
515 | add r2, r2, r12
516 | mov r12, r2, lsr #26
517 | and r2, r2, #0x3ffffff
518 | add r3, r3, r12
519 | mov r12, r3, lsr #26
520 | and r3, r3, #0x3ffffff
521 | add r4, r4, r12
522 | mov r12, r4, lsr #26
523 | and r4, r4, #0x3ffffff
524 | add r12, r12, r12, lsl #2
525 | add r0, r0, r12
526 | mov r12, r0, lsr #26
527 | and r0, r0, #0x3ffffff
528 | add r1, r1, r12
529 | add r6, r0, #5 /* r6,r7,r10,r11,r14 = limbs + 5 - 2^130 (the same h + -p test as poly1305_finish_ext_ref) */
530 | mov r12, r6, lsr #26
531 | and r6, r6, #0x3ffffff
532 | add r7, r1, r12
533 | mov r12, r7, lsr #26
534 | and r7, r7, #0x3ffffff
535 | add r10, r2, r12
536 | mov r12, r10, lsr #26
537 | and r10, r10, #0x3ffffff
538 | add r11, r3, r12
539 | mov r12, #-(1 << 26)
540 | add r12, r12, r11, lsr #26
541 | and r11, r11, #0x3ffffff
542 | add r14, r4, r12
543 | mov r12, r14, lsr #31
544 | sub r12, #1 /* r12 = all ones when the subtraction did not go negative, else 0 */
545 | and r6, r6, r12
546 | and r7, r7, r12
547 | and r10, r10, r12
548 | and r11, r11, r12
549 | and r14, r14, r12
550 | mvn r12, r12
551 | and r0, r0, r12
552 | and r1, r1, r12
553 | and r2, r2, r12
554 | and r3, r3, r12
555 | and r4, r4, r12
556 | orr r0, r0, r6 /* branch-free select between the two candidates */
557 | orr r1, r1, r7
558 | orr r2, r2, r10
559 | orr r3, r3, r11
560 | orr r4, r4, r14
561 | orr r0, r0, r1, lsl #26 /* repack five 26-bit limbs into four 32-bit words */
562 | lsr r1, r1, #6
563 | orr r1, r1, r2, lsl #20
564 | lsr r2, r2, #12
565 | orr r2, r2, r3, lsl #14
566 | lsr r3, r3, #18
567 | orr r3, r3, r4, lsl #8
568 | add r5, r5, #60
569 | stm r5, {r0-r3} /* result words at state+60; falls through to the epilogue */
570 | poly1305_blocks_neon_leave: /* common exit of poly1305_blocks_neon: undo the aligned frame and restore callee-saved registers */
571 | ldr sp, [sp, #120] /* stack pointer saved in the frame on entry */
572 | ldmfd sp!, {r4-r11, lr}
573 | vldm sp!, {q4-q7}
574 | bx lr
575 | FN_END poly1305_blocks_neon
576 |
577 | GLOBAL_HIDDEN_FN poly1305_finish_ext_neon
578 | poly1305_finish_ext_neon_local: /* Entry: r0 = state, r1 = leftover input, r2 = leftover byte count, r3 = output. Pads and processes the leftover bytes, runs the final reduction, adds the four words at state+100, stores 16 bytes to r3 and zeroes 128 bytes of state. */
579 | stmfd sp!, {r4-r11, lr}
580 | sub sp, sp, #32 /* 32-byte scratch block */
581 | mov r5, r0 /* r5 = state */
582 | mov r6, r1
583 | mov r7, r2 /* r7 = leftover byte count */
584 | mov r8, r3 /* r8 = output pointer */
585 | ands r7, r7, r7
586 | beq poly1305_finish_ext_neon_noremaining
587 | mov r9, sp /* r9 = write cursor in the scratch block */
588 | veor q0, q0, q0
589 | veor q1, q1, q1
590 | vst1.64 {q0-q1}, [sp] /* zero the scratch block */
591 | tst r7, #16 /* copy 16/8/4/2/1-byte pieces selected by the bits of the count */
592 | beq poly1305_finish_ext_neon_skip16
593 | vld1.u64 {q0}, [r1]!
594 | vst1.64 {q0}, [r9]!
595 | poly1305_finish_ext_neon_skip16:
596 | tst r7, #8
597 | beq poly1305_finish_ext_neon_skip8
598 | ldmia r1!, {r10-r11}
599 | stmia r9!, {r10-r11}
600 | poly1305_finish_ext_neon_skip8:
601 | tst r7, #4
602 | beq poly1305_finish_ext_neon_skip4
603 | ldr r10, [r1], #4
604 | str r10, [r9], #4
605 | poly1305_finish_ext_neon_skip4:
606 | tst r7, #2
607 | beq poly1305_finish_ext_neon_skip2
608 | ldrh r10, [r1], #2
609 | strh r10, [r9], #2
610 | poly1305_finish_ext_neon_skip2:
611 | tst r7, #1
612 | beq poly1305_finish_ext_neon_skip1
613 | ldrb r10, [r1], #1
614 | strb r10, [r9], #1
615 | poly1305_finish_ext_neon_skip1:
616 | cmp r7, #16 /* these flags are also consumed by orrhs/orrlo below; nothing in between modifies them */
617 | beq poly1305_finish_ext_neon_skipfinalbit
618 | mov r10, #1
619 | strb r10, [r9] /* write a 1 byte right after the copied bytes */
620 | poly1305_finish_ext_neon_skipfinalbit:
621 | ldr r10, [r5, #116]
622 | orrhs r10, #2 /* count >= 16: flag 2 */
623 | orrlo r10, #4 /* count < 16: flag 4 */
624 | str r10, [r5, #116]
625 | mov r0, r5
626 | mov r1, sp
627 | mov r2, #32
628 | bl poly1305_blocks_neon_local /* process the 32-byte scratch block */
629 | poly1305_finish_ext_neon_noremaining:
630 | ldr r10, [r5, #116]
631 | tst r10, #1
632 | beq poly1305_finish_ext_neon_notstarted /* flag 1 clear: skip the final blocks call */
633 | cmp r7, #0
634 | beq poly1305_finish_ext_neon_user2r
635 | cmp r7, #16
636 | bls poly1305_finish_ext_neon_user1
637 | poly1305_finish_ext_neon_user2r:
638 | orr r10, r10, #8 /* count was 0 or above 16 */
639 | b poly1305_finish_ext_neon_finalblock
640 | poly1305_finish_ext_neon_user1:
641 | orr r10, r10, #16 /* count was 1..16 */
642 | poly1305_finish_ext_neon_finalblock:
643 | str r10, [r5, #116]
644 | mov r0, r5
645 | eor r1, r1, r1 /* zero input pointer selects the final-reduction path in poly1305_blocks_neon */
646 | mov r2, #32
647 | bl poly1305_blocks_neon_local
648 | poly1305_finish_ext_neon_notstarted:
649 | add r0, r5, #60
650 | add r9, r5, #100
651 | ldm r0, {r0-r3} /* four words from state+60 */
652 | ldm r9, {r9-r12} /* four words from state+100 */
653 | adds r0, r0, r9 /* 128-bit add; the final carry is dropped */
654 | adcs r1, r1, r10
655 | adcs r2, r2, r11
656 | adcs r3, r3, r12
657 | stm r8, {r0-r3} /* write the 16-byte result */
658 | veor q0, q0, q0
659 | veor q1, q1, q1
660 | veor q2, q2, q2
661 | veor q3, q3, q3
662 | vstmia r5!, {q0-q3} /* zero 2 x 64 bytes of state */
663 | vstm r5, {q0-q3}
664 | add sp, sp, #32
665 | ldmfd sp!, {r4-r11, lr}
666 | bx lr
667 | FN_END poly1305_finish_ext_neon
668 |
669 | GLOBAL_HIDDEN_FN poly1305_auth_neon
670 | cmp r2, #128 /* One-shot MAC. Inputs arrive in r0..r3; presumably (mac, in, inlen, key) as in poly1305_auth_ref -- TODO confirm. */
671 | blo poly1305_auth_armv6_local /* fewer than 128 bytes: tail-branch to the ARMv6 routine with the arguments untouched */
672 | stmfd sp!, {r4-r8, lr}
673 | mov r8, sp /* r8 = stack pointer to restore on exit */
674 | and sp, sp, #(~63)
675 | sub sp, sp, #128 /* 64-byte aligned, 128-byte state on the stack */
676 | mov r4, r0 /* r4 = 1st argument (output pointer passed to finish) */
677 | mov r5, r1 /* r5 = input cursor */
678 | mov r6, r2 /* r6 = bytes remaining */
679 | mov r7, r3
680 | mov r0, sp
681 | mov r1, r7
682 | bl poly1305_init_ext_neon_local /* init(state = sp, 4th argument); r2 still holds the full length */
683 | ands r2, r6, #(~31) /* r2 = length rounded down to a multiple of 32 */
684 | beq poly1305_auth_neon_noblocks
685 | mov r0, sp
686 | mov r1, r5
687 | add r5, r5, r2 /* advance cursor and remaining count before the call; r5/r6 are preserved by the callee's stmfd/ldmfd */
688 | sub r6, r6, r2
689 | bl poly1305_blocks_neon_local
690 | poly1305_auth_neon_noblocks:
691 | mov r0, sp
692 | mov r1, r5
693 | mov r2, r6
694 | mov r3, r4
695 | bl poly1305_finish_ext_neon_local /* finish(state, tail pointer, tail length, output) */
696 | mov sp, r8
697 | ldmfd sp!, {r4-r8, lr}
698 | bx lr
699 | FN_END poly1305_auth_neon
700 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_ref-32.inc:
--------------------------------------------------------------------------------
1 | /*
2 | poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
3 |
4 | assumes the existence of uint32_t and uint64_t
5 | */
6 |
7 | enum {
8 | POLY1305_BLOCK_SIZE = 16 /* bytes consumed per loop iteration of poly1305_blocks_ref */
9 | };
10 |
11 | typedef struct poly1305_state_ref_t {
12 | uint32_t r[5]; /* masked first 16 key bytes, split into five 26-bit limbs */
13 | uint32_t h[5]; /* accumulator, five 26-bit limbs (h[1] may carry one extra bit between blocks) */
14 | uint32_t pad[4]; /* key bytes 16..31 as little-endian words; added to h in poly1305_finish_ext_ref */
15 | unsigned char final; /* non-zero makes poly1305_blocks_ref omit the 1 << 128 bit */
16 | } poly1305_state_ref_t;
17 |
18 | /* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
19 | static uint32_t
20 | U8TO32(const unsigned char *p) { /* reads exactly p[0]..p[3]; p[0] is the least significant byte */
21 | return
22 | (((uint32_t)(p[0] & 0xff) ) |
23 | ((uint32_t)(p[1] & 0xff) << 8) |
24 | ((uint32_t)(p[2] & 0xff) << 16) |
25 | ((uint32_t)(p[3] & 0xff) << 24));
26 | }
27 |
28 | /* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
29 | static void
30 | U32TO8(unsigned char *p, uint32_t v) { /* writes exactly p[0]..p[3]; p[0] receives the least significant byte */
31 | p[0] = (unsigned char)((v ) & 0xff);
32 | p[1] = (unsigned char)((v >> 8) & 0xff);
33 | p[2] = (unsigned char)((v >> 16) & 0xff);
34 | p[3] = (unsigned char)((v >> 24) & 0xff);
35 | }
36 |
37 | static size_t
38 | poly1305_block_size_ref(void) { /* returns the block size of this implementation in bytes */
39 | return POLY1305_BLOCK_SIZE;
40 | }
41 |
42 | static void
43 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { /* fills *state (used as a poly1305_state_ref_t) from key->b[0..31]: masked r, zero h, pad, final = 0 */
44 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
45 |
46 | /* bytes_hint not used */
47 | (void)bytes_hint;
48 |
49 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
50 | st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff; /* each limb is 26 bits: overlapping 4-byte reads at byte offsets 0,3,6,9,12, shifted by 0,2,4,6,8 */
51 | st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03;
52 | st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff;
53 | st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 0x3f03fff;
54 | st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff;
55 |
56 | /* h = 0 */
57 | st->h[0] = 0;
58 | st->h[1] = 0;
59 | st->h[2] = 0;
60 | st->h[3] = 0;
61 | st->h[4] = 0;
62 |
63 | /* save pad for later */
64 | st->pad[0] = U8TO32(&key->b[16]);
65 | st->pad[1] = U8TO32(&key->b[20]);
66 | st->pad[2] = U8TO32(&key->b[24]);
67 | st->pad[3] = U8TO32(&key->b[28]);
68 |
69 | st->final = 0;
70 | }
71 |
72 | static void
73 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { /* absorbs every whole 16-byte block of in[0..inlen): h = (h + block) * r, partially reduced; a trailing partial block is ignored */
74 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
75 | const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
76 | uint32_t r0,r1,r2,r3,r4;
77 | uint32_t s1,s2,s3,s4;
78 | uint32_t h0,h1,h2,h3,h4;
79 | uint64_t d0,d1,d2,d3,d4;
80 | uint32_t c;
81 |
82 | r0 = st->r[0];
83 | r1 = st->r[1];
84 | r2 = st->r[2];
85 | r3 = st->r[3];
86 | r4 = st->r[4];
87 |
88 | s1 = r1 * 5; /* s_i = 5 * r_i: limb products that overflow 2^130 wrap around multiplied by 5 */
89 | s2 = r2 * 5;
90 | s3 = r3 * 5;
91 | s4 = r4 * 5;
92 |
93 | h0 = st->h[0];
94 | h1 = st->h[1];
95 | h2 = st->h[2];
96 | h3 = st->h[3];
97 | h4 = st->h[4];
98 |
99 | while (inlen >= POLY1305_BLOCK_SIZE) {
100 | /* h += m[i] */
101 | h0 += (U8TO32(in+ 0) ) & 0x3ffffff;
102 | h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff;
103 | h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff;
104 | h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff;
105 | h4 += (U8TO32(in+12) >> 8) | hibit;
106 |
107 | /* h *= r */
108 | d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
109 | d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
110 | d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3);
111 | d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4);
112 | d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0);
113 |
114 | /* (partial) h %= p */
115 | c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff;
116 | d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff;
117 | d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff;
118 | d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff;
119 | d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff;
120 | h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
121 | h1 += c; /* h1 is left un-masked here; poly1305_finish_ext_ref starts its carry chain at h1 */
122 |
123 | in += POLY1305_BLOCK_SIZE;
124 | inlen -= POLY1305_BLOCK_SIZE;
125 | }
126 |
127 | st->h[0] = h0;
128 | st->h[1] = h1;
129 | st->h[2] = h2;
130 | st->h[3] = h3;
131 | st->h[4] = h4;
132 | }
133 |
134 | static void
135 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { /* absorbs the last `remaining` bytes of in (must be < POLY1305_BLOCK_SIZE), fully reduces h, writes (h + pad) mod 2^128 to mac and zeroes r, h and pad */
136 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
137 | uint32_t h0,h1,h2,h3,h4,c;
138 | uint32_t g0,g1,g2,g3,g4;
139 | uint64_t f;
140 | uint32_t mask;
141 |
142 | /* process the remaining block */
143 | if (remaining) {
144 | unsigned char final[POLY1305_BLOCK_SIZE] = {0};
145 | size_t i;
146 | for (i = 0; i < remaining; i++)
147 | final[i] = in[i];
148 | final[remaining] = 1; /* unconditional write: remaining >= POLY1305_BLOCK_SIZE would index past the buffer */
149 | st->final = 1; /* the 1 byte above replaces the 1 << 128 bit for this block */
150 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
151 | }
152 |
153 | /* fully carry h */
154 | h0 = st->h[0];
155 | h1 = st->h[1];
156 | h2 = st->h[2];
157 | h3 = st->h[3];
158 | h4 = st->h[4];
159 |
160 | c = h1 >> 26; h1 = h1 & 0x3ffffff;
161 | h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
162 | h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
163 | h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
164 | h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
165 | h1 += c;
166 |
167 | /* compute h + -p */
168 | g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
169 | g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
170 | g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
171 | g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
172 | g4 = h4 + c - (1 << 26); /* wraps to a value with the top bit set when h < p */
173 |
174 | /* select h if h < p, or h + -p if h >= p */
175 | mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; /* all ones when g4's top bit is clear (h >= p), zero otherwise; branch-free select */
176 | g0 &= mask;
177 | g1 &= mask;
178 | g2 &= mask;
179 | g3 &= mask;
180 | g4 &= mask;
181 | mask = ~mask;
182 | h0 = (h0 & mask) | g0;
183 | h1 = (h1 & mask) | g1;
184 | h2 = (h2 & mask) | g2;
185 | h3 = (h3 & mask) | g3;
186 | h4 = (h4 & mask) | g4;
187 |
188 | /* h = h % (2^128) */
189 | h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; /* repack five 26-bit limbs into four 32-bit words */
190 | h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
191 | h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
192 | h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
193 |
194 | /* mac = (h + pad) % (2^128) */
195 | f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f;
196 | f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f;
197 | f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f;
198 | f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f;
199 |
200 | U32TO8(mac + 0, h0);
201 | U32TO8(mac + 4, h1);
202 | U32TO8(mac + 8, h2);
203 | U32TO8(mac + 12, h3);
204 |
205 | /* zero out the state */
206 | st->h[0] = 0; /* st->final is not reset here; poly1305_init_ext_ref clears it */
207 | st->h[1] = 0;
208 | st->h[2] = 0;
209 | st->h[3] = 0;
210 | st->h[4] = 0;
211 | st->r[0] = 0;
212 | st->r[1] = 0;
213 | st->r[2] = 0;
214 | st->r[3] = 0;
215 | st->r[4] = 0;
216 | st->pad[0] = 0;
217 | st->pad[1] = 0;
218 | st->pad[2] = 0;
219 | st->pad[3] = 0;
220 | }
221 |
222 | static void
223 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
224 |     /* one-shot MAC: init, absorb every whole block, then finish with the tail */
225 |     poly1305_state_ref_t ctx;
226 |     const size_t tail = inlen % POLY1305_BLOCK_SIZE;
227 |     const size_t bulk = inlen - tail;
228 |
229 |     poly1305_init_ext_ref(&ctx, key, inlen);
230 |     if (bulk != 0)
231 |         poly1305_blocks_ref(&ctx, in, bulk);
232 |     /* bulk is 0 when no whole block was absorbed, so the offset is harmless */
233 |     poly1305_finish_ext_ref(&ctx, in + bulk, tail, mac);
234 | }
235 |
236 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_ref-64.inc:
--------------------------------------------------------------------------------
1 | /*
2 | poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
3 |
4 | assumes the existence of uint64_t and uint128_t
5 | */
6 |
7 | enum {
8 | POLY1305_BLOCK_SIZE = 16 /* bytes per poly1305 message block */
9 | };
10 |
11 | typedef struct poly1305_state_ref_t {
12 | uint64_t r[3]; /* clamped first key half, split by poly1305_init_ext_ref into limbs starting at bits 0, 44 and 88 */
13 | uint64_t h[3]; /* running accumulator, same limb split as r */
14 | uint64_t pad[2]; /* key bytes 16..31 as two little-endian words; added to h when finishing */
15 | unsigned char final; /* non-zero: poly1305_blocks_ref no longer sets the 1 << 128 bit */
16 | } poly1305_state_ref_t;
17 |
18 | /* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
19 | static uint64_t
20 | U8TO64(const unsigned char *p) {
21 |     /* little-endian load: walk from p[7] down to p[0], shifting each byte in */
22 |     uint64_t word = 0;
23 |     int idx = 8;
24 |
25 |     while (idx != 0) {
26 |         idx--;
27 |         word = (word << 8) | (uint64_t)p[idx];
28 |     }
29 |     return word;
30 | }
31 |
32 | /* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
33 | static void
34 | U64TO8(unsigned char *p, uint64_t v) {
35 |     /* little-endian store: emit the low byte, then shift the next one down */
36 |     unsigned char *out = p;
37 |     unsigned char *const end = p + 8;
38 |
39 |     while (out != end) {
40 |         *out++ = (unsigned char)(v & 0xff);
41 |         v >>= 8;
42 |     }
43 | }
44 |
45 | static size_t
46 | poly1305_block_size_ref(void) {
47 |     return (size_t)POLY1305_BLOCK_SIZE; /* bytes consumed per block by poly1305_blocks_ref */
48 | }
49 |
50 | static void
51 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
52 |     poly1305_state_ref_t *ctx = (poly1305_state_ref_t *)state;
53 |     const unsigned char *kb = key->b;
54 |     const uint64_t lo = U8TO64(kb + 0);
55 |     const uint64_t hi = U8TO64(kb + 8);
56 |     size_t limb;
57 |
58 |     /* the length hint is accepted for interface parity but ignored here */
59 |     (void)bytes_hint;
60 |
61 |     /* clamp r (r &= 0xffffffc0ffffffc0ffffffc0fffffff) while splitting it at bits 44 and 88 */
62 |     ctx->r[0] = lo & 0xffc0fffffff;
63 |     ctx->r[1] = ((lo >> 44) | (hi << 20)) & 0xfffffc0ffff;
64 |     ctx->r[2] = (hi >> 24) & 0x00ffffffc0f;
65 |
66 |     /* the accumulator starts at zero */
67 |     for (limb = 0; limb < 3; limb++)
68 |         ctx->h[limb] = 0;
69 |
70 |     /* keep the second key half; it is added to h when finishing */
71 |     ctx->pad[0] = U8TO64(kb + 16);
72 |     ctx->pad[1] = U8TO64(kb + 24);
73 |
74 |     ctx->final = 0;
75 | }
76 |
77 | static void
78 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { /* absorbs every whole 16-byte block of in into st->h; trailing bytes short of a block are not read */
79 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
80 | const uint64_t hibit = (st->final) ? 0 : ((uint64_t)1 << 40); /* 1 << 128: bit 40 of the top limb, which starts at bit 88; left out once st->final is set */
81 | uint64_t r0,r1,r2;
82 | uint64_t s1,s2;
83 | uint64_t h0,h1,h2;
84 | uint64_t c;
85 | uint128_t d0,d1,d2;
86 | /* work on local copies of r and h; only h is written back, after the loop */
87 | r0 = st->r[0];
88 | r1 = st->r[1];
89 | r2 = st->r[2];
90 | /* 5 << 2 = 20: product terms at 2^132 and above fold back, since 2^130 = 5 (mod 2^130 - 5) */
91 | s1 = r1 * (5 << 2);
92 | s2 = r2 * (5 << 2);
93 |
94 | h0 = st->h[0];
95 | h1 = st->h[1];
96 | h2 = st->h[2];
97 |
98 | while (inlen >= POLY1305_BLOCK_SIZE) {
99 | uint64_t t0, t1;
100 |
101 | /* h += in[i] */
102 | t0 = U8TO64(in + 0);
103 | t1 = U8TO64(in + 8);
104 | h0 += (( t0 ) & 0xfffffffffff); /* message bits 0..43 */
105 | h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); /* message bits 44..87 */
106 | h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; /* message bits 88..127, plus the padding bit */
107 |
108 | /* h *= r */
109 | d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1);
110 | d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2);
111 | d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0);
112 |
113 | /* (partial) h %= p */
114 | c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff;
115 | d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff;
116 | d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff;
117 | h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; /* carry out of the 42-bit top limb re-enters at the bottom, times 5 */
118 | h1 += c; /* h1 may now exceed 44 bits; the next round or the finish step carries it on */
119 |
120 | in += POLY1305_BLOCK_SIZE;
121 | inlen -= POLY1305_BLOCK_SIZE;
122 | }
123 |
124 | st->h[0] = h0;
125 | st->h[1] = h1;
126 | st->h[2] = h2;
127 | }
128 |
129 | static void
130 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
131 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
132 | uint64_t h0, h1, h2, c;
133 | uint64_t g0, g1, g2;
134 | uint64_t t0, t1;
135 | /* absorbs the last partial block (remaining < POLY1305_BLOCK_SIZE bytes at in), writes the 16-byte tag to mac, wipes st */
136 | /* process the remaining block */
137 | if (remaining) {
138 | unsigned char final[POLY1305_BLOCK_SIZE] = {0};
139 | size_t i;
140 | for (i = 0; i < remaining; i++)
141 | final[i] = in[i];
142 | final[remaining] = 1; /* explicit padding byte; it replaces the implicit 1 << 128 bit */
143 | st->final = 1; /* tells poly1305_blocks_ref not to add the 1 << 128 bit */
144 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
145 | }
146 |
147 | /* fully carry h */
148 | h0 = st->h[0];
149 | h1 = st->h[1];
150 | h2 = st->h[2];
151 |
152 | c = (h1 >> 44); h1 &= 0xfffffffffff;
153 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
154 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
155 | h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
156 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
157 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
158 | h1 += c;
159 |
160 | /* compute h + -p */
161 | g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
162 | g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
163 | g2 = h2 + c - ((uint64_t)1 << 42);
164 |
165 | /* select h if h < p, or h + -p if h >= p */
166 | c = (g2 >> 63) - 1; /* all ones when g2 did not go negative (h >= p), zero otherwise; no branch */
167 | h0 = (h0 & ~c) | (g0 & c);
168 | h1 = (h1 & ~c) | (g1 & c);
169 | h2 = (h2 & ~c) | (g2 & c);
170 |
171 | /* h = (h + pad) */
172 | t0 = st->pad[0];
173 | t1 = st->pad[1];
174 |
175 | h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
176 | h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
177 | h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;
178 |
179 | /* mac = h % (2^128) */
180 | h0 = ((h0 ) | (h1 << 44));
181 | h1 = ((h1 >> 20) | (h2 << 24));
182 |
183 | U64TO8(&mac[0], h0);
184 | U64TO8(&mac[8], h1);
185 |
186 | /* zero out the state; volatile stores so the compiler cannot drop the wipe as dead code */
187 | {
188 | volatile uint64_t *wipe_h = st->h;
189 | volatile uint64_t *wipe_r = st->r;
190 | volatile uint64_t *wipe_pad = st->pad;
191 | wipe_h[0] = 0; wipe_h[1] = 0; wipe_h[2] = 0;
192 | wipe_r[0] = 0; wipe_r[1] = 0; wipe_r[2] = 0;
193 | wipe_pad[0] = 0; wipe_pad[1] = 0;
194 | }
195 | }
196 |
197 |
198 | static void
199 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
200 |     /* one-shot MAC over in[0..inlen): whole blocks first, then the tail */
201 |     poly1305_state_ref_t ctx;
202 |     const size_t whole = inlen & ~(size_t)(POLY1305_BLOCK_SIZE - 1);
203 |
204 |     poly1305_init_ext_ref(&ctx, key, inlen);
205 |     if (whole > 0) {
206 |         poly1305_blocks_ref(&ctx, in, whole);
207 |         in += whole;
208 |     }
209 |     poly1305_finish_ext_ref(&ctx, in, inlen - whole, mac);
210 | }
211 |
212 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_ref-8.inc:
--------------------------------------------------------------------------------
1 | /*
2 | poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition
3 |
4 | based on the public domain reference version in supercop by djb
5 | */
6 |
7 | enum {
8 | POLY1305_BLOCK_SIZE = 16 /* bytes per poly1305 message block */
9 | };
10 |
11 | typedef struct poly1305_state_ref_t {
12 | unsigned char r[17]; /* clamped key bytes 0..15, least significant first; r[16] is set to 0 at init */
13 | unsigned char h[17]; /* accumulator, one byte per element, least significant first */
14 | unsigned char pad[17]; /* key bytes 16..31; pad[16] is set to 0 at init */
15 | unsigned char final; /* non-zero: poly1305_blocks_ref appends 0 instead of 1 as the 17th block byte */
16 | } poly1305_state_ref_t;
17 |
18 | static size_t
19 | poly1305_block_size_ref(void) {
20 |     return (size_t)POLY1305_BLOCK_SIZE; /* bytes consumed per block by poly1305_blocks_ref */
21 | }
22 |
23 | static void
24 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
25 |     poly1305_state_ref_t *ctx = (poly1305_state_ref_t *)state;
26 |     size_t n;
27 |
28 |     /* the length hint is accepted for interface parity but ignored here */
29 |     (void)bytes_hint;
30 |
31 |     /* first key half becomes r, second half the pad, and h starts at zero */
32 |     for (n = 0; n < 16; n++) {
33 |         ctx->r[n] = key->b[n];
34 |         ctx->pad[n] = key->b[n + 16];
35 |         ctx->h[n] = 0;
36 |     }
37 |
38 |     /* the 17th byte of each number starts out clear */
39 |     ctx->r[16] = 0;
40 |     ctx->pad[16] = 0;
41 |     ctx->h[16] = 0;
42 |
43 |     /* clamp: r &= 0xffffffc0ffffffc0ffffffc0fffffff */
44 |     for (n = 3; n < 16; n += 4)
45 |         ctx->r[n] &= 0x0f; /* top four bits of bytes 3, 7, 11 and 15 */
46 |     for (n = 4; n < 16; n += 4)
47 |         ctx->r[n] &= 0xfc; /* bottom two bits of bytes 4, 8 and 12 */
48 |
49 |     ctx->final = 0;
50 | }
51 |
52 | static void
53 | poly1305_add(unsigned char h[17], const unsigned char c[17]) {
54 |     unsigned int carry = 0; /* carry out of the previous byte; whatever leaves byte 16 is dropped */
55 |     size_t pos = 0;
56 |     while (pos < 17) {
57 |         carry += (unsigned int)h[pos] + (unsigned int)c[pos];
58 |         h[pos++] = (unsigned char)(carry & 0xff);
59 |         carry >>= 8;
60 |     }
61 | }
62 |
63 | static void
64 | poly1305_partial_reduce(unsigned char h[17], unsigned long hr[17]) {
65 |     unsigned long carry = 0;
66 |     size_t k;
67 |     /* first pass: propagate carries through the 16 low bytes of the product */
68 |     for (k = 0; k != 16; ++k) {
69 |         carry += hr[k];
70 |         h[k] = (unsigned char)(carry & 0xff);
71 |         carry >>= 8;
72 |     }
73 |     /* byte 16 keeps only bits 128..129; everything above is folded back in, times 5 */
74 |     carry += hr[16];
75 |     h[16] = (unsigned char)(carry & 0x03);
76 |     carry = (carry >> 2) * 5;
77 |     for (k = 0; k != 16; ++k) {
78 |         carry += h[k];
79 |         h[k] = (unsigned char)(carry & 0xff);
80 |         carry >>= 8;
81 |     }
82 |     h[16] = (unsigned char)(h[16] + carry);
83 | }
84 |
85 | static void
86 | poly1305_full_reduce(unsigned char h[17]) {
87 |     static const unsigned char neg_p[17] = { /* -p mod 2^136, least significant byte first */
88 |         0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
89 |         0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
90 |         0xfc
91 |     };
92 |     unsigned char saved[17], keep_saved;
93 |     size_t k;
94 |
95 |     /* remember h, then form h + -p in place */
96 |     for (k = 0; k < 17; k++) saved[k] = h[k];
97 |     poly1305_add(h, neg_p);
98 |
99 |     /* a set top bit means h < p: put the saved value back, without branching */
100 |     keep_saved = (unsigned char)(0 - (h[16] >> 7));
101 |     for (k = 0; k < 17; k++)
102 |         h[k] = (unsigned char)((h[k] & ~keep_saved) | (saved[k] & keep_saved));
103 | }
104 |
105 |
106 | static void
107 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) {
108 |     poly1305_state_ref_t *ctx = (poly1305_state_ref_t *)state;
109 |     /* 17th byte of every block (the 1 << 128 bit); 0 once the final flag is set */
110 |     const unsigned char top = (unsigned char)(ctx->final ? 0 : 1);
111 |     while (inlen >= POLY1305_BLOCK_SIZE) {
112 |         unsigned long prod[17];
113 |         unsigned char blk[17];
114 |         size_t row, col;
115 |
116 |         /* h += block */
117 |         for (col = 0; col < 16; col++)
118 |             blk[col] = in[col];
119 |         blk[16] = top;
120 |         poly1305_add(ctx->h, blk);
121 |
122 |         /* prod = h * r, one output byte position per row */
123 |         for (row = 0; row < 17; row++) {
124 |             unsigned long acc = 0;
125 |             for (col = 0; col < 17; col++) {
126 |                 if (col <= row) {
127 |                     acc += (unsigned long)ctx->h[col] * ctx->r[row - col];
128 |                 } else {
129 |                     /* wrapped past byte 16: scale by 5 << 6 = 320 */
130 |                     acc += ((unsigned long)ctx->h[col] * ctx->r[row + 17 - col]) * 320;
131 |                 }
132 |             }
133 |             prod[row] = acc;
134 |         }
135 |
136 |         /* (partial) h %= p */
137 |         poly1305_partial_reduce(ctx->h, prod);
138 |
139 |         inlen -= POLY1305_BLOCK_SIZE;
140 |         in += POLY1305_BLOCK_SIZE;
141 |     }
142 | }
143 |
144 | static void
145 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
146 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
147 | volatile unsigned char *wipe;
148 | size_t i;
149 | /* absorbs the last partial block (remaining < POLY1305_BLOCK_SIZE bytes at in), writes the 16-byte tag to mac, wipes st */
150 | /* process the remaining block */
151 | if (remaining) {
152 | unsigned char final[POLY1305_BLOCK_SIZE] = {0};
153 | for (i = 0; i < remaining; i++)
154 | final[i] = in[i];
155 | final[remaining] = 1; /* explicit padding byte; it replaces the implicit 1 << 128 bit */
156 | st->final = 1; /* tells poly1305_blocks_ref to append 0 as the 17th byte */
157 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
158 | }
159 |
160 | /* fully reduce h */
161 | poly1305_full_reduce(st->h);
162 |
163 | /* h = (h + pad) % (1 << 128) */
164 | poly1305_add(st->h, st->pad);
165 | for (i = 0; i < 16; i++) mac[i] = st->h[i];
166 |
167 | /* zero out the state; volatile stores so the compiler cannot drop the wipe as dead code */
168 | wipe = st->r; for (i = 0; i < 17; i++) wipe[i] = 0;
169 | wipe = st->h; for (i = 0; i < 17; i++) wipe[i] = 0;
170 | wipe = st->pad; for (i = 0; i < 17; i++) wipe[i] = 0;
171 | }
172 |
173 | static void
174 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
175 |     poly1305_state_ref_t ctx;
176 |     size_t leftover = inlen % POLY1305_BLOCK_SIZE;
177 |     size_t consumed = inlen - leftover;
178 |
179 |     /* set up the state, feed the whole blocks, then let finish handle the rest */
180 |     poly1305_init_ext_ref(&ctx, key, inlen);
181 |     if (consumed)
182 |         poly1305_blocks_ref(&ctx, in, consumed);
183 |
184 |     poly1305_finish_ext_ref(&ctx, in + consumed, leftover, mac);
185 | }
186 |
187 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_x86-32.inc:
--------------------------------------------------------------------------------
1 | /* cannibalized from the public domain x86 implementation in supercop by djb */
2 |
3 | SECTION_TEXT
4 |
5 | GLOBAL_HIDDEN_FN poly1305_block_size_x86
6 | movl $16, %eax /* return 16 in eax, the number of bytes per block */
7 | ret
8 | FN_END poly1305_block_size_x86
9 |
10 | GLOBAL_HIDDEN_FN poly1305_init_ext_x86
11 | poly1305_init_ext_x86_local:
12 | pushl %ebx /* save the registers overwritten below */
13 | pushl %esi
14 | pushl %edi
15 | pushl %eax /* two extra pushes reserve the scratch slots 0(%esp) and 4(%esp) */
16 | pushl %eax
17 | movl 24(%esp), %eax /* eax = first stack argument (state) */
18 | movl 28(%esp), %edx /* edx = second stack argument (key) */
19 | movl $0x137f, %ecx
20 | movl %ecx, 0(%esp)
21 | fstcw 4(%esp) /* save the caller's x87 control word ... */
22 | fldcw 0(%esp) /* ... and run with control word 0x137f until the restore at the end */
23 | movl 16(%edx), %ecx /* copy key bytes 16..31 to state+104..119 */
24 | movl 20(%edx), %ebx
25 | movl 24(%edx), %esi
26 | movl 28(%edx), %edi
27 | movl %ecx, 104(%eax)
28 | movl %ebx, 108(%eax)
29 | movl %esi, 112(%eax)
30 | movl %edi, 116(%eax)
31 | movl 0(%edx), %ecx /* load key bytes 0..15 as four 32-bit words */
32 | movl 4(%edx), %ebx
33 | movl 8(%edx), %esi
34 | movl 12(%edx), %edi
35 | andl $0x0fffffff, %ecx /* clamp the four words of r */
36 | andl $0x0ffffffc, %ebx
37 | andl $0x0ffffffc, %esi
38 | andl $0x0ffffffc, %edi
39 | movl %ecx, 0(%eax) /* each word becomes the low dword of a double whose high dword is a fixed exponent: */
40 | movl $0x43300000, 4(%eax) /* bit pattern of 2^52 + word0 */
41 | movl %ebx, 8(%eax)
42 | movl $0x45300000, 12(%eax) /* bit pattern of 2^84 + word1 * 2^32 */
43 | movl %esi, 16(%eax)
44 | movl $0x47300000, 20(%eax) /* bit pattern of 2^116 + word2 * 2^64 */
45 | movl %edi, 24(%eax)
46 | movl $0x49300000, 28(%eax) /* bit pattern of 2^148 + word3 * 2^96 */
47 | LOAD_VAR_PIC poly1305_constants_x86, %edx
48 | fldl 0(%eax) /* load each double and subtract a table constant; presumably the matching 2^k bias (constants file not shown here) */
49 | fsubl 64(%edx)
50 | fldl 8(%eax)
51 | fsubl 72(%edx)
52 | fldl 16(%eax)
53 | fsubl 80(%edx)
54 | fldl 24(%eax)
55 | fsubl 88(%edx)
56 | fxch %st(3)
57 | fstpl 0(%eax) /* state+0..55: the four parts of r, plus three of them multiplied by the constant at 0(%edx) */
58 | fxch %st(1)
59 | fstl 8(%eax)
60 | fmull 0(%edx)
61 | fstpl 16(%eax)
62 | fstl 24(%eax)
63 | fmull 0(%edx)
64 | fstpl 32(%eax)
65 | fstl 40(%eax)
66 | fmull 0(%edx)
67 | fstpl 48(%eax)
68 | fldz /* zero state+56..103, the area poly1305_blocks_x86 reads with fldt */
69 | fstl 56(%eax)
70 | fstl 64(%eax)
71 | fstl 72(%eax)
72 | fstl 80(%eax)
73 | fstl 88(%eax)
74 | fstpl 96(%eax)
75 | fldcw 4(%esp) /* restore the caller's x87 control word */
76 | popl %eax /* drop the two scratch slots */
77 | popl %eax
78 | popl %edi
79 | popl %esi
80 | popl %ebx
81 | ret
82 | FN_END poly1305_init_ext_x86
83 |
84 |
85 | GLOBAL_HIDDEN_FN poly1305_blocks_x86
86 | poly1305_blocks_x86_local:
87 | movl %esp,%eax /* build a 64-byte-aligned frame of at least 192 bytes; eax = total adjustment */
88 | andl $63,%eax
89 | addl $192,%eax
90 | subl %eax,%esp
91 | movl %eax,0(%esp) /* keep the adjustment so the epilogue can undo it */
92 | movl %ebx,4(%esp) /* save callee-saved registers in the frame */
93 | movl %esi,8(%esp)
94 | movl %edi,12(%esp)
95 | movl %ebp,16(%esp)
96 | movl $0x137f, %ecx
97 | movl %ecx, 188(%esp)
98 | fstcw 184(%esp) /* save the caller's x87 control word ... */
99 | fldcw 188(%esp) /* ... and run with control word 0x137f */
100 | movl $0x43300000,100(%esp) /* fixed high dwords; message words are stored at 96/104/112/120(%esp) as the low dwords */
101 | movl $0x45300000,108(%esp)
102 | movl $0x47300000,116(%esp)
103 | movl $0x49300000,124(%esp)
104 | movl 4(%esp,%eax),%ebp /* ebp = first stack argument (state) */
105 | movl 8(%esp,%eax),%esi /* esi = second stack argument (input pointer) */
106 | movl 12(%esp,%eax),%ecx /* ecx = third stack argument (byte count) */
107 | LOAD_VAR_PIC poly1305_constants_x86, %edx
108 | cmp $16,%ecx
109 | jb poly1305_blocks_x86_nomorebytes /* fewer than 16 bytes: nothing to absorb */
110 | fldt 92(%ebp) /* load the four accumulator parts, kept in the state as 80-bit values */
111 | fldt 80(%ebp)
112 | fldt 68(%ebp)
113 | fldt 56(%ebp)
114 | add $16,%esi
115 | sub $16,%ecx
116 | movl %ecx, 20(%esp) /* spill the remaining byte count; ecx is reused just below */
117 | movl -4(%esi),%eax /* first block: four message dwords into the low halves of the doubles */
118 | movl -8(%esi),%ecx
119 | movl -12(%esi),%ebx
120 | movl -16(%esi),%edi
121 | movl %eax,120(%esp)
122 | movl %ecx,112(%esp)
123 | movl %ebx,104(%esp)
124 | movl %edi,96(%esp)
125 | fxch %st(3)
126 | faddl 120(%esp) /* add the message doubles to the accumulator parts, then subtract table constants */
127 | fsubl 96(%edx) /* top word uses the constant at offset 96; the finish routine uses offset 88 here. NOTE(review): presumably 96 also accounts for the 1 << 128 padding bit; confirm against the constants file */
128 | fxch %st(1)
129 | faddl 104(%esp)
130 | fsubl 72(%edx)
131 | fxch %st(2)
132 | faddl 112(%esp)
133 | fsubl 80(%edx)
134 | fxch %st(3)
135 | faddl 96(%esp)
136 | fsubl 64(%edx)
137 | movl 20(%esp), %ecx
138 | cmp $16, %ecx
139 | jb poly1305_blocks_x86_lastmultiply /* that was the only whole block */
140 | poly1305_blocks_x86_multiplyaddatleast16bytes:
141 | add $16,%esi /* main loop: fetch the next block, then carry, multiply and add it in */
142 | sub $16,%ecx
143 | movl %ecx, 20(%esp)
144 | movl -4(%esi),%eax
145 | movl -8(%esi),%ecx
146 | movl -12(%esi),%ebx
147 | movl -16(%esi),%edi
148 | movl %eax,120(%esp)
149 | movl %ecx,112(%esp)
150 | movl %ebx,104(%esp)
151 | movl %edi,96(%esp)
152 | fldl 56(%edx) /* add-then-subtract of the same table constant splits off the high part of a value as a carry */
153 | fadd %st(2),%st(0)
154 | fsubl 56(%edx)
155 | fsubr %st(0),%st(2)
156 | fmull 0(%edx)
157 | fldl 32(%edx)
158 | fadd %st(2),%st(0)
159 | fsubl 32(%edx)
160 | fsubr %st(0),%st(2)
161 | fxch %st(2)
162 | faddp %st(0),%st(1)
163 | fldl 40(%edx)
164 | fadd %st(4),%st(0)
165 | fsubl 40(%edx)
166 | fsubr %st(0),%st(4)
167 | fldl 48(%edx)
168 | fadd %st(6),%st(0)
169 | fsubl 48(%edx)
170 | fsubr %st(0),%st(6)
171 | fxch %st(6)
172 | faddp %st(0),%st(1)
173 | fxch %st(3)
174 | faddp %st(0),%st(5)
175 | fxch %st(3)
176 | faddp %st(0),%st(1)
177 | fldl 40(%ebp) /* multiply by the key values that poly1305_init_ext_x86 stored at state+0..55 */
178 | fmul %st(3),%st(0)
179 | fldl 24(%ebp)
180 | fmul %st(4),%st(0)
181 | fldl 8(%ebp)
182 | fmul %st(5),%st(0)
183 | fldl 0(%ebp)
184 | fmulp %st(0),%st(6)
185 | fldl 24(%ebp)
186 | fmul %st(4),%st(0)
187 | faddp %st(0),%st(3)
188 | fldl 8(%ebp)
189 | fmul %st(4),%st(0)
190 | faddp %st(0),%st(2)
191 | fldl 0(%ebp)
192 | fmul %st(4),%st(0)
193 | faddp %st(0),%st(1)
194 | fldl 48(%ebp)
195 | fmulp %st(0),%st(4)
196 | fxch %st(3)
197 | faddp %st(0),%st(5)
198 | fldl 8(%ebp)
199 | fmul %st(4),%st(0)
200 | faddp %st(0),%st(2)
201 | fldl 0(%ebp)
202 | fmul %st(4),%st(0)
203 | faddp %st(0),%st(1)
204 | fldl 48(%ebp)
205 | fmul %st(4),%st(0)
206 | faddp %st(0),%st(3)
207 | fldl 32(%ebp)
208 | fmulp %st(0),%st(4)
209 | fxch %st(3)
210 | faddp %st(0),%st(4)
211 | fldl 0(%ebp)
212 | fmul %st(5),%st(0)
213 | faddp %st(0),%st(1)
214 | fxch %st(3)
215 | fldl 48(%ebp)
216 | fmul %st(5),%st(0)
217 | faddp %st(0),%st(3)
218 | fxch %st(1)
219 | fldl 32(%ebp)
220 | fmul %st(5),%st(0)
221 | faddp %st(0),%st(1)
222 | fldl 16(%ebp)
223 | fmulp %st(0),%st(5)
224 | fxch %st(4)
225 | faddp %st(0),%st(1)
226 | movl 20(%esp), %ecx /* reload the remaining byte count for the loop test below */
227 | fxch %st(2)
228 | fldl 120(%esp) /* add the block fetched at the top of this iteration */
229 | fsubl 96(%edx)
230 | faddp %st(0),%st(1)
231 | fxch %st(1)
232 | fldl 112(%esp)
233 | fsubl 80(%edx)
234 | cmp $16,%ecx /* flags are consumed by the jae at the end of the iteration */
235 | faddp %st(0),%st(1)
236 | fxch %st(3)
237 | fldl 104(%esp)
238 | fsubl 72(%edx)
239 | faddp %st(0),%st(1)
240 | fxch %st(2)
241 | fldl 96(%esp)
242 | fsubl 64(%edx)
243 | faddp %st(0),%st(1)
244 | jae poly1305_blocks_x86_multiplyaddatleast16bytes
245 | poly1305_blocks_x86_lastmultiply:
246 | fldl 56(%edx) /* final carry and multiply for the last block added; no further input is read */
247 | fadd %st(2),%st(0)
248 | fsubl 56(%edx)
249 | fsubr %st(0),%st(2)
250 | fmull 0(%edx)
251 | fldl 32(%edx)
252 | fadd %st(2),%st(0)
253 | fsubl 32(%edx)
254 | fsubr %st(0),%st(2)
255 | fldl 40(%edx)
256 | fadd %st(5),%st(0)
257 | fsubl 40(%edx)
258 | fsubr %st(0),%st(5)
259 | fldl 48(%edx)
260 | fadd %st(7),%st(0)
261 | fsubl 48(%edx)
262 | fsubr %st(0),%st(7)
263 | fxch %st(7)
264 | faddp %st(0),%st(1)
265 | fxch %st(5)
266 | faddp %st(0),%st(1)
267 | fxch %st(3)
268 | faddp %st(0),%st(5)
269 | faddp %st(0),%st(1)
270 | fldl 40(%ebp)
271 | fmul %st(1),%st(0)
272 | fldl 24(%ebp)
273 | fmul %st(2),%st(0)
274 | fldl 8(%ebp)
275 | fmul %st(3),%st(0)
276 | fldl 0(%ebp)
277 | fmulp %st(0),%st(4)
278 | fldl 24(%ebp)
279 | fmul %st(5),%st(0)
280 | faddp %st(0),%st(3)
281 | fldl 8(%ebp)
282 | fmul %st(5),%st(0)
283 | faddp %st(0),%st(2)
284 | fldl 0(%ebp)
285 | fmul %st(5),%st(0)
286 | faddp %st(0),%st(1)
287 | fldl 48(%ebp)
288 | fmulp %st(0),%st(5)
289 | fxch %st(4)
290 | faddp %st(0),%st(3)
291 | fldl 8(%ebp)
292 | fmul %st(5),%st(0)
293 | faddp %st(0),%st(2)
294 | fldl 0(%ebp)
295 | fmul %st(5),%st(0)
296 | faddp %st(0),%st(1)
297 | fldl 48(%ebp)
298 | fmul %st(5),%st(0)
299 | faddp %st(0),%st(4)
300 | fldl 32(%ebp)
301 | fmulp %st(0),%st(5)
302 | fxch %st(4)
303 | faddp %st(0),%st(2)
304 | fldl 0(%ebp)
305 | fmul %st(5),%st(0)
306 | faddp %st(0),%st(1)
307 | fldl 48(%ebp)
308 | fmul %st(5),%st(0)
309 | faddp %st(0),%st(4)
310 | fldl 32(%ebp)
311 | fmul %st(5),%st(0)
312 | faddp %st(0),%st(3)
313 | fldl 16(%ebp)
314 | fmulp %st(0),%st(5)
315 | fxch %st(4)
316 | faddp %st(0),%st(1)
317 | fstpt 56(%ebp) /* write the four accumulator parts back to the state as 80-bit values */
318 | fstpt 68(%ebp)
319 | fstpt 80(%ebp)
320 | fstpt 92(%ebp)
321 | poly1305_blocks_x86_nomorebytes:
322 | fldcw 184(%esp) /* restore the caller's x87 control word */
323 | movl 0(%esp), %eax /* eax = frame adjustment saved in the prologue */
324 | movl 4(%esp), %ebx
325 | movl 8(%esp), %esi
326 | movl 12(%esp), %edi
327 | movl 16(%esp), %ebp
328 | addl %eax, %esp /* release the frame */
329 | ret
330 | FN_END poly1305_blocks_x86
331 |
332 | GLOBAL_HIDDEN_FN poly1305_finish_ext_x86
333 | poly1305_finish_ext_x86_local:
334 | pushl %ebx /* save callee-saved registers */
335 | pushl %esi
336 | pushl %edi
337 | pushl %ebp
338 | mov %esp, %ebp /* ebp = pre-alignment stack pointer; arguments start at 20(%ebp) */
339 | andl $~63, %esp /* 64-byte-aligned frame of 256 bytes */
340 | subl $256, %esp
341 | movl $0x137f, %ecx
342 | movl %ecx, 0(%esp)
343 | fstcw 12(%esp) /* save the caller's x87 control word in a slot nothing else in this frame uses (0/4/8 are reused below) */
344 | fldcw 0(%esp) /* run with control word 0x137f until the restore before the epilogue */
345 | mov 20(%ebp), %ebx /* ebx = state */
346 | mov 24(%ebp), %esi /* esi = input pointer */
347 | mov 28(%ebp), %ecx /* ecx = remaining byte count */
348 | mov 32(%ebp), %eax /* eax = mac output pointer */
349 | movl %ebp, 0(%esp) /* 0(%esp) = stack pointer to restore on exit */
350 | movl %ebx, 4(%esp) /* 4(%esp) = state */
351 | movl %eax, 8(%esp) /* 8(%esp) = mac */
352 | fldt 92(%ebx) /* load the four accumulator parts kept as 80-bit values */
353 | fldt 80(%ebx)
354 | fldt 68(%ebx)
355 | fldt 56(%ebx)
356 | andl %ecx, %ecx
357 | jz poly1305_finish_x86_nomorebytes /* no leftover bytes: go straight to the final reduction */
358 | movl $0x43300000,100(%esp) /* fixed high dwords for the four message doubles */
359 | movl $0x45300000,108(%esp)
360 | movl $0x47300000,116(%esp)
361 | movl $0x49300000,124(%esp)
362 | movl $0,64(%esp) /* zero a 16-byte buffer at 64(%esp) */
363 | movl $0,4+64(%esp)
364 | movl $0,8+64(%esp)
365 | movl $0,12+64(%esp)
366 | leal 64(%esp),%edi
367 | rep movsb /* copy the ecx leftover bytes into the buffer */
368 | movb $1,0(%edi) /* append the 1 padding byte right after them */
369 | movl 12+64(%esp),%eax
370 | movl 8+64(%esp),%ecx
371 | movl 4+64(%esp),%edx
372 | movl 64(%esp),%esi
373 | movl %eax,120(%esp)
374 | movl %ecx,112(%esp)
375 | movl %edx,104(%esp)
376 | movl %esi,96(%esp)
377 | LOAD_VAR_PIC poly1305_constants_x86, %edx
378 | fxch %st(3)
379 | faddl 120(%esp) /* add the padded block to the accumulator */
380 | fsubl 88(%edx) /* offset 88 here, where poly1305_blocks_x86 uses 96 for the top word */
381 | fxch %st(2)
382 | faddl 112(%esp)
383 | fsubl 80(%edx)
384 | fxch %st(1)
385 | faddl 104(%esp)
386 | fsubl 72(%edx)
387 | fxch %st(3)
388 | faddl 96(%esp)
389 | fsubl 64(%edx)
390 | fldl 56(%edx) /* carry, then multiply by the key values stored at state+0..55 */
391 | fadd %st(3),%st(0)
392 | fsubl 56(%edx)
393 | fsubr %st(0),%st(3)
394 | fmull 0(%edx)
395 | fldl 32(%edx)
396 | fadd %st(2),%st(0)
397 | fsubl 32(%edx)
398 | fsubr %st(0),%st(2)
399 | fldl 40(%edx)
400 | fadd %st(6),%st(0)
401 | fsubl 40(%edx)
402 | fsubr %st(0),%st(6)
403 | fldl 48(%edx)
404 | fadd %st(5),%st(0)
405 | fsubl 48(%edx)
406 | fsubr %st(0),%st(5)
407 | fxch %st(4)
408 | faddp %st(0),%st(3)
409 | fxch %st(6)
410 | faddp %st(0),%st(1)
411 | fxch %st(3)
412 | faddp %st(0),%st(5)
413 | fxch %st(3)
414 | faddp %st(0),%st(1)
415 | fldl 40(%ebx)
416 | fmul %st(3),%st(0)
417 | fldl 24(%ebx)
418 | fmul %st(4),%st(0)
419 | fldl 8(%ebx)
420 | fmul %st(5),%st(0)
421 | fldl 0(%ebx)
422 | fmulp %st(0),%st(6)
423 | fldl 24(%ebx)
424 | fmul %st(5),%st(0)
425 | faddp %st(0),%st(3)
426 | fldl 8(%ebx)
427 | fmul %st(5),%st(0)
428 | faddp %st(0),%st(2)
429 | fldl 0(%ebx)
430 | fmul %st(5),%st(0)
431 | faddp %st(0),%st(1)
432 | fldl 48(%ebx)
433 | fmulp %st(0),%st(5)
434 | fxch %st(4)
435 | faddp %st(0),%st(5)
436 | fldl 8(%ebx)
437 | fmul %st(6),%st(0)
438 | faddp %st(0),%st(2)
439 | fldl 0(%ebx)
440 | fmul %st(6),%st(0)
441 | faddp %st(0),%st(1)
442 | fldl 48(%ebx)
443 | fmul %st(6),%st(0)
444 | faddp %st(0),%st(4)
445 | fldl 32(%ebx)
446 | fmulp %st(0),%st(6)
447 | fxch %st(5)
448 | faddp %st(0),%st(4)
449 | fldl 0(%ebx)
450 | fmul %st(2),%st(0)
451 | faddp %st(0),%st(1)
452 | fldl 48(%ebx)
453 | fmul %st(2),%st(0)
454 | faddp %st(0),%st(5)
455 | fldl 32(%ebx)
456 | fmul %st(2),%st(0)
457 | faddp %st(0),%st(3)
458 | fldl 16(%ebx)
459 | fmulp %st(0),%st(2)
460 | fxch %st(1)
461 | faddp %st(0),%st(3)
462 | fxch %st(3)
463 | fxch %st(2)
464 | poly1305_finish_x86_nomorebytes:
465 | LOAD_VAR_PIC poly1305_constants_x86, %edx
466 | fldl 56(%edx) /* last carry pass over the four accumulator parts */
467 | fadd %st(4),%st(0)
468 | fsubl 56(%edx)
469 | fsubr %st(0),%st(4)
470 | fmull 0(%edx)
471 | fldl 32(%edx)
472 | fadd %st(2),%st(0)
473 | fsubl 32(%edx)
474 | fsubr %st(0),%st(2)
475 | fldl 40(%edx)
476 | fadd %st(4),%st(0)
477 | fsubl 40(%edx)
478 | fsubr %st(0),%st(4)
479 | fldl 48(%edx)
480 | fadd %st(6),%st(0)
481 | fsubl 48(%edx)
482 | fxch %st(6)
483 | fsub %st(6),%st(0)
484 | fxch %st(4)
485 | faddp %st(0),%st(3)
486 | fxch %st(4)
487 | faddp %st(0),%st(1)
488 | fxch %st(2)
489 | faddp %st(0),%st(3)
490 | fxch %st(4)
491 | faddp %st(0),%st(3)
492 | fxch %st(3)
493 | faddl 104(%edx) /* add four more table constants, then store the parts as doubles so their bits can be read as integers */
494 | fxch %st(3)
495 | faddl 112(%edx)
496 | fxch %st(1)
497 | faddl 120(%edx)
498 | fxch %st(2)
499 | faddl 128(%edx)
500 | fxch %st(3)
501 | fstpl 96(%esp)
502 | fstpl 104(%esp)
503 | fstpl 112(%esp)
504 | fstpl 120(%esp)
505 | movl 100(%esp),%eax /* integer phase: low 6 bits of each high dword, added with carry into the next word */
506 | and $63,%eax
507 | movl 108(%esp),%ecx
508 | and $63,%ecx
509 | movl 116(%esp),%edx
510 | and $63,%edx
511 | movl 124(%esp),%ebx
512 | and $63,%ebx
513 | movl 104(%esp),%esi
514 | addl %eax,%esi
515 | movl %esi,28(%esp)
516 | movl 112(%esp),%eax
517 | adcl %ecx,%eax
518 | movl %eax,32(%esp)
519 | movl 120(%esp),%eax
520 | adcl %edx,%eax
521 | movl %eax,36(%esp)
522 | mov $0,%eax
523 | adcl %ebx,%eax
524 | movl %eax,40(%esp)
525 | mov $5,%eax /* compute h + 5 alongside h, word by word with carry */
526 | movl 96(%esp),%ecx
527 | addl %ecx,%eax
528 | movl %eax,44(%esp)
529 | mov $0,%eax
530 | movl 28(%esp),%edx
531 | adcl %edx,%eax
532 | movl %eax,28(%esp)
533 | mov $0,%eax
534 | movl 32(%esp),%ebx
535 | adcl %ebx,%eax
536 | movl %eax,32(%esp)
537 | mov $0,%eax
538 | movl 36(%esp),%esi
539 | adcl %esi,%eax
540 | movl %eax,36(%esp)
541 | mov $0xfffffffc,%eax
542 | movl 40(%esp),%edi
543 | adcl %edi,%eax
544 | sar $16,%eax /* eax = select mask for h, edi = its complement for h + 5; no branch */
545 | mov %eax,%edi
546 | xor $0xffffffff,%edi
547 | andl %eax,%ecx
548 | movl 44(%esp),%ebp
549 | andl %edi,%ebp
550 | orl %ebp,%ecx
551 | andl %eax,%edx
552 | movl 28(%esp),%ebp
553 | andl %edi,%ebp
554 | orl %ebp,%edx
555 | andl %eax,%ebx
556 | movl 32(%esp),%ebp
557 | andl %edi,%ebp
558 | orl %ebp,%ebx
559 | andl %eax,%esi
560 | movl 36(%esp),%eax
561 | andl %edi,%eax
562 | orl %eax,%esi
563 | movl 4(%esp),%eax /* eax = state */
564 | addl 104(%eax),%ecx /* add the pad stored at state+104..119 */
565 | adcl 108(%eax),%edx
566 | adcl 112(%eax),%ebx
567 | adcl 116(%eax),%esi
568 | movl 8(%esp),%eax /* eax = mac */
569 | movl %ecx,0(%eax) /* write the 16-byte tag */
570 | movl %edx,4(%eax)
571 | movl %ebx,8(%eax)
572 | movl %esi,12(%eax)
573 | xorl %eax, %eax
574 | movl 4(%esp),%edi
575 | fldz /* zero state+0..119 */
576 | fstl 0(%edi)
577 | fstl 8(%edi)
578 | fstl 16(%edi)
579 | fstl 24(%edi)
580 | fstl 32(%edi)
581 | fstl 40(%edi)
582 | fstl 48(%edi)
583 | fstl 56(%edi)
584 | fstl 64(%edi)
585 | fstl 72(%edi)
586 | fstl 80(%edi)
587 | fstl 88(%edi)
588 | fstl 96(%edi)
589 | fstl 104(%edi)
590 | fstpl 112(%edi)
591 | fldcw 12(%esp) /* restore the caller's x87 control word, as init_ext and blocks do */
592 | movl 0(%esp), %esp /* back to the pre-alignment stack pointer */
593 | popl %ebp
594 | popl %edi
595 | popl %esi
596 | popl %ebx
597 | ret
598 | FN_END poly1305_finish_ext_x86
598 |
599 | GLOBAL_HIDDEN_FN poly1305_auth_x86
600 | poly1305_auth_x86_local:
601 | pushl %ebp
602 | pushl %edi
603 | movl %esp, %ebp /* ebp = frame base; arguments are at 12(%ebp) mac, 16 in, 20 inlen, 24 key */
604 | subl $128, %esp /* reserve at least 128 bytes, 64-byte aligned, for a temporary state */
605 | andl $~63, %esp
606 | movl %esp, %edi /* edi = temporary state */
607 | pushl 24(%ebp) /* init_ext(state, key) */
608 | pushl %edi
609 | calll poly1305_init_ext_x86_local
610 | movl 20(%ebp), %ecx
611 | andl $~15, %ecx /* ecx = inlen rounded down to whole 16-byte blocks */
612 | jz poly1305_auth_x86_no_data
613 | pushl %ecx /* blocks(state, in, whole-block bytes) */
614 | pushl 16(%ebp)
615 | addl %ecx, 16(%ebp) /* advance the saved input pointer past the whole blocks */
616 | pushl %edi
617 | calll poly1305_blocks_x86_local
618 | poly1305_auth_x86_no_data:
619 | pushl 12(%ebp) /* finish_ext(state, in, inlen & 15, mac) */
620 | movl 20(%ebp), %ecx
621 | andl $15, %ecx
622 | pushl %ecx
623 | pushl 16(%ebp)
624 | pushl %edi
625 | calll poly1305_finish_ext_x86_local
626 | movl %ebp, %esp /* drop the state and every pushed argument at once */
627 | popl %edi
628 | popl %ebp
629 | ret
630 | FN_END poly1305_auth_x86
631 |
632 | INCLUDE_VAR_FILE "poly1305/poly1305_constants_x86.inc", poly1305_constants_x86
633 |
--------------------------------------------------------------------------------
/app/extensions/poly1305/poly1305_x86-64.inc:
--------------------------------------------------------------------------------
1 | SECTION_TEXT
2 |
3 | GLOBAL_HIDDEN_FN poly1305_block_size_x86
4 | movl $16, %eax /* return 16 in eax, the number of bytes per block */
5 | ret
6 | FN_END poly1305_block_size_x86
7 |
8 | GLOBAL_HIDDEN_FN poly1305_init_ext_x86
9 | poly1305_init_ext_x86_local:
10 | movabsq $17575274610687, %rax /* 0xffc0fffffff, mask for the first part of r */
11 | movq (%rsi), %rcx /* rcx = key bytes 0..7 (rsi is read as the key pointer, rdi written as the state) */
12 | movq 8(%rsi), %rdx /* rdx = key bytes 8..15 */
13 | movq $0, 24(%rdi) /* zero the three accumulator qwords at state+24..47 */
14 | movq $0, 32(%rdi)
15 | movq $0, 40(%rdi)
16 | andq %rcx, %rax
17 | shrq $44, %rcx
18 | movq %rax, (%rdi) /* state+0 = key bits 0..43, masked */
19 | movq %rdx, %rax
20 | salq $20, %rax
21 | orq %rcx, %rax
22 | movabsq $17592181915647, %rcx /* 0xfffffc0ffff */
23 | andq %rcx, %rax
24 | movq %rax, 8(%rdi) /* state+8 = key bits 44..87, masked */
25 | movq %rdx, %rax
26 | movabsq $68719475727, %rdx /* 0x00ffffffc0f */
27 | shrq $24, %rax
28 | andq %rdx, %rax
29 | movq %rax, 16(%rdi) /* state+16 = key bits 88 and up, masked */
30 | movq 16(%rsi), %rax
31 | movq %rax, 48(%rdi) /* state+48 and state+56 = key bytes 16..31 (the pad) */
32 | movq 24(%rsi), %rax
33 | movq $0, 64(%rdi) /* state+64 = 0: the flag poly1305_blocks_x86 tests with cmpq $1 */
34 | movq %rax, 56(%rdi)
35 | ret
36 | FN_END poly1305_init_ext_x86
37 |
38 |
39 | GLOBAL_HIDDEN_FN poly1305_blocks_x86
40 | poly1305_blocks_x86_local:
41 | movabsq $1099511627776, %rax /* 1 << 40 */
42 | pushq %r15 /* save the callee-saved registers used below */
43 | pushq %r14
44 | pushq %r13
45 | pushq %r12
46 | pushq %rbp
47 | movq %rdx, %rbp /* rbp = byte count (rdx is clobbered by mulq) */
48 | pushq %rbx
49 | cmpq $1, 64(%rdi) /* carry set only when the qword at state+64 is 0 */
50 | movq %rdi, -16(%rsp) /* spill the state pointer below %rsp */
51 | movq (%rdi), %r14 /* r14 = state+0 */
52 | movq 8(%rdi), %r15 /* r15 = state+8 */
53 | sbbq %rcx, %rcx /* rcx = all ones if that carry is set, else 0 (the movq above leave flags alone) */
54 | andq %rax, %rcx /* rcx = 1 << 40 while state+64 is 0, else 0 */
55 | movq %rdi, %rax
56 | cmpq $15, %rbp /* flags are consumed by the jbe below; only movq in between */
57 | movq %rcx, -40(%rsp)
58 | movq 16(%rdi), %rcx /* rcx = state+16 */
59 | movq 32(%rax), %r8 /* r8 = state+32 */
60 | movq 24(%rdi), %rdi /* rdi = state+24 */
61 | movq 40(%rax), %rdx /* rdx = state+40 */
62 | movq %rcx, -32(%rsp)
63 | jbe poly1305_blocks_x86_5 /* fewer than 16 bytes: store the accumulator back unchanged */
64 | leaq (%rcx,%rcx,4), %rax
65 | movq %r15, -48(%rsp)
66 | movabsq $17592186044415, %rbx /* rbx = 2^44 - 1 */
67 | salq $2, %rax
68 | movq %rax, -56(%rsp) /* -56(%rsp) = 20 * state+16 */
69 | leaq (%r15,%r15,4), %rax
70 | salq $2, %rax
71 | movq %rax, -24(%rsp) /* -24(%rsp) = 20 * state+8 */
72 | .p2align 4
73 | poly1305_blocks_x86_6:
74 | movq $0, -80(%rsp) /* one iteration per 16-byte block */
75 | movq (%rsi), %r9 /* r9 = message bytes 0..7 */
76 | movq $0, -64(%rsp)
77 | movq 8(%rsi), %rcx /* rcx = message bytes 8..15 */
78 | movq %r9, %rax
79 | shrq $44, %r9
80 | movq %rcx, %r10
81 | shrq $24, %rcx
82 | andq %rbx, %rax
83 | orq -40(%rsp), %rcx /* top part: message bits 88..127 plus the 1 << 40 bit (or 0) */
84 | addq %rax, %rdi /* accumulator low part += message bits 0..43 */
85 | salq $20, %r10
86 | movq -24(%rsp), %rax
87 | orq %r9, %r10
88 | andq %rbx, %r10
89 | addq %r10, %r8 /* accumulator middle part += message bits 44..87 */
90 | addq %rdx, %rcx /* rcx = accumulator top part + message top part */
91 | mulq %rcx /* three 128-bit sums of products follow, held in r10:r9, r12:r11, then r10:r9 again */
92 | movq %rax, %r9
93 | movq %rdi, %rax
94 | movq %rdx, %r10
95 | mulq %r14
96 | addq %rax, %r9
97 | movq -56(%rsp), %rax
98 | adcq %rdx, %r10
99 | mulq %r8
100 | addq %rax, %r9
101 | movq -56(%rsp), %rax
102 | adcq %rdx, %r10
103 | movq %r9, %r15
104 | andq %rbx, %r15 /* r15 = low 44 bits of the first sum */
105 | mulq %rcx
106 | movq %rax, %r11
107 | movq -48(%rsp), %rax
108 | movq %rdx, %r12
109 | mulq %rdi
110 | addq %rax, %r11
111 | movq %r8, %rax
112 | adcq %rdx, %r12
113 | mulq %r14
114 | addq %rax, %r11
115 | movq %rcx, %rax
116 | adcq %rdx, %r12
117 | shrdq $44, %r10, %r9 /* carry = first sum >> 44 */
118 | movq %r9, -88(%rsp)
119 | addq -88(%rsp), %r11
120 | adcq -80(%rsp), %r12
121 | mulq %r14
122 | movq %r11, %r13
123 | andq %rbx, %r13 /* r13 = low 44 bits of the second sum */
124 | movq %rax, %r9
125 | movq -32(%rsp), %rax
126 | movq %rdx, %r10
127 | mulq %rdi
128 | addq %rax, %r9
129 | movq -48(%rsp), %rax
130 | adcq %rdx, %r10
131 | mulq %r8
132 | addq %rax, %r9
133 | adcq %rdx, %r10
134 | shrdq $44, %r12, %r11 /* carry = second sum >> 44 */
135 | movabsq $4398046511103, %rdx /* 2^42 - 1 */
136 | movq %r11, -72(%rsp)
137 | addq -72(%rsp), %r9
138 | adcq -64(%rsp), %r10
139 | andq %r9, %rdx /* rdx = new top part: low 42 bits of the third sum */
140 | subq $16, %rbp
141 | addq $16, %rsi
142 | shrdq $42, %r10, %r9 /* carry out of the top part ... */
143 | leaq (%r9,%r9,4), %r8 /* ... times 5 ... */
144 | addq %r15, %r8 /* ... re-enters at the low part */
145 | movq %r8, %rdi
146 | shrq $44, %r8
147 | andq %rbx, %rdi /* rdi = new low part */
148 | addq %r13, %r8 /* r8 = new middle part */
149 | cmpq $15, %rbp
150 | ja poly1305_blocks_x86_6 /* loop while at least 16 bytes remain */
151 | poly1305_blocks_x86_5:
152 | movq -16(%rsp), %rcx /* reload the state pointer and store the accumulator at state+24..47 */
153 | movq %rdi, 24(%rcx)
154 | movq %r8, 32(%rcx)
155 | movq %rdx, 40(%rcx)
156 | popq %rbx
157 | popq %rbp
158 | popq %r12
159 | popq %r13
160 | popq %r14
161 | popq %r15
162 | ret
163 | FN_END poly1305_blocks_x86
164 |
165 | GLOBAL_HIDDEN_FN poly1305_finish_ext_x86
166 | poly1305_finish_ext_x86_local: /* in: rdi = state base, rsi = leftover input, rdx = leftover byte count, rcx = 16-byte output */
167 | pushq %rbp
168 | movq %rsi, %rax /* rax = leftover input pointer */
169 | movq %rcx, %rbp /* rbp = output pointer, written at the end */
170 | pushq %rbx
171 | movq %rdi, %rbx /* rbx = state base for the rest of the function */
172 | subq $24, %rsp /* scratch block lives at (%rsp) */
173 | testq %rdx, %rdx
174 | je poly1305_finish_ext_x86_11 /* no leftover bytes: go straight to the final reduction */
175 | movq %rax, %rdi
176 | movq $0, (%rsp) /* zero the 16-byte scratch block */
177 | movq %rsp, %rcx /* rcx = write cursor inside the scratch block */
178 | movq $0, 8(%rsp)
179 | subq %rsp, %rdi /* rdi = input - scratch, so (%rcx,%rdi) reads the matching input byte */
180 | testb $8, %dl /* copy 8/4/2/1 byte pieces according to the low bits of the count */
181 | je poly1305_finish_ext_x86_12
182 | movq (%rax), %rax
183 | leaq 8(%rsp), %rcx
184 | movq %rax, (%rsp)
185 | poly1305_finish_ext_x86_12:
186 | testb $4, %dl
187 | je poly1305_finish_ext_x86_13
188 | movl (%rcx,%rdi), %eax
189 | movl %eax, (%rcx)
190 | addq $4, %rcx
191 | poly1305_finish_ext_x86_13:
192 | testb $2, %dl
193 | je poly1305_finish_ext_x86_14
194 | movzwl (%rcx,%rdi), %eax
195 | movw %ax, (%rcx)
196 | addq $2, %rcx
197 | poly1305_finish_ext_x86_14:
198 | testb $1, %dl
199 | je poly1305_finish_ext_x86_15
200 | movzbl (%rcx,%rdi), %eax
201 | movb %al, (%rcx)
202 | poly1305_finish_ext_x86_15:
203 | movb $1, (%rsp,%rdx) /* append a 1 byte after the copied data; NOTE(review): assumes rdx < 16, confirm with callers */
204 | movq %rsp, %rsi
205 | movl $16, %edx
206 | movq $1, 64(%rbx) /* qword at state+64 set to 1 before the call; presumably a final-block flag, confirm in poly1305_blocks_x86 */
207 | movq %rbx, %rdi
208 | call poly1305_blocks_x86_local /* use the file-local label, like the other intra-file calls in this file */
209 | poly1305_finish_ext_x86_11:
210 | movabsq $17592186044415, %rdx /* 2^44 - 1 */
211 | movq 32(%rbx), %rsi
212 | movabsq $4398046511103, %rax /* 2^42 - 1 */
213 | movabsq $-4398046511104, %r10 /* -(2^42) */
214 | movq %rsi, %r9
215 | shrq $44, %rsi
216 | addq 40(%rbx), %rsi
217 | andq %rdx, %r9
218 | movq %rsi, %r8
219 | shrq $42, %rsi
220 | leaq (%rsi,%rsi,4), %rcx /* carry out of the top limb is multiplied by 5 and folded into the low limb */
221 | andq %rax, %r8
222 | addq 24(%rbx), %rcx
223 | movq %rcx, %rdi
224 | shrq $44, %rcx
225 | addq %r9, %rcx
226 | andq %rdx, %rdi
227 | movq %rcx, %rsi
228 | shrq $44, %rcx
229 | addq %r8, %rcx
230 | andq %rdx, %rsi
231 | andq %rcx, %rax
232 | shrq $42, %rcx
233 | leaq (%rdi,%rcx,4), %rdi /* together with the addq two lines below: low limb += 5 * carry */
234 | addq %rax, %r10 /* r10 = top limb - 2^42 */
235 | addq %rcx, %rdi
236 | movq %rdi, %rcx
237 | shrq $44, %rdi
238 | andq %rdx, %rcx
239 | addq %rsi, %rdi
240 | leaq 5(%rcx), %r9 /* second candidate: value + 5, limbs end up in r9 / rdx / r10 */
241 | movq %r9, %r11
242 | andq %rdx, %r9
243 | shrq $44, %r11
244 | addq %rdi, %r11
245 | movq %r11, %rsi
246 | andq %r11, %rdx
247 | shrq $44, %rsi
248 | addq %rsi, %r10
249 | movq %r10, %rsi
250 | shrq $63, %rsi
251 | subq $1, %rsi /* rsi = 0 when r10 is negative, all ones otherwise */
252 | movq %rsi, %r8
253 | andq %rsi, %r9
254 | andq %rsi, %rdx
255 | notq %r8 /* r8 = complementary mask: branch-free select between the two candidates */
256 | andq %r8, %rcx
257 | andq %r8, %rdi
258 | orq %r9, %rcx
259 | orq %rdx, %rdi
260 | andq %r8, %rax
261 | andq %r10, %rsi
262 | movq %rdi, %rdx
263 | shrq $20, %rdi
264 | orq %rsi, %rax
265 | salq $44, %rdx /* repack the 44/44/42-bit limbs into two 64-bit words (rcx low, rax high) */
266 | movq 56(%rbx), %rsi
267 | salq $24, %rax
268 | orq %rdx, %rcx
269 | movq 48(%rbx), %rdx
270 | orq %rdi, %rax
271 | addq %rdx, %rcx /* 128-bit add of the two qwords stored at state+48 / state+56 */
272 | adcq %rsi, %rax
273 | movq %rcx, 0(%rbp)
274 | movq %rax, 8(%rbp)
275 | movq $0, 24(%rbx) /* wipe state offsets 0..63 */
276 | movq $0, 32(%rbx)
277 | movq $0, 40(%rbx)
278 | movq $0, (%rbx)
279 | movq $0, 8(%rbx)
280 | movq $0, 16(%rbx)
281 | movq $0, 48(%rbx)
282 | movq $0, 56(%rbx)
283 | addq $24, %rsp
284 | popq %rbx
285 | popq %rbp
286 | ret
287 | FN_END poly1305_finish_ext_x86
288 |
289 |
290 | GLOBAL_HIDDEN_FN poly1305_auth_x86
291 | poly1305_auth_x86_local: /* one-shot: init, process whole 16-byte blocks, finish; in: rdi = output, rsi = input, rdx = length, rcx = key */
292 | pushq %rbp
293 | movq %rsp, %rbp
294 | movq %rbx, -32(%rbp) /* callee-saved registers are parked below rbp and reloaded before leave */
295 | movq %rdx, %rbx /* rbx = remaining length */
296 | movq %r12, -24(%rbp)
297 | movq %rsi, %r12 /* r12 = input cursor */
298 | movq %rcx, %rsi /* second argument of the init call = key pointer */
299 | movq %r13, -16(%rbp)
300 | movq %rdi, %r13 /* r13 = output pointer */
301 | movq %r14, -8(%rbp)
302 | subq $32, %rsp /* step past the four saved registers */
303 | movq %rbx, %r14
304 | andq $-64, %rsp /* 64-byte align the stack, then reserve 128 bytes used as the state */
305 | addq $-128, %rsp
306 | movq %rsp, %rdi
307 | call poly1305_init_ext_x86_local /* rdx still holds the length here */
308 | andq $-16, %r14 /* r14 = length rounded down to a multiple of 16 */
309 | je poly1305_auth_x86_19 /* fewer than 16 bytes: nothing for the block function */
310 | movq %r12, %rsi
311 | movq %r14, %rdx
312 | movq %rsp, %rdi
313 | call poly1305_blocks_x86_local
314 | addq %r14, %r12 /* advance the cursor past the processed bytes */
315 | subq %r14, %rbx /* rbx = leftover byte count */
316 | poly1305_auth_x86_19:
317 | movq %r13, %rcx
318 | movq %rbx, %rdx
319 | movq %r12, %rsi
320 | movq %rsp, %rdi
321 | call poly1305_finish_ext_x86_local
322 | movq -32(%rbp), %rbx
323 | movq -24(%rbp), %r12
324 | movq -16(%rbp), %r13
325 | movq -8(%rbp), %r14
326 | leave
327 | ret
328 | FN_END poly1305_auth_x86
329 |
330 |
--------------------------------------------------------------------------------
/app/include/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/include/.keep
--------------------------------------------------------------------------------
/app/include/poly1305.h:
--------------------------------------------------------------------------------
1 | #ifndef POLY1305_H
2 | #define POLY1305_H
3 | /* public interface of the poly1305 library */
4 | #include <stddef.h>
5 | /* LIB_PUBLIC may be predefined by the build to decorate exported symbols; it defaults to nothing */
6 | #if !defined(LIB_PUBLIC)
7 | #define LIB_PUBLIC
8 | #endif
9 |
10 | #if defined(__cplusplus)
11 | extern "C" {
12 | #endif
13 | /* caller-allocated state; the contents are private to the implementation */
14 | typedef struct poly1305_state {
15 | unsigned char opaque[320];
16 | } poly1305_state;
17 | /* 32 byte key */
18 | typedef struct poly1305_key {
19 | unsigned char b[32];
20 | } poly1305_key;
21 | /* incremental interface: init (optionally with a size hint), update any number of times, then finish */
22 | LIB_PUBLIC void poly1305_init(poly1305_state *S, const poly1305_key *key);
23 | LIB_PUBLIC void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint);
24 | LIB_PUBLIC void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen);
25 | LIB_PUBLIC void poly1305_finish(poly1305_state *S, unsigned char *mac);
26 | /* one-shot interface */
27 | LIB_PUBLIC void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key);
28 | /* NOTE(review): return value convention is not visible from this header - confirm in impl.c */
29 | LIB_PUBLIC int poly1305_startup(void);
30 | /* fuzz/bench entry points are only declared when UTILITIES is defined */
31 | #if defined(UTILITIES)
32 | void poly1305_fuzz(void);
33 | void poly1305_bench(void);
34 | #endif
35 |
36 | #if defined(__cplusplus)
37 | }
38 | #endif
39 |
40 | #endif /* POLY1305_H */
41 |
42 |
--------------------------------------------------------------------------------
/app/project.def:
--------------------------------------------------------------------------------
1 | poly1305
2 |
--------------------------------------------------------------------------------
/app/project.ver:
--------------------------------------------------------------------------------
1 | 1.0.0
2 |
--------------------------------------------------------------------------------
/framework/bench.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <time.h>
3 | #include "cpucycles.h"
4 | #include "cpuid.h"
5 | #include "bench.h"
6 |
7 | /* hand out a pointer into static storage, rounded up to the next 64 byte boundary (at least 32k usable) */
8 | unsigned char *
9 | bench_get_buffer(void) {
10 | static unsigned char storage[0x8000 + 0x40 + 0x40];
11 | unsigned char *bumped = storage + 0x3f;
12 | size_t misalign = (size_t)bumped & 0x3f;
13 | /* dropping the low 6 address bits of (storage + 63) gives the aligned pointer */
14 | return bumped - misalign;
15 | }
16 |
17 | static cycles_t smallest_timeslice = ~(cycles_t)0; /* all-ones means "not measured yet" */
18 | static int have_global_stats = 0; /* set by bench() after bench_gather_global_stats has run */
19 | static cycles_t cycles_per_second = 1; /* estimated by bench_gather_global_stats */
20 | static size_t global_dummy = 0; /* receives the result of the busy loops so they are not dead code */
21 |
22 | static void
23 | bench_gather_global_stats(void) {
24 | const char *cpu_units = LOCAL_PREFIX(cpucycles_units)();
25 | size_t delay = 0;
26 | size_t dummy = 55;
27 | clock_t start;
28 | cycles_t delta;
29 | size_t j;
30 | /* measures the timer granularity (smallest_timeslice) and the timer rate (cycles_per_second), then prints both */
31 | /* find the smallest one and run with that, this isn't an exact science */
32 | do {
33 | delta = LOCAL_PREFIX(cpucycles)();
34 | for (j = 0; j < delay; j++) {
35 | dummy ^= (dummy << 1) + j;
36 | dummy += (dummy >> 3);
37 | }
38 | delta = LOCAL_PREFIX(cpucycles)() - delta;
39 | delay++; /* lengthen the busy loop until the timer reports a non-zero difference */
40 | } while (!delta);
41 |
42 | /* run until at least one second has passed AND smallest_timeslice has been set */
43 | start = clock();
44 | do {
45 | delta = LOCAL_PREFIX(cpucycles)();
46 | for (j = 0; j < delay; j++) {
47 | dummy ^= (dummy << 1) + j;
48 | dummy += (dummy >> 3);
49 | }
50 | delta = LOCAL_PREFIX(cpucycles)() - delta;
51 |
52 | /* 2 is as good as 1 cycle_t, and should avoid some burps that gettimeofday has with erroneously reporting 1 cycle_t */
53 | if ((delta > 1) && (delta < smallest_timeslice))
54 | smallest_timeslice = delta;
55 | } while (((clock() - start) < CLOCKS_PER_SEC) || (smallest_timeslice == ~(cycles_t)0)); /* keep sampling while either exit condition is still unmet */
56 |
57 | /* 1/2 of a second back of the hand calculation for cycles_t per second */
58 | cycles_per_second = LOCAL_PREFIX(cpucycles)();
59 | start = clock();
60 | while ((clock() - start) < (CLOCKS_PER_SEC / 2)) {
61 | dummy ^= (dummy << 1) + 19;
62 | dummy += (dummy >> 3);
63 | }
64 | cycles_per_second = LOCAL_PREFIX(cpucycles)() - cycles_per_second;
65 | cycles_per_second <<= 1; /* half a second was measured, so double it */
66 |
67 |
68 | printf("time granularity: %.0f %s, %.0f %s/second\n\n", (double)smallest_timeslice, cpu_units, (double)cycles_per_second, cpu_units);
69 | /* publish the dummy so the busy loops above have an observable result */
70 | global_dummy = dummy & 1;
71 | }
72 |
73 | int
74 | bench(const void *impls, size_t impl_size, impl_test test_fn, impl_bench bench_fn, size_t units_count, const char *units_desc) {
75 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)();
76 | const char *cpu_units = LOCAL_PREFIX(cpucycles_units)();
77 | const unsigned char *p;
78 | int first_item = 1, err = 0;
79 | /* impls is an array of impl_size-byte records whose last record has cpu_flags == CPUID_GENERIC; returns 1 if any runnable implementation fails test_fn, else 0 */
80 | if (!have_global_stats) {
81 | bench_gather_global_stats();
82 | have_global_stats = 1;
83 | }
84 |
85 | /* validate all implementations */
86 | p = (const unsigned char *)impls;
87 | for (;;) {
88 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p;
89 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) { /* every flag the implementation needs is present on this cpu */
90 | if (test_fn(impl) != 0) {
91 | printf("%s: error in implementation!\n", impl->desc);
92 | err = 1;
93 | }
94 | }
95 | if (impl->cpu_flags == CPUID_GENERIC)
96 | break;
97 | p += impl_size;
98 | }
99 | /* nothing is timed if any implementation failed validation */
100 | if (err)
101 | return 1;
102 |
103 | p = (const unsigned char *)impls;
104 | for (;;) {
105 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p;
106 |
107 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) {
108 | cycles_t tbest = ~(cycles_t)0;
109 | size_t batch_size = 1, trials = 1;
110 | size_t i;
111 |
112 | /* get a rough estimate for batch size and # of trials */
113 | for (;;) {
114 | cycles_t tbest = ~(cycles_t)0; /* shadows the outer tbest on purpose: this one is per calibration round */
115 | size_t i, j;
116 | for (i = 0; i < 100; i++) {
117 | cycles_t t1 = LOCAL_PREFIX(cpucycles)();
118 | for (j = 0; j < batch_size; j++)
119 | bench_fn(impl);
120 | t1 = LOCAL_PREFIX(cpucycles)() - t1;
121 | if (t1 < tbest)
122 | tbest = t1;
123 | }
124 | if (tbest > smallest_timeslice * 25) { /* batch is long enough relative to the timer granularity */
125 | trials = (cycles_per_second / tbest);
126 | if (trials < 1)
127 | trials = 1;
128 | break;
129 | }
130 | batch_size = (batch_size == 1) ? 2 : (((batch_size * 4) / 3) + 1); /* otherwise grow the batch and retry */
131 | }
132 |
133 |
134 |
135 | /* measure! */
136 | for (i = 0; i < trials; i++) {
137 | cycles_t t1 = LOCAL_PREFIX(cpucycles)();
138 | size_t j;
139 | for (j = 0; j < batch_size; j++)
140 | bench_fn(impl);
141 | t1 = LOCAL_PREFIX(cpucycles)() - t1;
142 | if (t1 < tbest)
143 | tbest = t1; /* keep the fastest batch */
144 | }
145 | /* the units header is printed once, before the first result line */
146 | if (first_item) {
147 | printf("%u %s(s):\n", (unsigned int)units_count, units_desc);
148 | first_item = 0;
149 | }
150 |
151 | printf(" %12s, %8.2f %s per call, %8.4f %s/%s\n",
152 | impl->desc,
153 | (double)tbest / batch_size, cpu_units,
154 | ((double)tbest / batch_size) / units_count, cpu_units, units_desc
155 | );
156 | }
157 |
158 | if (impl->cpu_flags == CPUID_GENERIC)
159 | return 0;
160 | p += impl_size;
161 | }
162 | }
163 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpucycles_impl.inc:
--------------------------------------------------------------------------------
1 | #if defined(HAVE_GETTIMEOFDAY)
2 | #include <sys/time.h>
3 | #endif
4 |
5 | static cycles_t
6 | cpucycles_impl(void) { /* current time as a microsecond count when gettimeofday is available */
7 | #if defined(HAVE_GETTIMEOFDAY)
8 | struct timeval t;
9 | gettimeofday(&t, NULL);
10 | return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec;
11 | #else
12 | printf("no suitable timing mechanism found\n");
13 | exit(1); /* no timer at all: terminate the process */
14 | return 0;
15 | #endif
16 | }
17 |
18 | static const char *
19 | cpucycles_units_impl(void) { /* name of the unit cpucycles_impl counts in */
20 | #if defined(HAVE_GETTIMEOFDAY)
21 | return "us";
22 | #else
23 | return "";
24 | #endif
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpuid_flags.inc:
--------------------------------------------------------------------------------
1 | enum cpuid_flags_arm_t { /* bit flags describing the detected ARM cpu */
2 | CPUID_ARM = (1 << 0),
3 | CPUID_ARMv6 = (1 << 1),
4 | CPUID_ARMv7 = (1 << 2),
5 | CPUID_ARMv8 = (1 << 3),
6 | /* bits 4..17 are unused here; feature bits are allocated from the top down */
7 | CPUID_ASIMD = (1 << 18),
8 | CPUID_TLS = (1 << 19),
9 | CPUID_AES = (1 << 20),
10 | CPUID_PMULL = (1 << 21),
11 | CPUID_SHA1 = (1 << 22),
12 | CPUID_SHA2 = (1 << 23),
13 | CPUID_CRC32 = (1 << 24),
14 | CPUID_IWMMXT = (1 << 25),
15 | CPUID_IDIVT = (1 << 26),
16 | CPUID_IDIVA = (1 << 27),
17 | CPUID_VFP3D16 = (1 << 28),
18 | CPUID_VFP3 = (1 << 29),
19 | CPUID_VFP4 = (1 << 30),
20 | CPUID_NEON = (1 << 31) /* NOTE(review): shifts into the sign bit of int; relies on the compiler accepting this */
21 | };
22 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpuid_impl.inc:
--------------------------------------------------------------------------------
1 | #include "cpuid_impl_linux.inc"
2 | #include "cpuid_impl_msvc.inc"
3 | #include "cpuid_impl_netbsd.inc"
4 |
5 |
6 | static unsigned long
7 | cpuid_impl(void) { /* platform flags from cpuid_specific_impl, plus the flags implied by the ones found */
8 | unsigned long flags = cpuid_specific_impl();
9 | if (flags & CPUID_ARMv8) /* newer architecture levels imply the older ones */
10 | flags |= (CPUID_ARMv7 | CPUID_ARMv6);
11 | if (flags & CPUID_ARMv7)
12 | flags |= (CPUID_ARMv6);
13 | if (flags & CPUID_NEON)
14 | flags |= (CPUID_ARMv7 | CPUID_ARMv6 | CPUID_VFP3);
15 | /* vfp3d16 is used for both vfp3 & vfp4 */
16 | if (flags & CPUID_VFP3D16)
17 | flags &= ~(CPUID_VFP3 | CPUID_VFP4); /* also drops the VFP3 bit that the NEON rule above may have set */
18 | return flags;
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpuid_impl_linux.inc:
--------------------------------------------------------------------------------
1 | #if defined(__linux__)
2 |
3 | #include <stdio.h>
4 | #include <string.h>
5 | #include <fcntl.h>
6 | #include <unistd.h>
7 | #include <errno.h>
8 |
9 | #define CPUINFO_LINE_LENGTH 128
10 | /* values collected from the individual /proc/cpuinfo lines */
11 | typedef struct cpuid_flags_t {
12 | unsigned long processor; /* CPUID_ARMv* bits from the "Processor" line */
13 | unsigned long features; /* feature bits from the "Features" line(s) */
14 | unsigned long implementer; /* numeric "CPU implementer" */
15 | unsigned long arch; /* CPUID_ARMv* bits from the "CPU architecture" line */
16 | unsigned long variant; /* numeric "CPU variant" */
17 | unsigned long part; /* numeric "CPU part" */
18 | unsigned long revision; /* numeric "CPU revision" */
19 | } cpuid_flags_t;
20 | /* one substring -> flag mapping; tables end with a {NULL, 0} entry */
21 | typedef struct cpuid_flag_table_t {
22 | const char *str;
23 | unsigned long flag;
24 | } cpuid_flag_table_t;
25 |
26 | static const cpuid_flag_table_t features[] = { /* each name carries a trailing space, so it only matches when followed by one */
27 | {"tls ", CPUID_TLS},
28 | {"aes ", CPUID_AES},
29 | {"pmull ", CPUID_PMULL},
30 | {"sha1 ", CPUID_SHA1},
31 | {"sha2 ", CPUID_SHA2},
32 | {"crc32 ", CPUID_CRC32},
33 | {"iwmmxt ", CPUID_IWMMXT},
34 | {"idivt ", CPUID_IDIVT},
35 | {"idiva ", CPUID_IDIVA},
36 | {"vfpv3d16 ", CPUID_VFP3D16},
37 | {"vfpv3 ", CPUID_VFP3},
38 | {"vfpv4 ", CPUID_VFP4},
39 | {"neon ", CPUID_NEON},
40 | {"asimd ", CPUID_ASIMD},
41 | {NULL, 0}
42 | };
43 |
44 | /* that's an L, not a 1?? */
45 | static const cpuid_flag_table_t processors[] = {
46 | {"(v6l)", CPUID_ARMv6},
47 | {"(v7l)", CPUID_ARMv7},
48 | {"(aarch64)", CPUID_ARMv8},
49 | {NULL, 0}
50 | };
51 | /* matched as substrings by cpuid_scan, so "7" also matches inside "7M" */
52 | static const cpuid_flag_table_t archs[] = {
53 | {"6TEJ", CPUID_ARMv6},
54 | {"7", CPUID_ARMv7},
55 | {"7M", CPUID_ARMv7},
56 | {"AArch64", CPUID_ARMv8},
57 | {NULL, 0}
58 | };
59 |
60 | static const char *
61 | cpuid_ltrim(const char *line) {
62 | /* skip ahead to the ':' separator, or to the end of the string if there is none */
63 | for (; *line != '\0'; line++) {
64 | if (*line == ':') {
65 | /* step over the separator itself */
66 | line++;
67 | break;
68 | }
69 | }
70 | /* then drop any blanks/tabs in front of the value */
71 | while ((*line == ' ') || (*line == '\t'))
72 | line++;
73 | return line;
74 | }
75 |
76 | static unsigned long
77 | cpuid_parse_unsigned(const char *line) {
78 | unsigned long value = 0;
79 | /* parses the whole string as "0x"-prefixed hex or as decimal; any other character makes the result 0; overflow is not checked */
80 | if ((line[0] == '0') && (line[1] == 'x')) {
81 | for (line += 2; *line; line++) {
82 | unsigned long digit = *line;
83 | if ((digit - '0') < 10) /* unsigned wrap-around rejects characters below the range start */
84 | digit -= '0';
85 | else if ((digit - 'A') < 6) /* only A..F are hex letters */
86 | digit -= ('A' - 10);
87 | else if ((digit - 'a') < 6) /* only a..f are hex letters */
88 | digit -= ('a' - 10);
89 | else
90 | return 0;
91 | value = (value * 16) + digit;
92 | }
93 | } else {
94 | for (; *line; line++) {
95 | unsigned long digit = *line;
96 | if ((digit - '0') < 10)
97 | digit -= '0';
98 | else
99 | return 0;
100 | value = (value * 10) + digit;
101 | }
102 | }
103 |
104 | return value;
105 | }
106 |
107 | static unsigned long
108 | cpuid_scan(const char *line, const cpuid_flag_table_t *table) {
109 | unsigned long found = 0;
110 | /* walk the NULL-terminated table and OR in the flag of every entry whose string occurs in line */
111 | while (table->str != NULL) {
112 | if (strstr(line, table->str))
113 | found |= table->flag;
114 | table++;
115 | }
116 | return found;
117 | }
118 |
119 | /* classify one NUL-terminated cpuinfo line by its key prefix and store the parsed value in the matching field of flags; unknown keys are ignored */
120 | static void
121 | cpuid_line_parse(const char *line, cpuid_flags_t *flags) {
122 | const char *trimmed = cpuid_ltrim(line); /* text after the ':' and following blanks */
123 | if (strncmp(line, "Processor", 9) == 0)
124 | flags->processor = cpuid_scan(trimmed, processors);
125 | else if (strncmp(line, "Features", 8) == 0)
126 | flags->features |= cpuid_scan(trimmed, features); /* accumulated, unlike the other fields which are overwritten */
127 | else if (strncmp(line, "CPU implementer", 15) == 0)
128 | flags->implementer = cpuid_parse_unsigned(trimmed);
129 | else if (strncmp(line, "CPU architecture", 16) == 0)
130 | flags->arch = cpuid_scan(line + 16, archs); /* scans everything after the key rather than the trimmed value */
131 | else if (strncmp(line, "CPU variant", 11) == 0)
132 | flags->variant = cpuid_parse_unsigned(trimmed);
133 | else if (strncmp(line, "CPU part", 8) == 0)
134 | flags->part = cpuid_parse_unsigned(trimmed);
135 | else if (strncmp(line, "CPU revision", 12) == 0)
136 | flags->revision = cpuid_parse_unsigned(trimmed);
137 | }
138 |
139 | static int
140 | cpuid_line_length(char *line, int start, int end) {
141 | int pos;
142 | /* look for the first newline in [start, end) */
143 | for (pos = start; (pos < end) && (line[pos] != '\n'); pos++)
144 | ;
145 | if (pos >= end)
146 | return -1; /* no newline in range: the line is incomplete */
147 | line[pos] = 0; /* terminate the line in place */
148 | return pos - start;
149 | }
150 |
151 | /* parse /proc/cpuinfo in-place with no allocations */
152 | static unsigned long
153 | cpuid_specific_impl(void) {
154 | cpuid_flags_t flags = {0, 0, 0, 0, 0, 0, 0};
155 | int skip_to_next_line = 0; /* set while discarding the tail of a line longer than the buffer */
156 | int incomplete_bytes = 0; /* bytes of a partial line carried over at the front of the buffer */
157 |
158 | char line[CPUINFO_LINE_LENGTH];
159 | int fd;
160 |
161 | fd = open("/proc/cpuinfo", O_RDONLY);
162 | if (fd < 0)
163 | goto cpuid_specific_impl_done; /* unreadable cpuinfo: fall through with all fields zero */
164 |
165 | for (;;) {
166 | int cur_line_pos = 0;
167 | int bytes_read = read(fd, line + incomplete_bytes, CPUINFO_LINE_LENGTH - incomplete_bytes);
168 | int bytes_left;
169 | int cur_line_end;
170 |
171 | if (bytes_read <= 0) {
172 | if ((bytes_read < 0) && (errno == EINTR))
173 | continue; /* interrupted read: retry */
174 | goto cpuid_specific_impl_done; /* end of file or a real error */
175 | }
176 |
177 | bytes_left = bytes_read + incomplete_bytes;
178 | cur_line_end = bytes_left;
179 | incomplete_bytes = 0;
180 | while (bytes_left) {
181 | int line_length = cpuid_line_length(line, cur_line_pos, cur_line_end);
182 |
183 | /* if the line extends past the buffer.. */
184 | if (line_length < 0) {
185 | if (cur_line_pos == 0) {
186 | /* and it's larger than our buffer, skip it */
187 | skip_to_next_line = 1;
188 | } else {
189 | /* otherwise copy it to the front */
190 | memmove(line, line + cur_line_pos, CPUINFO_LINE_LENGTH - cur_line_pos);
191 | incomplete_bytes = bytes_left;
192 | line[incomplete_bytes] = 0;
193 | cur_line_pos = 0;
194 | }
195 |
196 | /* break out and read more */
197 | break;
198 | }
199 |
200 | /* found the end of a line, are we skipping until a new line? */
201 | if (!skip_to_next_line)
202 | cpuid_line_parse(line + cur_line_pos, &flags);
203 | else
204 | skip_to_next_line = 0; /* this was the tail of the oversized line; resume parsing with the next one */
205 |
206 | cur_line_pos += line_length + 1; /* + 1 steps over the terminator written by cpuid_line_length */
207 | bytes_left -= line_length + 1;
208 | }
209 | }
210 |
211 | cpuid_specific_impl_done:
212 | if (fd != -1)
213 | close(fd);
214 |
215 | /* trust processor over arch, see https://code.google.com/p/android/issues/detail?id=10812 */
216 | if (!flags.processor)
217 | flags.processor = flags.arch;
218 | /* implementer specific adjustments; only the Qualcomm case changes anything at the moment */
219 | switch (flags.implementer) {
220 | case 0x41: /* ARM */
221 | /* 0xb02: armv6k - mpcore */
222 | /* 0xb36: armv6j - arm1136j-s */
223 | /* 0xb56: armv6t2 - arm1156t2-s */
224 | /* 0xb76: armv6zk - arm1176jz-s */
225 | /* 0xc05: armv7-a - cortex-a5 */
226 | /* 0xc07: armv7ve - cortex-a7 */
227 | /* 0xc08: armv7-a - cortex-a8 */
228 | /* 0xc09: armv7-a - cortex-a9 */
229 | /* 0xc0d: armv7ve - cortex-a12 */
230 | /* 0xc0f: armv7ve - cortex-a15 */
231 | /* 0xc14: armv7-r - cortex-r4 */
232 | /* 0xc15: armv7-r - cortex-r5 */
233 | /* 0xc20: armv6-m - cortex-m0 */
234 | /* 0xc21: armv6-m - cortex-m1 */
235 | /* 0xc23: armv7-m - cortex-m3 */
236 | /* 0xc24: armv7e-m - cortex-m4 */
237 | /* 0xc60: armv6-m - cortex-m0+ */
238 | /* 0xd03: armv8-a - cortex-a53 */
239 | /* 0xd07: armv8-a - cortex-a57 */
240 | break;
241 |
242 | case 0x51: /* Qualcomm */
243 | /* 0x0d4: armv7-a - MSM8960 */
244 | /* 0x06f: armv7-a - APQ8064 */
245 |
246 | /* work around faulty neon implementation https://code.google.com/p/chromium/issues/detail?id=341598 */
247 | if ((flags.arch == CPUID_ARMv7) && (flags.variant == 1) && (flags.part == 0x4d) && (flags.revision == 0))
248 | flags.features &= ~CPUID_NEON;
249 | break;
250 |
251 | case 0x69: /* Intel */
252 | break;
253 |
254 | default:
255 | break;
256 | }
257 |
258 | return CPUID_ARM | flags.processor | flags.features;
259 | }
260 |
261 | #endif
262 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpuid_impl_msvc.inc:
--------------------------------------------------------------------------------
1 | #if defined(_MSC_VER)
2 |
3 | static unsigned long
4 | cpuid_specific_impl(void) { /* MSVC build: derive the flags from IsProcessorFeaturePresent */
5 | unsigned long flags = CPUID_ARM;
6 | /* each test is only compiled when the SDK headers define the corresponding PF_* constant */
7 | #define CPUID_TEST_FEATURE(feature, value) \
8 | if (IsProcessorFeaturePresent(feature)) \
9 | flags |= (value);
10 |
11 | #if defined(PF_NX_ENABLED)
12 | CPUID_TEST_FEATURE(PF_NX_ENABLED, CPUID_ARMv6)
13 | #endif
14 |
15 | #if defined(PF_ARM_V6)
16 | CPUID_TEST_FEATURE(PF_ARM_V6, CPUID_ARMv6)
17 | #endif
18 |
19 | #if defined(PF_ARM_V7)
20 | CPUID_TEST_FEATURE(PF_ARM_V7, CPUID_ARMv7)
21 | #endif
22 |
23 | #if defined(PF_ARM_DIVIDE_INSTRUCTION_AVAILABLE)
24 | CPUID_TEST_FEATURE(PF_ARM_DIVIDE_INSTRUCTION_AVAILABLE, CPUID_IDIVT | CPUID_IDIVA)
25 | #endif
26 |
27 | #if defined(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)
28 | CPUID_TEST_FEATURE(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, CPUID_NEON)
29 | #endif
30 |
31 | #if defined(PF_ARM_NEON)
32 | CPUID_TEST_FEATURE(PF_ARM_NEON, CPUID_NEON)
33 | #endif
34 |
35 | #if defined(PF_ARM_VFP_32_REGISTERS_AVAILABLE)
36 | CPUID_TEST_FEATURE(PF_ARM_VFP_32_REGISTERS_AVAILABLE, CPUID_VFP3 | CPUID_NEON)
37 | #endif
38 | /* NOTE(review): CPUID_TEST_FEATURE stays defined after this function */
39 | return flags;
40 | }
41 |
42 | #endif
43 |
44 |
--------------------------------------------------------------------------------
/framework/driver/arm/cpuid_impl_netbsd.inc:
--------------------------------------------------------------------------------
1 | #if defined(__NetBSD__)
2 |
3 | #include <sys/sysctl.h>
4 |
5 | static unsigned long
6 | cpuid_specific_impl(void) { /* NetBSD build: query two machdep sysctls */
7 | unsigned long flags = CPUID_ARM;
8 | size_t len;
9 | int flag;
10 | /* a flag is only added when the sysctl call succeeds (returns 0) and reports a non-zero value */
11 | len = sizeof(flag);
12 | if (!sysctlbyname("machdep.simdex_present", &flag, &len, NULL, 0) && flag)
13 | flags |= CPUID_ARMv6;
14 |
15 | len = sizeof(flag); /* reset: the previous call may have overwritten len */
16 | if (!sysctlbyname("machdep.neon_present", &flag, &len, NULL, 0) && flag)
17 | flags |= CPUID_NEON;
18 |
19 | /* should use machdep.cpu_id as well.. */
20 |
21 | return flags;
22 | }
23 |
24 | #endif
--------------------------------------------------------------------------------
/framework/driver/arm/gcc.inc:
--------------------------------------------------------------------------------
1 | #ifndef BASE_GCC_ARM_S
2 | #define BASE_GCC_ARM_S
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO)
7 | #error Unknown gnu as macro parameter convention! Run ./configure
8 | #endif
9 |
10 | .syntax unified
11 | .arm
12 |
13 | #define IS_ARM32 (defined(__arm__))
14 | #define IS_ARM64 (defined(__aarch64__))
15 | #define IS_ELF (defined(__ELF__))
16 | #define IS_MACH (defined(__MACH__))
17 | /* FN / FN_END / HIDDEN: function label, end-of-function bookkeeping and symbol visibility, per object format */
18 | #if (IS_ELF)
19 | .macro FN name
20 | .align 2
21 | \name: /* every function gets both the plain and the underscore-prefixed label */
22 | _\name:
23 | .endm
24 |
25 | .macro FN_END name
26 | .size \name, .-\name
27 | .type \name STT_FUNC
28 | .size _\name, .-_\name
29 | .type _\name STT_FUNC
30 | .endm
31 |
32 | .macro HIDDEN name
33 | #if defined(HAVE_AS_HIDDEN)
34 | .hidden \name
35 | .hidden _\name
36 | #endif
37 | .endm
38 |
39 | /* set NX for stack */
40 | .section .note.GNU-stack,"",%progbits
41 | #elif (IS_MACH)
42 | .macro FN name
43 | .align 2
44 | #if defined(HAVE_SLASHMACRO)
45 | \name:
46 | _\name:
47 | #elif defined(HAVE_DOLLARMACRO)
48 | $0:
49 | _$0:
50 | #endif
51 | .endm
52 | /* nothing to record at the end of a function for this format */
53 | .macro FN_END name
54 | .endm
55 |
56 | .macro HIDDEN name
57 | #if defined(HAVE_AS_PRIVATE_EXTERN)
58 | #if defined(HAVE_SLASHMACRO)
59 | .private_extern \name
60 | .private_extern _\name
61 | #elif defined(HAVE_DOLLARMACRO)
62 | .private_extern $0
63 | .private_extern _$0
64 | #endif
65 | #endif
66 | .endm
67 | #endif
68 |
69 | /* put everything in the code segment to simplify things */
70 | #if (IS_MACH)
71 | .macro SECTION_TEXT
72 | .section __TEXT,__text,regular
73 | .endm
74 | /* SECTION_RODATA selects the same section as SECTION_TEXT in both branches */
75 | .macro SECTION_RODATA
76 | .section __TEXT,__text,regular
77 | .endm
78 | #else
79 | /* put everything in the code segment to simplify things */
80 | .macro SECTION_TEXT
81 | .text
82 | .endm
83 |
84 | .macro SECTION_RODATA
85 | .text
86 | .endm
87 | #endif
88 |
89 | /* declare a global function */
90 | .macro GLOBAL name
91 | #if defined(HAVE_SLASHMACRO)
92 | .globl \name
93 | .globl _\name
94 | #elif defined(HAVE_DOLLARMACRO)
95 | .globl $0
96 | .globl _$0
97 | #endif
98 | .endm
99 | /* the *_LOCAL_PREFIX variants wrap the name in LOCAL_PREFIX() before forwarding to FN / FN_END / GLOBAL + HIDDEN */
100 | .macro FN_LOCAL_PREFIX name
101 | #if defined(HAVE_SLASHMACRO)
102 | FN LOCAL_PREFIX(\name)
103 | #elif defined(HAVE_DOLLARMACRO)
104 | FN LOCAL_PREFIX($0)
105 | #endif
106 | .endm
107 |
108 | .macro FN_END_LOCAL_PREFIX name
109 | #if defined(HAVE_SLASHMACRO)
110 | FN_END LOCAL_PREFIX(\name)
111 | #elif defined(HAVE_DOLLARMACRO)
112 | FN_END LOCAL_PREFIX($0)
113 | #endif
114 | .endm
115 |
116 | .macro GLOBAL_LOCAL_PREFIX name
117 | #if defined(HAVE_SLASHMACRO)
118 | GLOBAL LOCAL_PREFIX(\name)
119 | HIDDEN LOCAL_PREFIX(\name)
120 | #elif defined(HAVE_DOLLARMACRO)
121 | GLOBAL LOCAL_PREFIX($0)
122 | HIDDEN LOCAL_PREFIX($0)
123 | #endif
124 | .endm
125 | /* GLOBAL + HIDDEN + FN in one step: exported from the object file, hidden where the assembler supports it */
126 | .macro GLOBAL_HIDDEN_FN name
127 | #if defined(HAVE_SLASHMACRO)
128 | GLOBAL \name
129 | HIDDEN \name
130 | FN \name
131 | #elif defined(HAVE_DOLLARMACRO)
132 | GLOBAL $0
133 | HIDDEN $0
134 | FN $0
135 | #endif
136 | .endm
137 |
138 | /* pic support: load the address of var into reg with a pc-relative adrl (32 bit) or adr (64 bit) */
139 | .macro LOAD_VAR_PIC var, reg
140 | #if (IS_ARM32)
141 | #if defined(HAVE_SLASHMACRO)
142 | adrl \reg, \var
143 | #elif defined(HAVE_DOLLARMACRO)
144 | adrl $1, $0
145 | #endif
146 | #elif (IS_ARM64)
147 | #if defined(HAVE_SLASHMACRO)
148 | adr \reg, \var
149 | #elif defined(HAVE_DOLLARMACRO)
150 | adr $1, $0
151 | #endif
152 | #endif
153 | .endm
154 |
155 | #if defined(HAVE_SLASHMACRO)
156 | #define INCLUDE_FILE_PARM "\file"
157 | #elif defined(HAVE_DOLLARMACRO)
158 | #define INCLUDE_FILE_PARM $0
159 | #endif
160 | /* INCLUDE_FILE_PARM is how a macro refers to its file argument under either parameter convention */
161 | .macro INCLUDE file
162 | .include INCLUDE_FILE_PARM
163 | .endm
164 |
165 | /* include the file with the variable(s) if variable 'name' is not already included */
166 | .macro INCLUDE_VAR_FILE file, name
167 | #if defined(HAVE_SLASHMACRO)
168 | .ifndef \name
169 | .include INCLUDE_FILE_PARM
170 | .endif
171 | #elif defined(HAVE_DOLLARMACRO)
172 | .ifndef $1
173 | .include INCLUDE_FILE_PARM
174 | .endif
175 | #endif
176 | .endm
177 |
178 | #endif /* BASE_GCC_ARM_S */
179 |
--------------------------------------------------------------------------------
/framework/driver/cpucycles.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include "cpuid.h"
4 | #include "cpucycles.h"
5 |
6 |
7 | #include "cpucycles_impl.inc"
8 | /* public wrapper around the cpucycles_impl provided by the included cpucycles_impl.inc */
9 | cycles_t
10 | LOCAL_PREFIX(cpucycles)(void) {
11 | return cpucycles_impl();
12 | }
13 | /* public wrapper returning the unit string reported by cpucycles_units_impl */
14 | const char *LOCAL_PREFIX(cpucycles_units)(void) {
15 | return cpucycles_units_impl();
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/framework/driver/cpuid.c:
--------------------------------------------------------------------------------
1 | #include "cpuid.h"
2 |
3 | #include "cpuid_impl.inc"
4 |
5 | static unsigned long cpuid_flags = CPUID_GENERIC; /* cached detection result; CPUID_GENERIC doubles as "not detected yet" */
6 | static unsigned long cpuid_mask = ~(unsigned long)0; /* ANDed onto the result; never modified in the code shown here */
7 | /* returns the masked cpu flags, running cpuid_impl while the cached value is still CPUID_GENERIC */
8 | unsigned long
9 | LOCAL_PREFIX(cpuid)(void) {
10 | if (cpuid_flags == CPUID_GENERIC)
11 | cpuid_flags = cpuid_impl(); /* repeated on every call if detection itself yields CPUID_GENERIC */
12 | return cpuid_flags & cpuid_mask;
13 | }
14 | /* returns the first record of impls that the cpu supports and that passes test_fn, or NULL when none does; the list ends at the CPUID_GENERIC record */
15 | const void *
16 | LOCAL_PREFIX(cpu_select)(const void *impls, size_t impl_size, impl_test test_fn) {
17 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)();
18 | const unsigned char *p = (const unsigned char *)impls;
19 | for (;;) {
20 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p;
21 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) { /* all required flags are available */
22 | if (test_fn(impl) == 0)
23 | return impl;
24 | }
25 | if (impl->cpu_flags == CPUID_GENERIC)
26 | return NULL; /* even the generic record failed its test */
27 | p += impl_size; /* records are impl_size bytes apart */
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/framework/driver/gcc_driver.inc:
--------------------------------------------------------------------------------
1 | #ifndef GCC_DRIVER_INC
2 | #define GCC_DRIVER_INC
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if (defined(__i386__ ) || defined(__x86_64__))
7 | #include "x86/gcc.inc"
8 | #endif
9 |
10 | #if (defined(__arm__ ) || defined(__aarch64__))
11 | #include "arm/gcc.inc"
12 | #endif
13 |
14 | .macro INCLUDE_IF_X86_32BIT file
15 | #if (IS_X86_32) /* the preprocessor decides whether the macro body contains the .include at all */
16 | .include INCLUDE_FILE_PARM
17 | #endif
18 | .endm
19 |
20 | .macro INCLUDE_IF_X86_64BIT file
21 | #if (IS_X86_64)
22 | .include INCLUDE_FILE_PARM
23 | #endif
24 | .endm
25 |
26 |
27 | .macro INCLUDE_IF_MMX_32BIT file
28 | #if defined(HAVE_MMX) /* same pattern for every pair below: forward to the 32/64 bit include only when HAVE_<feature> is defined */
29 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
30 | #endif
31 | .endm
32 |
33 | .macro INCLUDE_IF_MMX_64BIT file
34 | #if defined(HAVE_MMX)
35 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
36 | #endif
37 | .endm
38 |
39 | /* SSE */
40 | .macro INCLUDE_IF_SSE_32BIT file
41 | #if defined(HAVE_SSE)
42 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
43 | #endif
44 | .endm
45 |
46 | .macro INCLUDE_IF_SSE_64BIT file
47 | #if defined(HAVE_SSE)
48 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
49 | #endif
50 | .endm
51 |
52 | /* SSE2 */
53 | .macro INCLUDE_IF_SSE2_32BIT file
54 | #if defined(HAVE_SSE2)
55 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
56 | #endif
57 | .endm
58 |
59 | .macro INCLUDE_IF_SSE2_64BIT file
60 | #if defined(HAVE_SSE2)
61 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
62 | #endif
63 | .endm
64 |
65 | /* SSE3 */
66 | .macro INCLUDE_IF_SSE3_32BIT file
67 | #if defined(HAVE_SSE3)
68 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
69 | #endif
70 | .endm
71 |
72 | .macro INCLUDE_IF_SSE3_64BIT file
73 | #if defined(HAVE_SSE3)
74 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
75 | #endif
76 | .endm
77 |
78 | /* SSSE3 */
79 | .macro INCLUDE_IF_SSSE3_32BIT file
80 | #if defined(HAVE_SSSE3)
81 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
82 | #endif
83 | .endm
84 |
85 | .macro INCLUDE_IF_SSSE3_64BIT file
86 | #if defined(HAVE_SSSE3)
87 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
88 | #endif
89 | .endm
90 |
91 | /* SSE4.1 */
92 | .macro INCLUDE_IF_SSE4_1_32BIT file
93 | #if defined(HAVE_SSE4_1)
94 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
95 | #endif
96 | .endm
97 |
98 | .macro INCLUDE_IF_SSE4_1_64BIT file
99 | #if defined(HAVE_SSE4_1)
100 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
101 | #endif
102 | .endm
103 |
104 | /* SSE4.2 */
105 | .macro INCLUDE_IF_SSE4_2_32BIT file
106 | #if defined(HAVE_SSE4_2)
107 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
108 | #endif
109 | .endm
110 |
111 | .macro INCLUDE_IF_SSE4_2_64BIT file
112 | #if defined(HAVE_SSE4_2)
113 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
114 | #endif
115 | .endm
116 |
117 | /* AVX */
118 | .macro INCLUDE_IF_AVX_32BIT file
119 | #if defined(HAVE_AVX)
120 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
121 | #endif
122 | .endm
123 |
124 | .macro INCLUDE_IF_AVX_64BIT file
125 | #if defined(HAVE_AVX)
126 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
127 | #endif
128 | .endm
129 |
130 | /* XOP */
131 | .macro INCLUDE_IF_XOP_32BIT file
132 | #if defined(HAVE_XOP)
133 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
134 | #endif
135 | .endm
136 |
137 | .macro INCLUDE_IF_XOP_64BIT file
138 | #if defined(HAVE_XOP)
139 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
140 | #endif
141 | .endm
142 |
143 | /* AVX2 */
144 | .macro INCLUDE_IF_AVX2_32BIT file
145 | #if defined(HAVE_AVX2)
146 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
147 | #endif
148 | .endm
149 |
150 | .macro INCLUDE_IF_AVX2_64BIT file
151 | #if defined(HAVE_AVX2)
152 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
153 | #endif
154 | .endm
155 |
156 | /* AVX512 */
157 | .macro INCLUDE_IF_AVX512_32BIT file
158 | #if defined(HAVE_AVX512)
159 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM
160 | #endif
161 | .endm
162 |
163 | .macro INCLUDE_IF_AVX512_64BIT file
164 | #if defined(HAVE_AVX512)
165 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM
166 | #endif
167 | .endm
168 |
169 | #endif /* GCC_DRIVER_INC */
170 |
171 |
--------------------------------------------------------------------------------
/framework/driver/generic/cpucycles_impl.inc:
--------------------------------------------------------------------------------
1 | #if defined(HAVE_GETTIMEOFDAY)
2 | #include <sys/time.h>
3 | #endif
4 | /* generic timer: wall clock time in microseconds from gettimeofday, or abort when no timer was configured */
5 | static cycles_t
6 | cpucycles_impl(void) {
7 | #if defined(HAVE_GETTIMEOFDAY)
8 |     struct timeval t;
9 |     gettimeofday(&t, NULL);
10 |     return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec; /* seconds scaled to us, plus the us part */
11 | #else
12 |     printf("no suitable timing mechanism found\n");
13 |     exit(1);
14 |     return 0; /* not reached; gives the function a return value on this path */
15 | #endif
16 | }
17 |
18 | static const char *
19 | cpucycles_units_impl(void) {
20 | #if !defined(HAVE_GETTIMEOFDAY)
21 |     return ""; /* no timer was configured, so there is no unit to name */
22 | #else
23 |     return "us"; /* the gettimeofday based counter is expressed in microseconds */
24 | #endif
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/framework/driver/generic/cpuid_flags.inc:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/framework/driver/generic/cpuid_impl.inc:
--------------------------------------------------------------------------------
1 | static unsigned long long
2 | cpuid_impl(void) {
3 | return CPUID_GENERIC;
4 | }
5 |
--------------------------------------------------------------------------------
/framework/driver/x86/cpucycles_impl.inc:
--------------------------------------------------------------------------------
1 | typedef cycles_t (*cpucycles_x86_fn)(void);
2 |
3 | extern cycles_t LOCAL_PREFIX(cpucycles_x86)(void);
4 | static cycles_t cpucycles_select(void);
5 |
6 | static cpucycles_x86_fn cpucycles_impl = cpucycles_select;
7 |
8 | #if defined(HAVE_GETTIMEOFDAY)
9 | #include <sys/time.h>
10 | /* timer used when cpuid does not report rdtsc: wall clock time in microseconds */
11 | static cycles_t
12 | cpucycles_x86_fallback(void) {
13 |     struct timeval t;
14 |     gettimeofday(&t, NULL);
15 |     return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec; /* seconds scaled to us, plus the us part */
16 | }
17 | #else
18 | /* what can a 386/486 use for this otherwise? */
19 | static cycles_t
20 | cpucycles_x86_fallback(void) {
21 |     printf("no suitable timing mechanism found\n");
22 |     exit(1); /* terminates the process, so no value is produced on this path */
23 | }
24 | #endif
25 |
26 | static cycles_t
27 | cpucycles_select(void) { /* initial value of cpucycles_impl: picks the real timer, stores it in cpucycles_impl, then calls it */
28 | cpucycles_impl = (LOCAL_PREFIX(cpuid)() & CPUID_RDTSC) ? LOCAL_PREFIX(cpucycles_x86) : cpucycles_x86_fallback; /* rdtsc routine when cpuid reports CPUID_RDTSC, else the fallback */
29 | return cpucycles_impl(); /* answer this first call through the timer just selected */
30 | }
31 |
32 | static const char *
33 | cpucycles_units_impl(void) {
34 | #if defined(HAVE_GETTIMEOFDAY)
35 |     const char *fallback_units = "us";
36 | #else
37 |     const char *fallback_units = "";
38 | #endif
39 |     /* the unit string follows the same CPUID_RDTSC test that cpucycles_select uses to pick the timer */
40 |     const int have_rdtsc = (LOCAL_PREFIX(cpuid)() & CPUID_RDTSC) != 0;
41 |     return have_rdtsc ? "cycles" : fallback_units;
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/framework/driver/x86/cpuid_flags.inc:
--------------------------------------------------------------------------------
1 | enum cpuid_flags_x86_t {
2 | CPUID_X86 = (1 << 0),
3 | CPUID_MMX = (1 << 1),
4 | CPUID_SSE = (1 << 2),
5 | CPUID_SSE2 = (1 << 3),
6 | CPUID_SSE3 = (1 << 4),
7 | CPUID_SSSE3 = (1 << 5),
8 | CPUID_SSE4_1 = (1 << 6),
9 | CPUID_SSE4_2 = (1 << 7),
10 | CPUID_AVX = (1 << 8),
11 | CPUID_XOP = (1 << 9),
12 | CPUID_AVX2 = (1 << 10),
13 | CPUID_AVX512 = (1 << 11),
14 |
15 | CPUID_RDTSC = (1 << 25),
16 | CPUID_RDRAND = (1 << 26),
17 | CPUID_POPCNT = (1 << 27),
18 | CPUID_FMA4 = (1 << 28),
19 | CPUID_FMA3 = (1 << 29),
20 | CPUID_PCLMULQDQ = (1 << 30),
21 | CPUID_AES = (1 << 31)
22 | };
23 |
24 |
--------------------------------------------------------------------------------
/framework/driver/x86/cpuid_impl.inc:
--------------------------------------------------------------------------------
1 | extern uint32_t LOCAL_PREFIX(cpuid_x86)(void);
2 |
3 | static uint32_t
4 | cpuid_impl(void) {
5 | return LOCAL_PREFIX(cpuid_x86)();
6 | }
7 |
8 |
--------------------------------------------------------------------------------
/framework/driver/x86/driver.S:
--------------------------------------------------------------------------------
1 | #if defined(__GNUC__)
2 | #include "gcc_driver.inc"
3 | #else
4 | ;.if 0
5 | %include "yasm_driver.inc"
6 | ;.endif
7 | #endif
8 |
9 | SECTION_TEXT
10 |
11 | GLOBAL_LOCAL_PREFIX cpuid_x86
12 | FN_LOCAL_PREFIX cpuid_x86
13 | CPUID_PROLOGUE
14 |
15 | /* use esi for flags */
16 | movl $(CPUID_X86), %esi
17 |
18 | /* cpuid 0 */
19 | movl $0, %eax
20 | xorl %ecx, %ecx
21 | cpuid
22 |
23 | /* eax = max level, store in edi */
24 | movl %eax, %edi
25 |
26 | /* cpus with >=2 cpuid levels support rdtsc */
27 | cmpl $2, %edi
28 | jb 1f
29 | orl $(CPUID_RDTSC), %esi
30 | 1:
31 |
32 | testl $0x00000500, %edi
33 | jz Lcpuid_x86_notp5
34 |
35 | /* Intel P5 pre-B0, only MMX */
36 | orl $(CPUID_MMX), %esi
37 | jmp Lcpuid_x86_done
38 |
39 | Lcpuid_x86_notp5:
40 |
41 | /* cpuid 1 */
42 | movl $1, %eax
43 | xorl %ecx, %ecx
44 | cpuid
45 |
46 | /* rdrand */
47 | testl $(1 << 30), %ecx
48 | jz 1f
49 | orl $(CPUID_RDRAND), %esi
50 | 1:
51 |
52 | /* aes */
53 | testl $(1 << 25), %ecx
54 | jz 1f
55 | orl $(CPUID_AES), %esi
56 | 1:
57 |
58 | /* popcnt */
59 | testl $(1 << 23), %ecx
60 | jz 1f
61 | orl $(CPUID_POPCNT), %esi
62 | 1:
63 |
64 | /* fma3 */
65 | testl $(1 << 12), %ecx
66 | jz 1f
67 | orl $(CPUID_FMA3), %esi
68 | 1:
69 |
70 | /* pclmulqdq */
71 | testl $(1 << 1), %ecx
72 | jz 1f
73 | orl $(CPUID_PCLMULQDQ), %esi
74 | 1:
75 |
76 | /* SSE4.2 */
77 | testl $(1 << 20), %ecx
78 | jz 1f
79 | orl $(CPUID_SSE4_2), %esi
80 | 1:
81 |
82 | /* SSE4.1 */
83 | testl $(1 << 19), %ecx
84 | jz 1f
85 | orl $(CPUID_SSE4_1), %esi
86 | 1:
87 |
88 | /* SSSE3 */
89 | testl $(1 << 9), %ecx
90 | jz 1f
91 | orl $(CPUID_SSSE3), %esi
92 | 1:
93 |
94 | /* SSE3 */
95 | testl $(1 ), %ecx
96 | jz 1f
97 | orl $(CPUID_SSE3), %esi
98 | 1:
99 |
100 | /* SSE2 */
101 | testl $(1 << 26), %edx
102 | jz 1f
103 | orl $(CPUID_SSE2), %esi
104 | 1:
105 |
106 | /* SSE */
107 | testl $(1 << 25), %edx
108 | jz 1f
109 | orl $(CPUID_SSE), %esi
110 | 1:
111 |
112 | /* MMX */
113 | testl $(1 << 23), %edx
114 | jz 1f
115 | orl $(CPUID_MMX), %esi
116 | 1:
117 |
118 | /* test for xsave enabled by os */
119 | testl $(1 << 27), %ecx
120 | jz Lcpuid_x86_skipavxplus
121 |
122 | /* test for avx supported by cpu */
123 | testl $(1 << 28), %ecx
124 | jz Lcpuid_x86_skipavxplus
125 |
126 | /* xgetbv(0) */
127 | xorl %ecx, %ecx
128 | .byte 0x0f, 0x01, 0xd0
129 |
130 | /* save XCR0 in scratch(ebp) */
131 | movl %eax, %ebp
132 |
133 | /* XCR0 & (XMM | YMM) */
134 | andl $((1 << 2) | (1 << 1)), %eax
135 | cmpl $((1 << 2) | (1 << 1)), %eax
136 | jne Lcpuid_x86_skipavxplus
137 |
138 | /* AVX is ok to use */
139 | orl $(CPUID_AVX), %esi
140 |
141 | /* check for max level >= 7 */
142 | cmpl $7, %edi
143 | jb Lcpuid_x86_cpuid_below_7
144 |
145 | /* cpuid 7 */
146 | movl $7, %eax
147 | xorl %ecx, %ecx
148 | cpuid
149 |
150 | /* AVX2 */
151 | testl $(1 << 5), %ebx
152 | jz 1f
153 | orl $(CPUID_AVX2), %esi
154 | 1:
155 |
156 | /* XCR0 & (OPMASK | ZMMUPPER | ZMMEXTENDED) */
157 | andl $((1 << 5) | (1 << 6) | (1 << 7)), %ebp
158 | cmpl $((1 << 5) | (1 << 6) | (1 << 7)), %ebp
159 | jne Lcpuid_x86_skipavx512
160 |
161 | /* AVX-512 */
162 | testl $(1 << 16), %ebx
163 | jz 1f
164 | orl $(CPUID_AVX512), %esi
165 | 1:
166 |
167 | Lcpuid_x86_skipavx512:
168 |
169 | Lcpuid_x86_cpuid_below_7:
170 |
171 | /* cpuid 0x80000000 */
172 | movl $0x80000000, %eax
173 | xorl %ecx, %ecx
174 | cpuid
175 |
176 | /* eax = max extended level */
177 | cmpl $0x80000001, %eax
178 | jb Lcpuid_x86_skipxopplus
179 |
180 | /* cpuid $0x80000001 */
181 | movl $0x80000001, %eax
182 | xorl %ecx, %ecx
183 | cpuid
184 |
185 | /* fma4 */
186 | testl $(1 << 16), %ecx
187 | jz 1f
188 | orl $(CPUID_FMA4), %esi
189 | 1:
190 |
191 | /* XOP */
192 | testl $(1 << 11), %ecx
193 | jz 1f
194 | orl $(CPUID_XOP), %esi
195 | 1:
196 |
197 | Lcpuid_x86_skipxopplus:
198 |
199 | Lcpuid_x86_skipavxplus:
200 |
201 | Lcpuid_x86_done:
202 | movl %esi, %eax
203 |
204 | CPUID_EPILOGUE
205 | FN_END_LOCAL_PREFIX cpuid_x86
206 |
207 |
208 |
209 | GLOBAL_LOCAL_PREFIX cpucycles_x86
210 | FN_LOCAL_PREFIX cpucycles_x86
211 |
212 | CPUCYCLES
213 |
214 | FN_END_LOCAL_PREFIX cpucycles_x86
215 |
--------------------------------------------------------------------------------
/framework/driver/x86/gcc.inc:
--------------------------------------------------------------------------------
1 | #ifndef BASE_GCC_X86_S
2 | #define BASE_GCC_X86_S
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO)
7 | #error Unknown gnu as macro parameter convention! Run ./configure
8 | #endif
9 |
10 | #define IS_X86_32 (defined(__i386__))
11 | #define IS_X86_64 (defined(__x86_64__))
12 | #define IS_ELF (defined(__ELF__))
13 | #define IS_MACH (defined(__MACH__))
14 | #define IS_WIN32 (IS_X86_32 && (defined(_WIN32) || defined(__CYGWIN__)))
15 | #define IS_WIN64 (IS_X86_64 && (defined(_WIN64) || defined(__CYGWIN__)))
16 |
17 | #if (IS_WIN64)
18 | /* handles 0-6 arguments and optional saving of the upper 8 xmm registers */
19 | .macro WIN64STUBFN name, args, xmmused
20 | \name:; _\name:; /* entry labels, with and without a leading underscore */
21 |
22 | subq $(184), %rsp /* frame: 160 bytes of xmm slots (0..159) + 16 bytes for rdi/rsi (160..175); the last 8 bytes are never referenced */
23 | movdqa %xmm6, 0(%rsp)
24 | movdqa %xmm7, 16(%rsp)
25 | .if \xmmused > 8 /* xmm8-xmm15 are only spilled when the caller of the macro declares more than 8 xmm registers in use */
26 | movdqa %xmm8, 32(%rsp)
27 | movdqa %xmm9, 48(%rsp)
28 | movdqa %xmm10, 64(%rsp)
29 | movdqa %xmm11, 80(%rsp)
30 | movdqa %xmm12, 96(%rsp)
31 | movdqa %xmm13, 112(%rsp)
32 | movdqa %xmm14, 128(%rsp)
33 | movdqa %xmm15, 144(%rsp)
34 | .endif
35 | movq %rdi, 160(%rsp) /* rdi and rsi are saved here and reloaded before the ret below */
36 | movq %rsi, 168(%rsp)
37 | movq %rcx, %rdi /* move the register arguments: rcx, rdx, r8, r9 -> rdi, rsi, rdx, rcx */
38 | movq %rdx, %rsi
39 | movq %r8, %rdx
40 | movq %r9, %rcx
41 | .if \args >= 5
42 | movq 224(%rsp), %r8 /* 224 = 184 (this frame) + 40 above the entry rsp; presumably return address + 32 byte shadow space, i.e. the 5th argument - confirm against the Win64 ABI */
43 | .endif
44 | .if \args >= 6
45 | movq 232(%rsp), %r9 /* next 8 byte stack slot after the one loaded into r8 */
46 | .endif
47 | call thunk_\name /* runs the code that follows the thunk_ label emitted at the end of this macro */
48 | movdqa 0(%rsp), %xmm6 /* restore everything that was spilled above, in the same layout */
49 | movdqa 16(%rsp), %xmm7
50 | .if \xmmused > 8
51 | movdqa 32(%rsp), %xmm8
52 | movdqa 48(%rsp), %xmm9
53 | movdqa 64(%rsp), %xmm10
54 | movdqa 80(%rsp), %xmm11
55 | movdqa 96(%rsp), %xmm12
56 | movdqa 112(%rsp), %xmm13
57 | movdqa 128(%rsp), %xmm14
58 | movdqa 144(%rsp), %xmm15
59 | .endif
60 | movq 160(%rsp), %rdi
61 | movq 168(%rsp), %rsi
62 | addq $(184), %rsp
63 | ret
64 | thunk_\name: /* whatever is assembled after the macro invocation becomes the body reached by the call above */
65 | .endm
66 |
67 | .macro FN name
68 | WIN64STUBFN \name, 4, 16
69 | .endm
70 |
71 | .macro FN_EXT name, args, xmmused
72 | WIN64STUBFN \name, \args, \xmmused
73 | .endm
74 |
75 | .macro FN_END name
76 | .endm
77 |
78 | .macro HIDDEN name
79 | .endm
80 | #elif (IS_WIN32)
81 | .macro FN name
82 | \name:
83 | _\name:
84 | .endm
85 |
86 | .macro FN_EXT name, args, xmmused
87 | FN \name
88 | .endm
89 |
90 | .macro FN_END name
91 | .endm
92 |
93 | .macro HIDDEN name
94 | .endm
95 | #elif (IS_ELF)
96 | .macro FN name
97 | \name:
98 | _\name:
99 | .endm
100 |
101 | .macro FN_EXT name, args, xmmused
102 | FN \name
103 | .endm
104 |
105 | .macro FN_END name
106 | .size \name, .-\name
107 | .size _\name, .-_\name
108 | .type \name, @function
109 | .type _\name, @function
110 | .endm
111 |
112 | .macro HIDDEN name
113 | #if defined(HAVE_AS_HIDDEN)
114 | .hidden \name
115 | .hidden _\name
116 | #endif
117 | .endm
118 |
119 | /* set NX for stack */
120 | .section .note.GNU-stack,"",@progbits
121 | #elif (IS_MACH)
122 | .macro FN name
123 | #if defined(HAVE_SLASHMACRO)
124 | \name:
125 | _\name:
126 | #elif defined(HAVE_DOLLARMACRO)
127 | $0:
128 | _$0:
129 | #endif
130 | .endm
131 |
132 | .macro FN_EXT name, args, xmmused
133 | #if defined(HAVE_SLASHMACRO)
134 | FN \name
135 | #elif defined(HAVE_DOLLARMACRO)
136 | FN $0
137 | #endif
138 | .endm
139 |
140 | .macro FN_END name
141 | .endm
142 |
143 | .macro HIDDEN name
144 | #if defined(HAVE_AS_PRIVATE_EXTERN)
145 | #if defined(HAVE_SLASHMACRO)
146 | .private_extern \name
147 | .private_extern _\name
148 | #elif defined(HAVE_DOLLARMACRO)
149 | .private_extern $0
150 | .private_extern _$0
151 | #endif
152 | #endif
153 | .endm
154 | #endif
155 |
156 | /* put everything in the code segment to simplify things */
157 | #if (IS_MACH)
158 | .macro SECTION_TEXT
159 | .section __TEXT,__text,regular
160 | .endm
161 |
162 | .macro SECTION_RODATA
163 | .section __TEXT,__text,regular
164 | .endm
165 | #else
166 | /* put everything in the code segment to simplify things */
167 | .macro SECTION_TEXT
168 | .text
169 | .endm
170 |
171 | .macro SECTION_RODATA
172 | .text
173 | .endm
174 | #endif
175 |
176 | /* declare a global function */
177 | .macro GLOBAL name
178 | #if defined(HAVE_SLASHMACRO)
179 | .globl \name
180 | .globl _\name
181 | #elif defined(HAVE_DOLLARMACRO)
182 | .globl $0
183 | .globl _$0
184 | #endif
185 | .endm
186 |
187 | .macro FN_LOCAL_PREFIX name
188 | #if defined(HAVE_SLASHMACRO)
189 | FN LOCAL_PREFIX(\name)
190 | #elif defined(HAVE_DOLLARMACRO)
191 | FN LOCAL_PREFIX($0)
192 | #endif
193 | .endm
194 |
195 | .macro FN_EXT_LOCAL_PREFIX name, args, xmmused
196 | #if defined(HAVE_SLASHMACRO)
197 | FN_EXT LOCAL_PREFIX(\name), \args, \xmmused
198 | #elif defined(HAVE_DOLLARMACRO)
199 | FN_EXT LOCAL_PREFIX($0), $1, $2
200 | #endif
201 | .endm
202 |
203 | .macro FN_END_LOCAL_PREFIX name
204 | #if defined(HAVE_SLASHMACRO)
205 | FN_END LOCAL_PREFIX(\name)
206 | #elif defined(HAVE_DOLLARMACRO)
207 | FN_END LOCAL_PREFIX($0)
208 | #endif
209 | .endm
210 |
211 | .macro GLOBAL_LOCAL_PREFIX name
212 | #if defined(HAVE_SLASHMACRO)
213 | GLOBAL LOCAL_PREFIX(\name)
214 | HIDDEN LOCAL_PREFIX(\name)
215 | #elif defined(HAVE_DOLLARMACRO)
216 | GLOBAL LOCAL_PREFIX($0)
217 | HIDDEN LOCAL_PREFIX($0)
218 | #endif
219 | .endm
220 |
221 | .macro GLOBAL_HIDDEN_FN name
222 | #if defined(HAVE_SLASHMACRO)
223 | GLOBAL \name
224 | HIDDEN \name
225 | FN \name
226 | #elif defined(HAVE_DOLLARMACRO)
227 | GLOBAL $0
228 | HIDDEN $0
229 | FN $0
230 | #endif
231 | .endm
232 |
233 | .macro GLOBAL_HIDDEN_FN_EXT name, args, xmmused
234 | #if defined(HAVE_SLASHMACRO)
235 | GLOBAL \name
236 | HIDDEN \name
237 | FN_EXT \name, \args, \xmmused
238 | #elif defined(HAVE_DOLLARMACRO)
239 | GLOBAL $0
240 | HIDDEN $0
241 | FN_EXT $0, $1, $2
242 | #endif
243 | .endm
244 |
245 | /* pic support */
246 | .macro LOAD_VAR_PIC var, reg
247 | #if (IS_X86_32)
248 | #if defined(HAVE_SLASHMACRO)
249 | call 1f
250 | 1:
251 | popl \reg
252 | leal \var - 1b(\reg), \reg
253 | #elif defined(HAVE_DOLLARMACRO)
254 | call 1f
255 | 1:
256 | popl $1
257 | leal $0 - 1b($1), $1
258 | #endif
259 | #else
260 | #if defined(HAVE_SLASHMACRO)
261 | leaq \var(%rip), \reg
262 | #elif defined(HAVE_DOLLARMACRO)
263 | leaq $0(%rip), $1
264 | #endif
265 | #endif
266 | .endm
267 |
268 | #if defined(HAVE_SLASHMACRO)
269 | #define INCLUDE_FILE_PARM "\file"
270 | #elif defined(HAVE_DOLLARMACRO)
271 | #define INCLUDE_FILE_PARM $0
272 | #endif
273 |
274 |
275 | .macro INCLUDE file
276 | .include INCLUDE_FILE_PARM
277 | .endm
278 |
279 | /* include the file with the variable(s) if variable 'name' is not already included */
280 | .macro INCLUDE_VAR_FILE file, name
281 | #if defined(HAVE_SLASHMACRO)
282 | .ifndef \name
283 | .include INCLUDE_FILE_PARM
284 | .endif
285 | #elif defined(HAVE_DOLLARMACRO)
286 | .ifndef $1
287 | .include INCLUDE_FILE_PARM
288 | .endif
289 | #endif
290 | .endm
291 |
292 | /* stupid helpers so we can have cpuid in one file */
293 |
294 | .macro CPUID_PROLOGUE /* saves ebx/esi/edi/ebp (rbx/rsi/rdi/rbp on 64-bit); 32-bit builds also leave early when cpuid is unavailable */
295 | #if (IS_X86_32)
296 | pushl %ebx
297 | pushl %esi
298 | pushl %edi
299 | pushl %ebp
300 | movl $(0), %esi /* flags start at 0 (CPUID_GENERIC): the early exit below returns esi, which would otherwise still hold the caller's value */
301 | /* check that cpuid is supported */
302 | pushfl
303 | popl %eax /* eax = current eflags */
304 | movl %eax, %ecx /* ecx = original eflags, put back below */
305 | xorl $(0x200000), %eax /* flip bit 21 */
306 | pushl %eax
307 | popfl
308 | pushfl
309 | popl %eax /* read eflags back */
310 | xorl %ecx, %eax
311 | shrl $(21), %eax
312 | andl $(1), %eax /* eax = 1 only if bit 21 really changed */
313 | pushl %ecx
314 | popfl /* restore the original eflags */
315 | andl %eax, %eax
316 | jz Lcpuid_x86_done /* bit 21 could not be toggled: skip all cpuid queries */
317 | #else
318 | pushq %rbx
319 | pushq %rsi
320 | pushq %rdi
321 | pushq %rbp
322 | #endif
323 | .endm
324 |
325 | .macro CPUID_EPILOGUE
326 | #if (IS_X86_32)
327 | popl %ebp
328 | popl %edi
329 | popl %esi
330 | popl %ebx
331 | #else
332 | popq %rbp
333 | popq %rdi
334 | popq %rsi
335 | popq %rbx
336 | #endif
337 | ret
338 | .endm
339 |
340 | .macro CPUCYCLES
341 | rdtsc
342 | #if (IS_X86_64)
343 | shlq $(32), %rdx
344 | orq %rdx, %rax
345 | #endif
346 | ret
347 | .endm
348 |
349 |
350 | /* Macros for CPUID only */
351 |
352 | #define CPUID_GENERIC (0 )
353 | #define CPUID_X86 (1 << 0)
354 | #define CPUID_MMX (1 << 1)
355 | #define CPUID_SSE (1 << 2)
356 | #define CPUID_SSE2 (1 << 3)
357 | #define CPUID_SSE3 (1 << 4)
358 | #define CPUID_SSSE3 (1 << 5)
359 | #define CPUID_SSE4_1 (1 << 6)
360 | #define CPUID_SSE4_2 (1 << 7)
361 | #define CPUID_AVX (1 << 8)
362 | #define CPUID_XOP (1 << 9)
363 | #define CPUID_AVX2 (1 << 10)
364 | #define CPUID_AVX512 (1 << 11)
365 |
366 | #define CPUID_RDTSC (1 << 25)
367 | #define CPUID_RDRAND (1 << 26)
368 | #define CPUID_POPCNT (1 << 27)
369 | #define CPUID_FMA4 (1 << 28)
370 | #define CPUID_FMA3 (1 << 29)
371 | #define CPUID_PCLMULQDQ (1 << 30)
372 | #define CPUID_AES (1 << 31)
373 |
374 | #endif /* BASE_GCC_X86_S */
375 |
--------------------------------------------------------------------------------
/framework/driver/x86/yasm.inc:
--------------------------------------------------------------------------------
1 | %ifndef BASE_YASM
2 | %define BASE_YASM
3 |
4 | ; 1.1.0 and earlier incorrectly parsed movsw/movzw in gas mode: https://github.com/yasm/yasm/commit/2678cb3c3a42b3870a209ed8de38c1a16449695a
5 | %if (__YASM_VERSION_ID__ < 01020000h) ; 1.2.0
6 | %error Requires Yasm 1.2.0 or higher
7 | %endif
8 |
9 | %define HAVE_XOP 0
10 | %define HAVE_AVX2 0
11 | %define HAVE_AVX512 0
12 |
13 | %if (__YASM_VERSION_ID__ >= 01000000h) ; 1.0.0
14 | %define HAVE_XOP 1
15 | %endif
16 |
17 | %if (__YASM_VERSION_ID__ >= 01020000h) ; 1.2.0
18 | %define HAVE_AVX2 1
19 | %endif
20 |
21 | %if (__YASM_VERSION_ID__ >= 999999999) ; avx-512 isn't supported yet
22 | %define HAVE_AVX512 1
23 | %endif
24 |
25 |
26 | %define BITS32 0
27 | %define BITS64 0
28 | %define WIN 0
29 | %define ELF 0
30 | %define MACH 0
31 |
32 | %ifidn __YASM_OBJFMT__, win32
33 | %define BITS32 1
34 | %define WIN 1
35 | %elifidn __YASM_OBJFMT__, elf
36 | %error Specify bits with -f [elf32,elf64]
37 | %elifidn __YASM_OBJFMT__, elf32
38 | %define BITS32 1
39 | %define ELF 1
40 | %elifidn __YASM_OBJFMT__, macho
41 | %error Specify bits with -f [macho32,macho64]
42 | %elifidn __YASM_OBJFMT__, macho32
43 | %define BITS32 1
44 | %define MACH 1
45 | %elifidn __YASM_OBJFMT__, win64
46 | %define BITS64 1
47 | %define WIN 1
48 | %elifidn __YASM_OBJFMT__, x64
49 | %define BITS64 1
50 | %define WIN 1
51 | %elifidn __YASM_OBJFMT__, elf64
52 | %define BITS64 1
53 | %define ELF 1
54 | %elifidn __YASM_OBJFMT__, macho64
55 | %define BITS64 1
56 | %define MACH 1
57 | %else
58 | %error "Unable to determine output format"
59 | %endif
60 |
61 | %if (WIN)
62 | %if (BITS64)
63 | ; name, args, xmmused
64 | %macro win64stubfn 3
65 | %1:
66 | _ %+ %1:
67 |
68 | subq $184, %rsp
69 | movdqa %xmm6, 0(%rsp)
70 | movdqa %xmm7, 16(%rsp)
71 | %if (%3 > 8)
72 | movdqa %xmm8, 32(%rsp)
73 | movdqa %xmm9, 48(%rsp)
74 | movdqa %xmm10, 64(%rsp)
75 | movdqa %xmm11, 80(%rsp)
76 | movdqa %xmm12, 96(%rsp)
77 | movdqa %xmm13, 112(%rsp)
78 | movdqa %xmm14, 128(%rsp)
79 | movdqa %xmm15, 144(%rsp)
80 | %endif
81 | movq %rdi, 160(%rsp)
82 | movq %rsi, 168(%rsp)
83 | movq %rcx, %rdi
84 | movq %rdx, %rsi
85 | movq %r8, %rdx
86 | movq %r9, %rcx
87 | %if (%2 >= 5)
88 | movq 224(%rsp), %r8
89 | %endif
90 | %if (%2 >= 6)
91 | movq 232(%rsp), %r9
92 | %endif
93 | call thunk_ %+ %1
94 | movdqa 0(%rsp), %xmm6
95 | movdqa 16(%rsp), %xmm7
96 | %if (%3 > 8)
97 | movdqa 32(%rsp), %xmm8
98 | movdqa 48(%rsp), %xmm9
99 | movdqa 64(%rsp), %xmm10
100 | movdqa 80(%rsp), %xmm11
101 | movdqa 96(%rsp), %xmm12
102 | movdqa 112(%rsp), %xmm13
103 | movdqa 128(%rsp), %xmm14
104 | movdqa 144(%rsp), %xmm15
105 | %endif
106 | movq 160(%rsp), %rdi
107 | movq 168(%rsp), %rsi
108 | addq $184, %rsp
109 | ret
110 | thunk_ %+ %1:
111 | %endmacro
112 |
113 | ; FN name
114 | %macro FN 1
115 | win64stubfn %1, 4, 16
116 | %endmacro
117 |
118 | ; FN_EXT name, args, xmmused
119 | %macro FN_EXT 3
120 | win64stubfn %1, %2, %3
121 | %endmacro
122 |
123 | ; FN_END name
124 | %macro FN_END 1
125 | %endmacro
126 | %else
127 | ; FN name
128 | %macro FN 1
129 | %1:
130 | _ %+ %1:
131 | %endmacro
132 |
133 | ; FN_EXT name, args, xmmused
134 | %macro FN_EXT 3
135 | %1:
136 | _ %+ %1:
137 | %endmacro
138 |
139 | ; FN_END name
140 | %macro FN_END 1
141 | %endmacro
142 | %endif
143 |
144 | %macro HIDDEN 1
145 | %endmacro
146 | %elif (ELF)
147 | ; FN name
148 | %macro FN 1
149 | %1:
150 | _ %+ %1:
151 | %endmacro
152 |
153 | ; FN_EXT name, args, xmmused
154 | %macro FN_EXT 3
155 | %1:
156 | _ %+ %1:
157 | %endmacro
158 |
159 | ; FN_END name
160 | %macro FN_END 1
161 | .size %1, .-%1
162 | .type %1, @function
163 | %endmacro
164 |
165 | ; declares a global is hidden: HIDDEN name
166 | %macro HIDDEN 1
167 | %if (__YASM_VERSION_ID__ >= 09999999h) ; .hidden isn't in yasm yet?
168 | .hidden %1
169 | .hidden _ %+ %1
170 | %endif
171 | %endmacro
172 |
173 | ; set NX for stack
174 | .section .note.GNU-stack,"",@progbits
175 | %elif (MACH)
176 | ; FN name
177 | %macro FN 1
178 | %1:
179 | _ %+ %1:
180 | %endmacro
181 |
182 | ; FN_EXT name, args, xmmused
183 | %macro FN_EXT 3
184 | %1:
185 | _ %+ %1:
186 | %endmacro
187 |
188 | ; FN_END name
189 | %macro FN_END 1
190 | %endmacro
191 |
192 | ; declares a global is hidden: HIDDEN name
193 | %macro HIDDEN 1
194 | %if (__YASM_VERSION_ID__ >= 09999999h) ; .private_extern isn't in yasm yet?
195 | .private_extern %1
196 | .private_extern _ %+ %1
197 | %endif
198 | %endmacro
199 | %endif
200 |
201 | ; put everything in the code segment to simplify things
202 | %define SECTION_TEXT .section .text
203 | %define SECTION_RODATA .section .text
204 |
205 | ; declares a global function: GLOBAL name
206 | %macro GLOBAL 1
207 | .globl %1
208 | .globl _ %+ %1
209 | %endmacro
210 |
211 | %macro FN_LOCAL_PREFIX 1
212 | FN PROJECT_NAME %+ _ %+ %1
213 | %endmacro
214 |
215 | %macro FN_EXT_LOCAL_PREFIX 3
216 | FN_EXT PROJECT_NAME %+ _ %+ %1, %2, %3
217 | %endmacro
218 |
219 | %macro FN_END_LOCAL_PREFIX 1
220 | FN_END PROJECT_NAME %+ _ %+ %1
221 | %endmacro
222 |
223 | %macro GLOBAL_LOCAL_PREFIX 1
224 | GLOBAL PROJECT_NAME %+ _ %+ %1
225 | HIDDEN PROJECT_NAME %+ _ %+ %1
226 | %endmacro
227 |
228 | ; name
229 | %macro GLOBAL_HIDDEN_FN 1
230 | GLOBAL %1
231 | HIDDEN %1
232 | FN %1
233 | %endmacro
234 |
235 | ; name, args, xmmused
236 | %macro GLOBAL_HIDDEN_FN_EXT 3
237 | GLOBAL %1
238 | HIDDEN %1
239 | FN_EXT %1, %2, %3
240 | %endmacro
241 |
242 |
243 | ; pic support: LOAD_VAR_PIC var, reg
244 | %macro LOAD_VAR_PIC 2
245 | %if (BITS32)
246 | call 1f
247 | 1:
248 | popl %2
249 | leal %1 - 1b(%2), %2
250 | %else
251 | leaq %1(%rip), %2
252 | %endif
253 | %endmacro
254 |
255 | %macro INCLUDE 1
256 | %include %1
257 | %endmacro
258 |
259 | ; include the file with the variable(s) if variable 'name' is not already included: INCLUDE_VAR_FILE file, name
260 | %macro INCLUDE_VAR_FILE 2
261 | %ifndef INCLUDED_%2
262 | %define INCLUDED_%2
263 | %include %1
264 | %endif
265 | %endmacro
266 |
267 | ; stupid helpers so we can have cpuid in one file
268 |
269 | %macro CPUID_PROLOGUE 0 ; saves ebx/esi/edi/ebp (rbx/rsi/rdi/rbp on 64-bit); 32-bit builds also leave early when cpuid is unavailable
270 | %if (BITS32)
271 | pushl %ebx
272 | pushl %esi
273 | pushl %edi
274 | pushl %ebp
275 | ; start the flags in esi at 0 (CPUID_GENERIC) so the no-cpuid exit returns a defined value, then check that cpuid is supported
276 | movl $0, %esi
277 | pushfl
278 | popl %eax
279 | movl %eax, %ecx
280 | xorl $0x200000, %eax
281 | pushl %eax
282 | popfl
283 | pushfl
284 | popl %eax
285 | xorl %ecx, %eax
286 | shrl $21, %eax
287 | andl $1, %eax
288 | pushl %ecx
289 | popfl
290 | andl %eax, %eax
291 | jz Lcpuid_x86_done
292 | %else
293 | pushq %rbx
294 | pushq %rsi
295 | pushq %rdi
296 | pushq %rbp
297 | %endif
298 | %endmacro
299 |
300 | %macro CPUID_EPILOGUE 0
301 | %if (BITS32)
302 | popl %ebp
303 | popl %edi
304 | popl %esi
305 | popl %ebx
306 | %else
307 | popq %rbp
308 | popq %rdi
309 | popq %rsi
310 | popq %rbx
311 | %endif
312 | ret
313 | %endmacro
314 |
315 | %macro CPUCYCLES 0
316 | rdtsc
317 | %if (BITS64)
318 | shlq $32, %rdx
319 | orq %rdx, %rax
320 | %endif
321 | ret
322 | %endmacro
323 |
324 | %define CPUID_GENERIC (0 )
325 | %define CPUID_X86 (1 << 0)
326 | %define CPUID_MMX (1 << 1)
327 | %define CPUID_SSE (1 << 2)
328 | %define CPUID_SSE2 (1 << 3)
329 | %define CPUID_SSE3 (1 << 4)
330 | %define CPUID_SSSE3 (1 << 5)
331 | %define CPUID_SSE4_1 (1 << 6)
332 | %define CPUID_SSE4_2 (1 << 7)
333 | %define CPUID_AVX (1 << 8)
334 | %define CPUID_XOP (1 << 9)
335 | %define CPUID_AVX2 (1 << 10)
336 | %define CPUID_AVX512 (1 << 11)
337 |
338 | %define CPUID_RDTSC (1 << 25)
339 | %define CPUID_RDRAND (1 << 26)
340 | %define CPUID_POPCNT (1 << 27)
341 | %define CPUID_FMA4 (1 << 28)
342 | %define CPUID_FMA3 (1 << 29)
343 | %define CPUID_PCLMULQDQ (1 << 30)
344 | %define CPUID_AES (1 << 31)
345 |
346 | %endif ; BASE_YASM
347 |
--------------------------------------------------------------------------------
/framework/driver/yasm_driver.inc:
--------------------------------------------------------------------------------
1 | %ifndef YASM_DRIVER_INC
2 | %define YASM_DRIVER_INC
3 |
4 | %include "asmopt_internal.h"
5 |
6 | %include "x86/yasm.inc"
7 |
8 | %macro INCLUDE_IF_X86_32BIT 1
9 | %if (BITS32)
10 | INCLUDE %1
11 | %endif
12 | %endmacro
13 |
14 | %macro INCLUDE_IF_X86_64BIT 1
15 | %if (BITS64)
16 | INCLUDE %1
17 | %endif
18 | %endmacro
19 |
20 | %macro INCLUDE_IF_MMX_32BIT 1
21 | INCLUDE_IF_X86_32BIT %1
22 | %endmacro
23 |
24 | %macro INCLUDE_IF_MMX_64BIT 1
25 | INCLUDE_IF_X86_64BIT %1
26 | %endmacro
27 |
28 |
29 | %macro INCLUDE_IF_SSE_32BIT 1
30 | INCLUDE_IF_X86_32BIT %1
31 | %endmacro
32 |
33 | %macro INCLUDE_IF_SSE_64BIT 1
34 | INCLUDE_IF_X86_64BIT %1
35 | %endmacro
36 |
37 |
38 | %macro INCLUDE_IF_SSE2_32BIT 1
39 | INCLUDE_IF_X86_32BIT %1
40 | %endmacro
41 |
42 | %macro INCLUDE_IF_SSE2_64BIT 1
43 | INCLUDE_IF_X86_64BIT %1
44 | %endmacro
45 |
46 |
47 | %macro INCLUDE_IF_SSE3_32BIT 1
48 | INCLUDE_IF_X86_32BIT %1
49 | %endmacro
50 |
51 | %macro INCLUDE_IF_SSE3_64BIT 1
52 | INCLUDE_IF_X86_64BIT %1
53 | %endmacro
54 |
55 |
56 | %macro INCLUDE_IF_SSSE3_32BIT 1
57 | INCLUDE_IF_X86_32BIT %1
58 | %endmacro
59 |
60 | %macro INCLUDE_IF_SSSE3_64BIT 1
61 | INCLUDE_IF_X86_64BIT %1
62 | %endmacro
63 |
64 |
65 | %macro INCLUDE_IF_SSE4_1_32BIT 1
66 | INCLUDE_IF_X86_32BIT %1
67 | %endmacro
68 |
69 | %macro INCLUDE_IF_SSE4_1_64BIT 1
70 | INCLUDE_IF_X86_64BIT %1
71 | %endmacro
72 |
73 |
74 | %macro INCLUDE_IF_SSE4_2_32BIT 1
75 | INCLUDE_IF_X86_32BIT %1
76 | %endmacro
77 |
78 | %macro INCLUDE_IF_SSE4_2_64BIT 1
79 | INCLUDE_IF_X86_64BIT %1
80 | %endmacro
81 |
82 |
83 | %macro INCLUDE_IF_AVX_32BIT 1
84 | INCLUDE_IF_X86_32BIT %1
85 | %endmacro
86 |
87 | %macro INCLUDE_IF_AVX_64BIT 1
88 | INCLUDE_IF_X86_64BIT %1
89 | %endmacro
90 |
91 |
92 | %macro INCLUDE_IF_XOP_32BIT 1
93 | %if HAVE_XOP
94 | INCLUDE_IF_X86_32BIT %1
95 | %endif
96 | %endmacro
97 |
98 | %macro INCLUDE_IF_XOP_64BIT 1
99 | %if HAVE_XOP
100 | INCLUDE_IF_X86_64BIT %1
101 | %endif
102 | %endmacro
103 |
104 |
105 | %macro INCLUDE_IF_AVX2_32BIT 1
106 | %if HAVE_AVX2
107 | INCLUDE_IF_X86_32BIT %1
108 | %endif
109 | %endmacro
110 |
111 | %macro INCLUDE_IF_AVX2_64BIT 1
112 | %if HAVE_AVX2
113 | INCLUDE_IF_X86_64BIT %1
114 | %endif
115 | %endmacro
116 |
117 |
118 | %macro INCLUDE_IF_AVX512_32BIT 1
119 | %if HAVE_AVX512
120 | INCLUDE_IF_X86_32BIT %1
121 | %endif
122 | %endmacro
123 |
124 | %macro INCLUDE_IF_AVX512_64BIT 1
125 | %if HAVE_AVX512
126 | INCLUDE_IF_X86_64BIT %1
127 | %endif
128 | %endmacro
129 |
130 | ; include unsupported platform includes here
131 | ; ...
132 | ; ...
133 | ; ...
134 |
135 | %endif ; YASM_DRIVER_INC
--------------------------------------------------------------------------------
/framework/fuzz.c:
--------------------------------------------------------------------------------
1 | #if (defined(_WIN32) || defined(_WIN64))
2 | #include
3 | #include
4 | #endif
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include "cpuid.h"
11 | #include "fuzz.h"
12 |
13 | /*
14 | Chacha/8 rng with no addition of state words post-mixing, no security at all, but good
15 | portable random numbers for fuzzing
16 | */
17 |
18 | #if defined(HAVE_INT32)
19 | typedef uint32_t chacha_int32;
20 | #else
21 | typedef unsigned long chacha_int32;
22 | #endif
23 |
24 | /* write the low 32 bits of v to p[0..3] as four bytes, least significant byte first */
25 | static void
26 | store8(unsigned char *p, chacha_int32 v) {
27 |     size_t n;
28 |     /* each pass emits the current low byte, then shifts the next one down */
29 |     for (n = 0; n < 4; n++, v >>= 8)
30 |         p[n] = (unsigned char)(v & 0xff);
31 | }
32 |
33 | /* 32 bit left rotate of x by k bits; the result is masked to 32 bits because chacha_int32 may be a wider unsigned long. The callers in this file pass k = 16, 12, 8 and 7; k == 0 would shift right by 32 */
34 | static chacha_int32
35 | rotate32(chacha_int32 x, int k) {
36 | return ((x << k) | (x >> (32 - k))) & 0xffffffffUL;
37 | }
38 |
39 | typedef struct chacha_state_t {
40 | chacha_int32 s[12];
41 | } chacha_state_t;
42 |
43 | /* 1 block = 64 bytes; writes `blocks` blocks of the 8 round output to out and leaves the advanced 128 bit counter in state->s[8..11] */
44 | static void
45 | chacha_blocks(chacha_state_t *state, unsigned char *out, size_t blocks) {
46 |     chacha_int32 x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
47 |     chacha_int32 j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15;
48 |     chacha_int32 t;
49 |     size_t i;
50 |     /* chacha_int32 can be wider than 32 bits (unsigned long fallback), so clamp every word as it is loaded */
51 |     j4 = state->s[0] & 0xffffffffUL;
52 |     j5 = state->s[1] & 0xffffffffUL;
53 |     j6 = state->s[2] & 0xffffffffUL;
54 |     j7 = state->s[3] & 0xffffffffUL;
55 |     j8 = state->s[4] & 0xffffffffUL;
56 |     j9 = state->s[5] & 0xffffffffUL;
57 |     j10 = state->s[6] & 0xffffffffUL;
58 |     j11 = state->s[7] & 0xffffffffUL;
59 |     j12 = state->s[8] & 0xffffffffUL;
60 |     j13 = state->s[9] & 0xffffffffUL;
61 |     j14 = state->s[10] & 0xffffffffUL;
62 |     j15 = state->s[11] & 0xffffffffUL;
63 |
64 |     for ( ; blocks; blocks -= 1, out += 64) {
65 |         /* "expand 32-byte k", as 4 little endian 32-bit unsigned integers */
66 |         x0 = 0x61707865;
67 |         x1 = 0x3320646e;
68 |         x2 = 0x79622d32;
69 |         x3 = 0x6b206574;
70 |         x4 = j4;
71 |         x5 = j5;
72 |         x6 = j6;
73 |         x7 = j7;
74 |         x8 = j8;
75 |         x9 = j9;
76 |         x10 = j10;
77 |         x11 = j11;
78 |         x12 = j12;
79 |         x13 = j13;
80 |         x14 = j14;
81 |         x15 = j15;
82 |
83 | #define quarter(a,b,c,d) \
84 | a = (a + b) & 0xffffffffUL; t = d^a; d = rotate32(t,16); \
85 | c = (c + d) & 0xffffffffUL; t = b^c; b = rotate32(t,12); \
86 | a = (a + b) & 0xffffffffUL; t = d^a; d = rotate32(t, 8); \
87 | c = (c + d) & 0xffffffffUL; t = b^c; b = rotate32(t, 7);
88 |         /* 4 iterations, each a column pass followed by a diagonal pass = 8 rounds */
89 |         for (i = 0; i < 8; i += 2) {
90 |             quarter( x0, x4, x8,x12)
91 |             quarter( x1, x5, x9,x13)
92 |             quarter( x2, x6,x10,x14)
93 |             quarter( x3, x7,x11,x15)
94 |             quarter( x0, x5,x10,x15)
95 |             quarter( x1, x6,x11,x12)
96 |             quarter( x2, x7, x8,x13)
97 |             quarter( x3, x4, x9,x14)
98 |         }
99 |         /* the mixed words are emitted as they are; the input words are not added back in */
100 |         store8(out + 0, x0);
101 |         store8(out + 4, x1);
102 |         store8(out + 8, x2);
103 |         store8(out + 12, x3);
104 |         store8(out + 16, x4);
105 |         store8(out + 20, x5);
106 |         store8(out + 24, x6);
107 |         store8(out + 28, x7);
108 |         store8(out + 32, x8);
109 |         store8(out + 36, x9);
110 |         store8(out + 40, x10);
111 |         store8(out + 44, x11);
112 |         store8(out + 48, x12);
113 |         store8(out + 52, x13);
114 |         store8(out + 56, x14);
115 |         store8(out + 60, x15);
116 |
117 |         /* use counter+iv as a 128 bit counter; each limb is wrapped at 32 bits so the carry test below can fire */
118 |         j12 = (j12 + 1) & 0xffffffffUL;
119 |         if (!j12) {
120 |             j13 = (j13 + 1) & 0xffffffffUL;
121 |             if (!j13) {
122 |                 j14 = (j14 + 1) & 0xffffffffUL;
123 |                 if (!j14)
124 |                     j15 = (j15 + 1) & 0xffffffffUL;
125 |             }
126 |         }
127 |     }
128 |     /* only the counter words changed, so only they are written back */
129 |     state->s[8] = j12;
130 |     state->s[9] = j13;
131 |     state->s[10] = j14;
132 |     state->s[11] = j15;
133 | }
134 |
135 | typedef struct fuzz_state_t {
136 | chacha_state_t rng;
137 | unsigned char buffer[64];
138 | size_t remaining;
139 | } fuzz_state_t;
140 |
141 | static fuzz_state_t fuzz_state;
142 |
143 | /* reload the fuzz random number buffer */
144 | static void
145 | fuzz_reload(fuzz_state_t *st) {
146 | chacha_blocks(&st->rng, st->buffer, sizeof(st->buffer) / 64);
147 | st->remaining = sizeof(st->buffer);
148 | }
149 |
150 | /* initialize the state to all zeros */
151 | void
152 | fuzz_init_deterministic(void) {
153 | memset(&fuzz_state.rng, 0, sizeof(fuzz_state.rng));
154 | fuzz_reload(&fuzz_state);
155 | }
156 |
157 | /* initialize the state randomly: fills fuzz_state.rng from CryptGenRandom on Windows or /dev/urandom elsewhere, exiting with status 1 on any failure */
158 | void
159 | fuzz_init(void) {
160 | #if (defined(_WIN32) || defined(_WIN64))
161 |     HCRYPTPROV handle;
162 |     if (!CryptAcquireContext(&handle, 0, 0, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) {
163 |         fprintf(stderr, "CryptAcquireContext failed");
164 |         exit(1);
165 |     }
166 |     /* a failed fill would silently leave the seed as it was, so it is fatal like the other errors */
167 |     if (!CryptGenRandom(handle, sizeof(fuzz_state.rng), (BYTE*)&fuzz_state.rng)) {
168 |         fprintf(stderr, "CryptGenRandom failed");
169 |         exit(1);
170 |     }
171 |     CryptReleaseContext(handle, 0);
172 | #else
173 |     FILE *f = fopen("/dev/urandom", "rb"); /* binary mode: the seed is raw bytes, not text */
174 |     if (!f) {
175 |         fprintf(stderr, "failed to open /dev/urandom");
176 |         exit(1);
177 |     }
178 |     if (fread(&fuzz_state.rng, sizeof(fuzz_state.rng), 1, f) != 1) {
179 |         fprintf(stderr, "read on /dev/urandom failed");
180 |         exit(1);
181 |     }
182 |     fclose(f);
183 | #endif
184 |     fuzz_reload(&fuzz_state); /* generate the first buffered block from the fresh seed */
185 | }
182 |
183 | /* get len random bytes */
184 | void
185 | fuzz_get_bytes(void *out, size_t len) {
186 | unsigned char *outb = (unsigned char *)out;
187 |
188 | while (len) {
189 | /* drain the stored buffer first */
190 | if (fuzz_state.remaining) {
191 | size_t bytes = (len > fuzz_state.remaining) ? fuzz_state.remaining : len;
192 | memcpy(outb, fuzz_state.buffer + (sizeof(fuzz_state.buffer) - fuzz_state.remaining), bytes);
193 |
194 | fuzz_state.remaining -= bytes;
195 | outb += bytes;
196 | len -= bytes;
197 | }
198 |
199 | /* fill up with full blocks */
200 | if (len >= 64) {
201 | size_t bytes = (len & ~63), blocks = len / 64;
202 | chacha_blocks(&fuzz_state.rng, outb, blocks);
203 | outb += bytes;
204 | len -= bytes;
205 | }
206 |
207 | /* refill the stored buffer if needed */
208 | if (!fuzz_state.remaining)
209 | fuzz_reload(&fuzz_state);
210 | }
211 | }
212 |
/*
 * print len bytes in hex, 16 to a line, after a "desc: " label
 *
 * when bytes and base are different pointers each byte is shown xor'd against
 * base and positions where the two agree are shown as "____"; when they are the
 * same pointer the bytes are printed as they are
 */
void
fuzz_print_bytes(const char *desc, const unsigned char *bytes, const unsigned char *base, size_t len) {
	const int show_diff = (base != bytes);
	size_t pos;

	printf("%s: ", desc);
	for (pos = 0; pos < len; pos++) {
		/* start a new line before every 16th byte, except the very first */
		if ((pos != 0) && ((pos % 16) == 0))
			printf("\n");

		if (!show_diff) {
			printf("0x%02x,", bytes[pos]);
		} else {
			const unsigned char delta = base[pos] ^ bytes[pos];
			if (delta != 0)
				printf("0x%02x,", delta);
			else
				printf("____,");
		}
	}
	printf("\n\n");
}
233 |
234 | static void
235 | fuzz_print_input(const fuzz_variable_t *input_variables, const size_t *random_sizes, const unsigned char *input) {
236 | size_t random_size;
237 |
238 | for ( ; ; input_variables++) {
239 | switch (input_variables->type) {
240 | case FUZZ_DONE:
241 | return;
242 |
243 | case FUZZ_ARRAY:
244 | fuzz_print_bytes(input_variables->desc, input, input, input_variables->size);
245 | input += input_variables->size;
246 | break;
247 |
248 | case FUZZ_RANDOM_LENGTH_ARRAY0:
249 | case FUZZ_RANDOM_LENGTH_ARRAY1:
250 | case FUZZ_RANDOM_LENGTH_ARRAY2:
251 | case FUZZ_RANDOM_LENGTH_ARRAY3:
252 | random_size = random_sizes[input_variables->type - FUZZ_RANDOM_LENGTH_ARRAY0];
253 | fuzz_print_bytes(input_variables->desc, input, input, random_size);
254 | input += random_size;
255 | break;
256 | }
257 | }
258 | }
259 |
260 |
261 | static void
262 | fuzz_print_output(const cpu_specific_impl_t *impl, const fuzz_variable_t *output_variables, const size_t *random_sizes, const unsigned char *output, const unsigned char *generic_output) {
263 | size_t random_size;
264 |
265 | printf("IMPLEMENTATION: %s\n", impl->desc);
266 |
267 | for ( ; ; output_variables++) {
268 | switch (output_variables->type) {
269 | case FUZZ_DONE:
270 | return;
271 |
272 | case FUZZ_ARRAY:
273 | fuzz_print_bytes(output_variables->desc, output, generic_output, output_variables->size);
274 | output += output_variables->size;
275 | generic_output += output_variables->size;
276 | break;
277 |
278 | case FUZZ_RANDOM_LENGTH_ARRAY0:
279 | case FUZZ_RANDOM_LENGTH_ARRAY1:
280 | case FUZZ_RANDOM_LENGTH_ARRAY2:
281 | case FUZZ_RANDOM_LENGTH_ARRAY3:
282 | random_size = random_sizes[output_variables->type - FUZZ_RANDOM_LENGTH_ARRAY0];
283 | fuzz_print_bytes(output_variables->desc, output, generic_output, random_size);
284 | output += random_size;
285 | generic_output += random_size;
286 | break;
287 | }
288 | }
289 | }
290 |
291 | /* run the fuzzer */
292 | void
293 | fuzz(const void *impls, size_t impl_size, const fuzz_variable_t *input_variables, const fuzz_variable_t *output_variables, impl_fuzz fuzz_fn) {
294 | /* allocate data */
295 | unsigned char *fuzz_input = NULL, *fuzz_output = NULL;
296 | const cpu_specific_impl_t **impl_list_alloc = (const cpu_specific_impl_t **)malloc(sizeof(const cpu_specific_impl_t *) * 32), **impl_list;
297 | size_t impl_count = 0;
298 | size_t random_sizes[4], *random_size;
299 |
300 | /* cpu detection */
301 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)();
302 | const char *p = (const char *)impls;
303 |
304 | size_t expected_bytes_out;
305 | unsigned char *outp;
306 | size_t i;
307 |
308 | /* counter display */
309 | clock_t start, clocks;
310 | size_t counter, counter_dot, counter_line;
311 | int display_counter;
312 |
313 | /* aggregate number of implementations, storing them in reverse order (generic first, most optimized last) */
314 | impl_list = &impl_list_alloc[31];
315 | for (;;) {
316 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p;
317 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags))
318 | *(impl_list--) = (const cpu_specific_impl_t *)impl;
319 | if (impl->cpu_flags == CPUID_GENERIC)
320 | break;
321 | p += impl_size;
322 | }
323 |
324 | /* need at least 2 added to do anything interesting */
325 | impl_count = (&impl_list_alloc[31] - impl_list);
326 | if (impl_count <= 1) {
327 | printf("not enough implementations to fuzz..\n");
328 | goto done;
329 | }
330 | /* point it at the last impl added */
331 | impl_list += 1;
332 |
333 | /* 16k for raw data, 1k for key material and derived data */
334 | fuzz_input = (unsigned char *)malloc(16384 + 1024);
335 | fuzz_output = (unsigned char *)malloc((16384 + 1024) * impl_count);
336 |
337 | /* show list of implementations being fuzzed */
338 | printf("fuzzing %s", impl_list[0]->desc);
339 | for (i = 1; i < impl_count; i++) {
340 | printf(", %s", impl_list[i]->desc);
341 | }
342 | printf("\n\n");
343 |
344 | /* fuzz loop */
345 | display_counter = 0;
346 | counter = 0;
347 | counter_dot = 0;
348 | counter_line = 0;
349 |
350 | start = clock();
351 | for (;;) {
352 | unsigned char *inp = fuzz_input;
353 | unsigned char *generic_out = fuzz_output;
354 |
355 | /* set up the data for this run */
356 | for (i = 0; input_variables[i].type != FUZZ_DONE; i++) {
357 | switch (input_variables[i].type) {
358 | case FUZZ_DONE:
359 | break;
360 |
361 | case FUZZ_ARRAY:
362 | fuzz_get_bytes(inp, input_variables[i].size);
363 | inp += input_variables[i].size;
364 | break;
365 |
366 | case FUZZ_RANDOM_LENGTH_ARRAY0:
367 | case FUZZ_RANDOM_LENGTH_ARRAY1:
368 | case FUZZ_RANDOM_LENGTH_ARRAY2:
369 | case FUZZ_RANDOM_LENGTH_ARRAY3:
370 | random_size = &random_sizes[input_variables[i].type - FUZZ_RANDOM_LENGTH_ARRAY0];
371 | fuzz_get_bytes(random_size, sizeof(*random_size));
372 | *random_size = (*random_size % input_variables[i].size);
373 | fuzz_get_bytes(inp, *random_size);
374 | inp += *random_size;
375 | break;
376 | }
377 | }
378 |
379 | expected_bytes_out = 0;
380 | for (i = 0; output_variables[i].type != FUZZ_DONE; i++) {
381 | switch (output_variables[i].type) {
382 | case FUZZ_DONE:
383 | break;
384 |
385 | case FUZZ_ARRAY:
386 | expected_bytes_out += output_variables[i].size;
387 | break;
388 |
389 | case FUZZ_RANDOM_LENGTH_ARRAY0:
390 | case FUZZ_RANDOM_LENGTH_ARRAY1:
391 | case FUZZ_RANDOM_LENGTH_ARRAY2:
392 | case FUZZ_RANDOM_LENGTH_ARRAY3:
393 | random_size = &random_sizes[output_variables[i].type - FUZZ_RANDOM_LENGTH_ARRAY0];
394 | expected_bytes_out += *random_size;
395 | break;
396 | }
397 | }
398 |
399 | /* gather results */
400 | outp = fuzz_output;
401 | for (i = 0; i < impl_count; i++) {
402 | fuzz_fn(impl_list[i], fuzz_input, random_sizes, outp);
403 | outp += expected_bytes_out;
404 | }
405 |
406 | /* compare results */
407 | outp = fuzz_output + expected_bytes_out;
408 | for (i = 1; i < impl_count; i++) {
409 | if (memcmp(generic_out, outp, expected_bytes_out) != 0)
410 | goto failure;
411 | outp += expected_bytes_out;
412 | }
413 |
414 | counter++;
415 |
416 | /* are we still calibrating? */
417 | if (!display_counter) {
418 | clocks = clock();
419 | if (clocks == (clock_t)-1) {
420 | /* clock is broken, use values which might suck.. */
421 | counter_line = 8192;
422 | counter_dot = (counter_line / 32);
423 | counter = 0;
424 | display_counter = 1;
425 | } else if ((clocks - start) >= CLOCKS_PER_SEC) {
426 | printf("doing approximately %u passes a second..\n", (unsigned int)(counter));
427 |
428 | /* 32 dots per line, 1 line per ~5 seconds */
429 | counter_line = 1;
430 | counter *= 5;
431 | while (counter_line < counter)
432 | counter_line *= 2;
433 | if (counter_line < 32)
434 | counter_line = 32;
435 | counter_dot = (counter_line / 32);
436 | if (counter_dot < 1)
437 | counter_dot = 1;
438 |
439 | counter = 0;
440 | display_counter = 1;
441 | }
442 | } else {
443 | if ((counter & (counter_dot - 1)) == 0)
444 | printf(".");
445 | if ((counter & (counter_line - 1)) == 0)
446 | printf("[%08x]\n", (unsigned int)(counter));
447 | }
448 | }
449 |
450 | failure:
451 | printf("fuzz mismatch! dumping input and output data\n\n");
452 |
453 | printf("INPUT\n\n");
454 | fuzz_print_input(input_variables, random_sizes, fuzz_input);
455 |
456 | printf("OUTPUT\n\n");
457 | outp = fuzz_output;
458 | fuzz_print_output(impl_list[0], output_variables, random_sizes, outp, fuzz_output);
459 | outp += expected_bytes_out;
460 |
461 | for (i = 1; i < impl_count; i++) {
462 | fuzz_print_output(impl_list[i], output_variables, random_sizes, outp, fuzz_output);
463 | outp += expected_bytes_out;
464 | }
465 |
466 | done:
467 | if (fuzz_input)
468 | free(fuzz_input);
469 | if (fuzz_output)
470 | free(fuzz_output);
471 | free((void *)impl_list_alloc);
472 | }
473 |
--------------------------------------------------------------------------------
/framework/include/bench.h:
--------------------------------------------------------------------------------
1 | #ifndef BENCH_H
2 | #define BENCH_H
3 |
4 | #include "asmopt_internal.h"
5 | #include "cpuid.h"
6 |
7 | typedef void (*impl_bench)(const void *impl);
8 |
9 | /* a 32k, 64 byte aligned buffer to bench with */
10 | unsigned char *bench_get_buffer(void);
11 |
12 | int bench(const void *impls, size_t impl_size, impl_test test_fn, impl_bench bench_fn, size_t units_count, const char *units_desc);
13 |
14 | #endif /* BENCH_H */
15 |
16 |
--------------------------------------------------------------------------------
/framework/include/cpucycles.h:
--------------------------------------------------------------------------------
1 | #ifndef CPUCYCLES_H
2 | #define CPUCYCLES_H
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if defined(HAVE_INT64)
7 | typedef uint64_t cycles_t;
8 | #elif defined(HAVE_INT32)
9 | typedef uint32_t cycles_t;
10 | #else
11 | typedef unsigned long cycles_t;
12 | #endif
13 |
14 | cycles_t LOCAL_PREFIX(cpucycles)(void);
15 | const char *LOCAL_PREFIX(cpucycles_units)(void);
16 |
17 | #endif /* CPUCYCLES_H */
18 |
19 |
--------------------------------------------------------------------------------
/framework/include/cpuid.h:
--------------------------------------------------------------------------------
1 | #ifndef CPUID_H
2 | #define CPUID_H
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if defined(__cplusplus)
7 | extern "C" {
8 | #endif
9 |
10 | enum cpuid_flags_generic_t {
11 | CPUID_GENERIC = (0)
12 | };
13 |
14 | #include "cpuid_flags.inc"
15 |
16 | unsigned long LOCAL_PREFIX(cpuid)(void);
17 |
18 | /* runtime dispatching based on current cpu */
19 | typedef struct cpu_specific_impl_t {
20 | unsigned long cpu_flags;
21 | const char *desc;
22 | /* additional information, pointers to methods, etc... */
23 | } cpu_specific_impl_t;
24 |
25 | typedef int (*impl_test)(const void *impl);
26 |
27 | const void *LOCAL_PREFIX(cpu_select)(const void *impls, size_t impl_size, impl_test test_fn);
28 |
29 | #if defined(__cplusplus)
30 | }
31 | #endif
32 |
33 | #endif /* CPUID_H */
34 |
--------------------------------------------------------------------------------
/framework/include/fuzz.h:
--------------------------------------------------------------------------------
1 | #ifndef FUZZ_H
2 | #define FUZZ_H
3 |
4 | #include "asmopt_internal.h"
5 |
6 | #if defined(__cplusplus)
7 | extern "C" {
8 | #endif
9 |
10 | typedef void (*impl_fuzz)(const void *impl, const unsigned char *in, const size_t *random_sizes, unsigned char *out);
11 |
12 | typedef enum {
13 | FUZZ_DONE,
14 | FUZZ_ARRAY,
15 | FUZZ_RANDOM_LENGTH_ARRAY0,
16 | FUZZ_RANDOM_LENGTH_ARRAY1,
17 | FUZZ_RANDOM_LENGTH_ARRAY2,
18 | FUZZ_RANDOM_LENGTH_ARRAY3
19 | } fuzz_type_t;
20 |
21 | typedef struct fuzz_variable_t {
22 | const char *desc;
23 | fuzz_type_t type;
24 | size_t size;
25 | } fuzz_variable_t;
26 |
27 | void fuzz_init(void);
28 | void fuzz_init_deterministic(void);
29 | void fuzz_get_bytes(void *out, size_t len);
30 | void fuzz_print_bytes(const char *desc, const unsigned char *bytes, const unsigned char *base, size_t len);
31 | void fuzz(const void *impls, size_t impl_size, const fuzz_variable_t *input_variables, const fuzz_variable_t *output_variables, impl_fuzz fuzz_fn);
32 |
33 | #if defined(__cplusplus)
34 | }
35 | #endif
36 |
37 | #endif /* FUZZ_H */
38 |
--------------------------------------------------------------------------------
/framework/main_shared.c:
--------------------------------------------------------------------------------
1 | #include "asmopt_internal.h"
2 |
3 | #if defined(_WIN32) || defined(__CYGWIN__)
4 |
5 | #include
6 |
7 | BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) {
8 | hinstDLL;
9 | lpvReserved;
10 |
11 | switch (fdwReason) {
12 | case DLL_PROCESS_ATTACH:
13 | break;
14 |
15 | case DLL_THREAD_ATTACH:
16 | break;
17 |
18 | case DLL_THREAD_DETACH:
19 | break;
20 |
21 | case DLL_PROCESS_DETACH:
22 | break;
23 | }
24 |
25 | return TRUE;
26 | }
27 |
28 | #endif /* defined(_WIN32) || defined(__CYGWIN__) */
29 |
--------------------------------------------------------------------------------
/framework/main_util.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | /* includes, and implementations, implementations_count */
/* one selectable implementation: its command line name and its entry points */
struct implementation_t {
	const char *name;
	int (*startup)(void);  /* returns 0 on success */
	void (*fuzz)(void);
	void (*bench)(void);
};
typedef struct implementation_t implementation_t;
11 |
12 | #define make_impl(name) {#name, name##_startup, name##_fuzz, name##_bench}
13 |
14 | #include "util_implementations.h"
15 |
16 | static size_t implementations_count = (sizeof(implementations) / sizeof(implementation_t));
17 |
18 | static int
19 | help(void) {
20 | if (implementations_count > 1) {
21 | size_t i;
22 | printf("usage: util [");
23 | for (i = 0; i < implementations_count; i++) {
24 | printf("%s", implementations[i].name);
25 | if (i < (implementations_count - 1))
26 | printf(",");
27 | }
28 | printf("] [fuzz,bench]\n\n");
29 | } else {
30 | printf("usage: util [fuzz,bench]\n\n");
31 | }
32 | return 1;
33 | }
34 |
35 | int main(int argc, const char *argv[]) {
36 | const implementation_t *sel = implementations, *end = sel + implementations_count;
37 | size_t action_arg = 1;
38 |
39 | if (implementations_count == 0) {
40 | printf("no implementations available\n");
41 | return 1;
42 | }
43 |
44 | if (argc < ((implementations_count > 1) ? 3 : 2))
45 | return help();
46 |
47 | if (implementations_count > 1) {
48 | while (sel < end) {
49 | if (strcmp(argv[1], sel->name) == 0)
50 | break;
51 | sel++;
52 | }
53 |
54 | if (sel == end)
55 | return help();
56 |
57 | action_arg = 2;
58 | }
59 |
60 | if (sel->startup() != 0) {
61 | printf("%s failed to startup\n", sel->name);
62 | return 1;
63 | }
64 |
65 | if (strcmp(argv[action_arg], "fuzz") == 0)
66 | sel->fuzz();
67 | else if (strcmp(argv[action_arg], "bench") == 0)
68 | sel->bench();
69 | else
70 | return help();
71 |
72 | return 0;
73 | }
74 |
--------------------------------------------------------------------------------
/genvs.php:
--------------------------------------------------------------------------------
1 |
2 |
/* derive a stable {8-4-4-4-12} style guid from the md5 of $name */
function get_guid($name) {
	$digest = strtoupper(md5($name));
	$groups = array(
		substr($digest, 0, 8),
		substr($digest, 8, 4),
		substr($digest, 12, 4),
		substr($digest, 16, 4),
		substr($digest, 20, 12)
	);
	return "{".implode("-", $groups)."}";
}
7 |
/* return $str with a CRLF line ending appended */
function addln($str) {
	return "{$str}\r\n";
}
11 |
/* print $str followed by a newline */
function echoln($str) {
	echo $str, "\n";
}
16 |
/* write $str to the open file handle $f, with no line ending added */
function fecho($f, $str) {
	fputs($f, $str);
}
20 |
/* write $str to the open file handle $f, followed by a CRLF line ending */
function fecholn($f, $str) {
	fputs($f, $str);
	fputs($f, "\r\n");
}
25 |
/* wrap $str in double quotes; nothing inside it is escaped */
function quote($str) {
	return '"'.$str.'"';
}
29 |
/* turn every forward slash in $str in to a backslash */
function fixslash($str) {
	$forward = "/";
	$backward = "\\";
	return str_replace($forward, $backward, $str);
}
33 |
/*
 * return the contents of $path, or report the problem and exit(1)
 *
 * a path that exists but cannot be read (a directory, missing permissions) is
 * reported the same way as a missing one rather than handing false back to the
 * caller
 */
function my_file_get_contents($path) {
	if (file_exists($path)) {
		/* the failure is reported below, so the warning would only repeat it */
		$contents = @file_get_contents($path);
		if ($contents !== false)
			return $contents;
	}

	echoln("unable to open {$path}!\n");
	exit(1);
}
41 |
42 | $crawl_ignore = array("asmopt.h"=>1, "asmopt_internal.h"=>1, "util_implementations.h"=>1);
43 |
/*
 * append to $list the backslashed path of every file under $dir whose name
 * matches one of the regexes in $grab
 *
 * names listed in the global $crawl_ignore are skipped, sub directories are only
 * entered when $recurse is set, and a directory that cannot be opened
 * contributes nothing
 */
function crawl(&$list, $dir, $grab, $recurse) {
	global $crawl_ignore;

	$dh = opendir($dir);
	if (!$dh)
		return;

	while (($entry = readdir($dh)) !== false) {
		if (($entry == ".") || ($entry == "..") || isset($crawl_ignore[$entry]))
			continue;

		$path = $dir."/".$entry;
		if (is_dir($path)) {
			if ($recurse)
				crawl($list, $path, $grab, $recurse);
			continue;
		}

		/* a file is added once, however many patterns it matches */
		foreach($grab as $pattern) {
			if (preg_match($pattern, $entry)) {
				$list[] = fixslash($path);
				break;
			}
		}
	}
	closedir($dh);
}
67 |
/* state and helpers shared by the project generators; subclasses supply make() */
abstract class gen_vs {
	protected $name;
	protected $builds;
	protected $projects;
	protected $sln;
	protected $project_dir;
	protected $files;
	protected $include_dirs;

	/* record the lower cased project name and set up the lib, dll and util projects */
	public function gen_vs($name) {
		$this->name = strtolower($name);
		$this->include_dirs = array("./", "../app/include", "../app/extensions", "../framework/include", "../framework/driver", "../framework/driver/x86");

		$this->projects = array();
		foreach(array("lib", "dll", "util") as $type) {
			$project = "{$this->name}_{$type}";
			$this->projects[$type] = array("name"=>$project, "guid"=>get_guid($project));
		}
	}

	/* gather the source files in to named groups and say which groups each project uses */
	public function build_files() {
		/* group, directory, file name patterns, recurse in to sub directories? */
		$plan = array(
			array("driver", "framework/driver", array("!\.c$!", "!\.h$!", "!\.inc$!"), false),
			array("driver", "framework/driver/x86", array("!\.c$!", "!\.S$!", "!\.h$!", "!\.inc$!"), false),
			array("ext", "app/extensions", array("!\.c$!", "!\.S$!", "!\.inc$!", "!\.h$!"), true),
			array("include", "app/include", array("!\.h$!"), false),
			array("include", "framework/include", array("!\.h$!"), false),
			array("shared", "framework", array("!main_shared\.c$!"), false),
			array("util", "framework", array("!main_util\.c$!", "!fuzz\.c$!", "!bench\.c$!"), true)
		);

		$this->files = array("driver"=>array(), "ext"=>array(), "util"=>array(), "shared"=>array(), "include"=>array());
		foreach($plan as $step) {
			list($group, $dir, $patterns, $recurse) = $step;
			crawl($this->files[$group], $dir, $patterns, $recurse);
		}

		$this->projects["lib"]["files"] = array("driver", "ext", "include");
		$this->projects["dll"]["files"] = array("driver", "ext", "include", "shared");
		$this->projects["util"]["files"] = array("driver", "ext", "include", "util");
	}

	/* write $str to the file $name (mode 0755), expanding the %% placeholders in both */
	public function write_file($name, $str) {
		$placeholders = array("%%name", "%%NAME", "%%projectdir");
		$values = array($this->name, strtoupper($this->name), $this->project_dir);

		$path = str_replace($placeholders, $values, $name);
		$body = str_replace($placeholders, $values, $str);

		$f = fopen("{$path}", "w+");
		chmod("{$path}", 0755);
		fwrite($f, $body);
		fclose($f);
	}

	public abstract function make();
};
117 |
118 |
119 | /*
120 | vs 2010 'tricks'
121 |
122 | allow a files with the same name, but different paths, to be compiled correctly and not in to a flat directory: set
123 | ObjectFileName path to "$(IntDir)dummy\\%(RelativeDir)/", dummy eats the ../ we used to escape the vs2010 dir.
124 |
125 |
126 | */
127 |
128 | class vs2010 extends gen_vs {
129 | protected $fileinfo;
130 |
131 | protected $toolset;
132 | protected $toolsversion;
133 | protected $fileformatversion;
134 | protected $vsversion;
135 |
136 | public function vs2010($name) {
137 | parent::gen_vs($name);
138 |
139 | $this->sln = "{$this->name}.sln";
140 |
141 | foreach($this->projects as $handle=>&$info)
142 | $info["vcxproj"] = "{$info['name']}.vcxproj";
143 |
144 | $this->builds = array(
145 | "Debug|x86-32bit"=>"Debug|Win32",
146 | "Debug|amd64"=>"Debug|x64",
147 | "Release|x86-32bit"=>"Release|Win32",
148 | "Release|amd64"=>"Release|x64"
149 | );
150 |
151 | $this->project_dir = "vs2010";
152 | $this->toolset = "v100";
153 | $this->fileformatversion = "11.00";
154 | $this->vsversion = "# Visual Studio 2010";
155 | $this->toolsversion = "4.0";
156 | }
157 |
158 | function make_sln() {
159 | $f = fopen("{$this->project_dir}/".$this->sln, "w+");
160 | fecho($f,
161 | addln("Microsoft Visual Studio Solution File, Format Version {$this->fileformatversion}").
162 | addln("{$this->vsversion}")
163 | );
164 |
165 | foreach($this->projects as $handle=>$info) {
166 | fecho($f,
167 | addln("Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = ".quote($info["name"]).", ".quote($info["vcxproj"]).", ".quote($info["guid"])).
168 | addln("EndProject")
169 | );
170 | }
171 |
172 | fecholn($f, "Global");
173 | fecholn($f, " GlobalSection(SolutionConfigurationPlatforms) = preSolution");
174 | foreach($this->builds as $label=>$build)
175 | fecholn($f, " {$label} = {$label}");
176 | fecholn($f, " EndGlobalSection");
177 |
178 | fecholn($f, " GlobalSection(ProjectConfigurationPlatforms) = postSolution");
179 | foreach($this->projects as $handle=>$info) {
180 | foreach($this->builds as $label=>$build) {
181 | fecho($f,
182 | addln(" {$info['guid']}.{$label}.ActiveCfg = {$build}").
183 | addln(" {$info['guid']}.{$label}.Build.0 = {$build}")
184 | );
185 | }
186 | }
187 | fecholn($f, " EndGlobalSection");
188 |
189 | fecho($f,
190 | addln(" GlobalSection(SolutionProperties) = preSolution").
191 | addln(" HideSolutionNode = FALSE").
192 | addln(" EndGlobalSection")
193 | );
194 | fecholn($f, "EndGlobal");
195 | fclose($f);
196 | }
197 |
198 | public function make_vcxproj_filters() {
199 | foreach($this->projects as $handle=>$info) {
200 | $f = fopen("{$this->project_dir}/".$info["vcxproj"].".filters", "w+");
201 |
202 | fecholn($f,
203 | "".
204 | ""
205 | );
206 |
207 | /* list of filters we'll be using */
208 | fecho($f,
209 | "".
210 | " "
211 | );
212 |
213 | $seen = array();
214 | foreach($info["files"] as $handle) {
215 | foreach($this->files[$handle] as $path) {
216 | while (1) {
217 | $chop_directory = preg_replace("!^(.*)\\\\.*$!", "$1", $path);
218 | if ($chop_directory === $path)
219 | break;
220 | $seen[$chop_directory] = 1;
221 | $path = $chop_directory;
222 | }
223 | }
224 | }
225 |
226 | foreach($seen as $basepath=>$dummy)
227 | fecho($f, " ");
228 | fecholn($f, " ");
229 | /* list of filters we'll be using */
230 |
231 | /* list of files with their filters */
232 | foreach($info["files"] as $handle) {
233 | fecho($f, "");
234 | foreach($this->files[$handle] as $path) {
235 | $type = $this->fileinfo[$path]["type"];
236 | $folder = $this->fileinfo[$path]["basepath"];
237 | fecho($f, "<{$type} Include=\"..\\{$path}\">Source\\{$folder} {$type}>");
238 | }
239 | fecholn($f, " ");
240 | }
241 | /* list of files with their filters */
242 |
243 | fecholn($f, " ");
244 |
245 | fclose($f);
246 | }
247 | }
248 |
249 | public function make_vcxproj() {
250 | foreach($this->projects as $handle=>$info) {
251 | $f = fopen("{$this->project_dir}/".$info["vcxproj"], "w+");
252 |
253 | fecholn($f,
254 | "".
255 | ""
256 | );
257 |
258 | /* build configurations */
259 | fecholn($f, "");
260 | foreach($this->builds as $build) {
261 | $fields = explode("|", $build);
262 | fecholn($f,
263 | "".
264 | "{$fields[0]} ".
265 | "{$fields[1]} ".
266 | " "
267 | );
268 | }
269 | fecholn($f, " ");
270 | /* build configurations */
271 |
272 |
273 | /* properties for this project */
274 | fecholn($f,
275 | "".
276 | "{$info['guid']} ".
277 | "Win32Proj ".
278 | "{$this->name} ".
279 | "{$this->toolset} ".
280 | " "
281 | );
282 |
283 | /* some project configuration options */
284 | fecholn($f, " ");
285 | foreach($this->builds as $build) {
286 | $fields = explode("|", $build);
287 | $configurationmap = array("lib"=>"StaticLibrary", "dll"=>"DynamicLibrary", "util"=>"Application");
288 | $debuglibmap = array("Release"=>"false", "Debug"=>"true");
289 | fecholn($f,
290 | "".
291 | "{$configurationmap[$handle]} ".
292 | "MultiByte ".
293 | "{$debuglibmap[$fields[0]]} ".
294 | " "
295 | );
296 | }
297 | /* some project configuration options */
298 |
299 | fecholn($f, " ");
300 |
301 | fecholn($f,
302 | "".
303 | " ".
304 | " "
305 | );
306 |
307 | fecholn($f, " ");
308 |
309 | /* target and directories */
310 | foreach($this->builds as $label=>$build) {
311 | $fields = explode("|", $label);
312 | $target_name = $this->name;
313 | $target_ext = ($handle == "util") ? "exe" : $handle;
314 | fecholn($f,
315 | "".
316 | "$(SolutionDir)..\\bin\\{$fields[0]}\\{$fields[1]}\\ ".
317 | "$(SolutionDir)..\\build\\{$handle}\\{$fields[0]}\\{$fields[1]}\\ ".
318 | "{$target_name} ".
319 | ".{$target_ext} ".
320 | " "
321 | );
322 | }
323 | /* target and directories */
324 |
325 |
326 | /* compiler and linker */
327 | $settingsmap = array(
328 | "Optimization"=>array("Release"=>"MaxSpeed", "Debug"=>"Disabled"),
329 | "IntrinsicFunctions"=>array("Release"=>"true", "Debug"=>"false"),
330 | "InlineFunctionExpansion"=>array("Release"=>"AnySuitable", "Debug"=>"Disabled"),
331 | "FavorSizeOrSpeed"=>array("Release"=>"Speed", "Debug"=>"Neither"),
332 | "BufferSecurityCheck"=>array("Release"=>"false", "Debug"=>"true"),
333 | "EnableCOMDATFolding"=>array("Release"=>"true", "Debug"=>"false"),
334 | "OptimizeReferences"=>array("Release"=>"true", "Debug"=>"false"),
335 | "SubSystem"=>array("lib"=>"Windows", "dll"=>"Windows", "util"=>"Console"),
336 | "PreprocessorDefinitions"=>array("lib"=>"", "dll"=>"BUILDING_DLL;LIB_PUBLIC=__declspec(dllexport)", "util"=>"UTILITIES"),
337 | );
338 |
339 | $includes = "";
340 | foreach($this->include_dirs as $dir)
341 | $includes .= str_replace("/", "\\", $dir).";";
342 |
343 | foreach($this->builds as $build) {
344 | $fields = explode("|", $build);
345 | fecholn($f, "");
346 | /* compiler */
347 | fecholn($f,
348 | "".
349 | /* static options */
350 | " ".
351 | "Level4 ".
352 | "false ".
353 | "{$includes} ".
354 | "$(IntDir)dummy\\%(RelativeDir)/ ".
355 | /* custom options */
356 | "{$settingsmap['BufferSecurityCheck'][$fields[0]]} ".
357 | "{$settingsmap['Optimization'][$fields[0]]} ".
358 | "{$settingsmap['IntrinsicFunctions'][$fields[0]]} ".
359 | "{$settingsmap['InlineFunctionExpansion'][$fields[0]]} ".
360 | "{$settingsmap['FavorSizeOrSpeed'][$fields[0]]} ".
361 | "{$settingsmap['BufferSecurityCheck'][$fields[0]]} ".
362 | "{$settingsmap['PreprocessorDefinitions'][$handle]};%(PreprocessorDefinitions) ".
363 | " "
364 | );
365 | /* linker */
366 |
367 | switch ($handle) {
368 | case "lib":
369 | fecholn($f,
370 | "".
371 | "false ".
372 | " "
373 | );
374 | break;
375 |
376 | case "dll":
377 | case "util":
378 | fecholn($f,
379 | " ".
380 | "true ".
381 | "{$settingsmap['SubSystem'][$handle]} ".
382 | "{$settingsmap['EnableCOMDATFolding'][$fields[0]]} ".
383 | "{$settingsmap['OptimizeReferences'][$fields[0]]} ".
384 | "$(OutDir){$this->name}.dll.lib ".
385 | "$(TargetDir)$(TargetName)$(TargetExt).pdb ".
386 | ""
387 | );
388 | break;
389 | }
390 | fecholn($f, " ");
391 | }
392 | fecholn($f, " ");
393 | /* compiler and linker */
394 |
395 | /* list of files */
396 | $yasm_includes = "";
397 | foreach($this->include_dirs as $dir)
398 | $yasm_includes .= "-I{$dir} ";
399 |
400 | foreach($info["files"] as $handle) {
401 | fecholn($f, "");
402 | foreach($this->files[$handle] as $path) {
403 | $type = $this->fileinfo[$path]["type"];
404 | $folder = $this->fileinfo[$path]["basepath"];
405 | $cleanpath = str_replace("../", "", $path);
406 | $basename = preg_replace("!(.*)\..*$!", "$1", $this->fileinfo[$path]["basename"]);
407 | if ($type == "CustomBuild") {
408 | fecholn($f,
409 | "<{$type} Include=\"..\\{$path}\">".
410 | "yasm [{$cleanpath}] ".
411 | "yasm -r nasm -p gas {$yasm_includes} -o $(IntDir)\\{$folder}\\{$basename}.obj -f win32 ..\\{$path} ".
412 | "yasm -r nasm -p gas {$yasm_includes} -o $(IntDir)\\{$folder}\\{$basename}.obj -f win64 ..\\{$path} ".
413 | "$(IntDir)\\{$folder}\\{$basename}.obj ".
414 | "{$type}>"
415 | );
416 | } else {
417 | fecholn($f, "<{$type} Include=\"..\\{$path}\">{$type}>");
418 | }
419 | }
420 | fecholn($f, " ");
421 | }
422 | /* list of files */
423 |
424 | fecholn($f, " ");
425 |
426 | fclose($f);
427 | }
428 | }
429 |
430 | public function make_project() {
431 | $this->build_files();
432 |
433 | $this->fileinfo = array();
434 | foreach($this->files as $handle=>$list) {
435 | foreach($list as $path) {
436 | $basepath = preg_replace("!^(.*)\\\\.*$!", "$1", $path);
437 | $basename = preg_replace("!^.*\\\\(.*)$!", "$1", $path);
438 | $this->fileinfo[$path]["basepath"] = $basepath;
439 | $this->fileinfo[$path]["basename"] = $basename;
440 |
441 | $ext = preg_replace("!^.*\.(.*)$!", "$1", $path);
442 | switch ($ext) {
443 | case "c": $type = "ClCompile"; break;
444 | case "S": $type = "CustomBuild"; break;
445 | case "inc": $type = "ClHeader"; break;
446 | case "h": $type = "ClHeader"; break;
447 | }
448 | $this->fileinfo[$path]["type"] = $type;
449 | }
450 | }
451 |
452 | $this->make_vcxproj();
453 | $this->make_vcxproj_filters();
454 | }
455 |
456 | public function make() {
457 | if (!file_exists($this->project_dir))
458 | mkdir($this->project_dir, 0755);
459 |
460 | $this->make_sln();
461 | $this->make_project();
462 | }
463 | }
464 |
/* the vs2010 generator with the directory, toolset and solution header of Visual Studio 2012 */
class vs2012 extends vs2010 {
	public function vs2012($name) {
		parent::vs2010($name);

		/* solution file header */
		$this->fileformatversion = "12.00";
		$this->vsversion = "# Visual Studio 2012";

		/* where the project goes and what builds it */
		$this->project_dir = "vs2012";
		$this->toolset = "v110";
	}
}
475 |
/* the vs2012 generator with the directory, toolset, tools version and solution header of Visual Studio 2013 */
class vs2013 extends vs2012 {
	public function vs2013($name) {
		parent::vs2012($name);

		/* solution file header */
		$this->fileformatversion = "12.00";
		$this->vsversion = "# Visual Studio 2013";

		/* where the project goes and what builds it */
		$this->project_dir = "vs2013";
		$this->toolset = "v120";
		$this->toolsversion = "12.0";
	}
}
487 |
488 |
/* a parsed command line argument: whether it was given, and what it was given as */
class argument {
	public $set;
	public $value;
}
492 |
493 |
/* an argument of the form --flag=value; the first occurrence on the command line wins */
class anyargument extends argument {
	function anyargument($flag) {
		global $argc, $argv;

		$this->set = false;

		$pattern = "!--".$flag."=(.*)!";
		$i = 1;
		while ($i < $argc) {
			if (preg_match($pattern, $argv[$i], $m)) {
				$this->value = $m[1];
				$this->set = true;
				return;
			}
			$i++;
		}
	}
}
509 |
/* prefix an argument with a * to indicate default */
class multiargument extends anyargument {
	/*
	 * an argument of the form --flag=value where value must be one of
	 * $legal_values
	 *
	 * when the flag is missing the value marked with a * is used; with no
	 * marked value, or with a value that is not in the list, usage is printed
	 * and the script exits with status 1
	 */
	function multiargument($flag, $legal_values) {
		parent::anyargument($flag);

		$map = array();
		$default = false;
		foreach($legal_values as $value) {
			/* the * only marks the default, it is not part of what the user types */
			if (substr($value, 0, 1) == "*") {
				$value = substr($value, 1);
				$default = $value;
			}
			$map[$value] = true;
		}

		if (!$this->set) {
			if ($default === false) {
				usage("value not specified for --{$flag}!");
				exit(1);
			}
			$this->value = $default;
			return;
		}

		if (!isset($map[$this->value])) {
			usage("{$this->value} is not a valid parameter to --{$flag}!");
			exit(1);
		}
	}
}
538 |
539 |
/* a bare --flag switch: set (with value true) when it appears on the command line exactly as written */
class flag extends argument {
	function flag($flag) {
		global $argc, $argv;

		$this->set = false;

		$wanted = "--{$flag}";
		$i = 1;
		while ($i < $argc) {
			if ($argv[$i] === $wanted) {
				$this->value = true;
				$this->set = true;
				return;
			}
			$i++;
		}
	}
}
556 |
/*
	Print the command line help through echoln(), followed by $reason when
	one is given (an empty reason prints nothing extra).
*/
function usage($reason = "") {
	$help_lines = array(
		"Usage: php genvs.php [flags]",
		"Flags in parantheses are optional",
		"",
		" --version=[vs2013,vs2012,vs2010] which project type to generate",
		" (--disable-yasm) do not use yasm",
		""
	);

	foreach ($help_lines as $help_line) {
		echoln($help_line);
	}

	if ($reason) {
		echoln($reason);
	}
}
567 |
/*
	Command line handling and solution generation.

	--help is evaluated before --version is parsed: multiargument prints an
	error and exits with status 1 when its option is missing, which would
	otherwise turn a plain "--help" into a failure.
*/
$help = new flag("help");

if ($help->set) {
	usage();
	exit(0);
}

$disable_yasm = new flag("disable-yasm");
$version = new multiargument("version", array("vs2010", "vs2012", "vs2013"));

$project_name = trim(my_file_get_contents("app/project.def"));

/* $version->value was validated against the list above */
switch ($version->value) {
	case "vs2010": $sln = new vs2010($project_name); break;
	case "vs2012": $sln = new vs2012($project_name); break;
	case "vs2013": $sln = new vs2013($project_name); break;
}

$sln->make();
587 |
588 |
589 | /* build framework/include/asmopt.h and framework/include/asmopt_internal.h */
590 |
591 | if ($disable_yasm->set) {
592 | $yasm = "";
593 | } else {
594 | $yasm = <<
616 |
617 | {$yasm}
618 |
619 | #if (defined(_M_IX86))
620 | #define CPU_32BITS
621 | #elif (defined(_M_X64))
622 | #define CPU_64BITS
623 | #else
624 | #error This should never happen
625 | #endif
626 |
627 | #define HAVE_INT64
628 | #define HAVE_INT32
629 | #define HAVE_INT16
630 | #define HAVE_INT8
631 |
632 | #if (_MSC_VER < 1300)
633 | typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t;
634 | typedef signed int int32_t; typedef unsigned int uint32_t;
635 | typedef signed short int16_t; typedef unsigned short uint16_t;
636 | typedef signed char int8_t; typedef unsigned char uint8_t;
637 | #elif (_MSC_VER < 1600)
638 | typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t;
639 | typedef signed __int32 int32_t; typedef unsigned __int32 uint32_t;
640 | typedef signed __int16 int16_t; typedef unsigned __int16 uint16_t;
641 | typedef signed __int8 int8_t; typedef unsigned __int8 uint8_t;
642 | #else
643 | #include
644 | #endif
645 |
646 | #endif /* ASMOPT_H */
647 |
648 |
649 | EOS;
650 |
651 |
652 | $asmopt_internal = <<write_file("%%projectdir/asmopt.h", $asmopt_h);
673 | $sln->write_file("%%projectdir/asmopt_internal.h", $asmopt_internal);
674 |
675 |
676 |
/* build framework/include/util_implementations.h */

/* gather the header paths found by crawl() under app/include */
$impls = array();
crawl($impls, "app/include", array("!\.h$!"), false);

$impl_includes = "";
$impl_declares = "";
$impl_count = count($impls);
for ($i = 0; $i < $impl_count; $i++) {
	$path = $impls[$i];

	/* drop the directory part (either "\" or "/" separated, or absent)
	   and the trailing .h so only the bare header name remains */
	$basename = preg_replace("!^(?:.*[\\\\/])?(.*)\.h$!", "$1", $path);

	$impl_includes .= addln("#include \"{$basename}.h\"");

	/* every entry except the last is terminated with a comma and addln() */
	$impl_declares .= ($i < ($impl_count - 1)) ? addln("\tmake_impl({$basename}),") : "\tmake_impl({$basename})";
}
690 |
691 | $util_implementations = <<write_file("%%projectdir/util_implementations.h", $util_implementations);
701 |
702 | ?>
--------------------------------------------------------------------------------
/sources/crypto_onetimeauth_poly1305_ref_auth.c:
--------------------------------------------------------------------------------
1 | /*
2 | 20080912
3 | D. J. Bernstein
4 | Public domain.
5 | */
6 |
7 | #include "crypto_onetimeauth.h"
8 |
/*
 * h += c, limb by limb over 17 limbs.
 *
 * Each result limb is reduced to its low 8 bits and the excess is carried
 * into the next limb. Whatever is carried out of limb 16 is dropped.
 */
static void add(unsigned int h[17], const unsigned int c[17])
{
  unsigned int carry = 0;
  unsigned int idx;

  for (idx = 0; idx < 17; ++idx) {
    carry += h[idx] + c[idx];
    h[idx] = carry & 0xff;
    carry >>= 8;
  }
}
16 |
/*
 * Run one carry pass over limbs 0..15: add the incoming carry, keep 8 bits
 * per limb, and return the carry that leaves limb 15.
 */
static unsigned int squeeze_low16(unsigned int h[17], unsigned int carry)
{
  unsigned int idx;

  for (idx = 0; idx < 16; ++idx) {
    carry += h[idx];
    h[idx] = carry & 255;
    carry >>= 8;
  }
  return carry;
}

/*
 * Partially reduce h after an addition or multiplication.
 *
 * The first pass normalizes limbs 0..15 to 8 bits and keeps only the low
 * 2 bits in limb 16 (130 bits in total). Everything above bit 130 is
 * multiplied by 5 and folded back into the bottom in a second pass, after
 * which limb 16 takes the remaining carry unmasked.
 */
static void squeeze(unsigned int h[17])
{
  unsigned int top;

  top = squeeze_low16(h, 0) + h[16];
  h[16] = top & 3;

  top = squeeze_low16(h, 5 * (top >> 2)) + h[16];
  h[16] = top;
}
28 |
/*
 * 17 byte-sized limbs, least significant first, of the value
 * 5 + 252 * 2^128 = 2^136 - (2^130 - 5).
 * Adding it with add() therefore subtracts 2^130 - 5 modulo 2^136.
 */
static const unsigned int minusp[17] = {
  5, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  252
};
32 |
33 | static void freeze(unsigned int h[17])
34 | {
35 | unsigned int horig[17];
36 | unsigned int j;
37 | unsigned int negative;
38 | for (j = 0;j < 17;++j) horig[j] = h[j];
39 | add(h,minusp);
40 | negative = -(h[16] >> 7);
41 | for (j = 0;j < 17;++j) h[j] ^= negative & (horig[j] ^ h[j]);
42 | }
43 |
/*
 * h = h * r, followed by squeeze().
 *
 * Schoolbook multiplication over 17 limbs. Partial products whose limb
 * index would reach 17 or more are wrapped around to index - 17 and scaled
 * by 320 (= 5 * 2^6). All sums are plain unsigned int arithmetic.
 */
static void mulmod(unsigned int h[17], const unsigned int r[17])
{
  unsigned int prod[17];
  unsigned int dst;
  unsigned int src;
  unsigned int acc;

  for (dst = 0; dst < 17; ++dst) {
    acc = 0;
    for (src = 0; src < 17; ++src) {
      if (src <= dst)
        acc += h[src] * r[dst - src];
      else
        acc += 320 * h[src] * r[dst + 17 - src];
    }
    prod[dst] = acc;
  }

  for (dst = 0; dst < 17; ++dst) h[dst] = prod[dst];

  squeeze(h);
}
60 |
/*
 * Compute a 16-byte authenticator for a message.
 *
 *   out    receives the 16 result bytes
 *   in     message bytes
 *   inlen  number of message bytes
 *   k      key; bytes 0..15 are masked into the multiplier r, bytes 16..31
 *          are added to the accumulator at the very end
 *
 * The message is consumed in chunks of up to 16 bytes. Each chunk gets a
 * single 1 appended directly after its last byte, is added to the
 * accumulator h, and h is then multiplied by r. Always returns 0.
 */
int crypto_onetimeauth(unsigned char *out,const unsigned char *in,unsigned long long inlen,const unsigned char *k)
{
  /* per-byte masks applied to k[0..15] to form r */
  static const unsigned char clamp[16] = {
    255, 255, 255,  15,
    252, 255, 255,  15,
    252, 255, 255,  15,
    252, 255, 255,  15
  };
  unsigned int r[17];
  unsigned int h[17];
  unsigned int c[17];
  unsigned int idx;
  unsigned int take;

  for (idx = 0; idx < 16; ++idx) r[idx] = k[idx] & clamp[idx];
  r[16] = 0;

  for (idx = 0; idx < 17; ++idx) h[idx] = 0;

  while (inlen > 0) {
    /* length of this chunk: 16, or whatever is left of the message */
    take = (inlen < 16) ? (unsigned int) inlen : 16;

    for (idx = 0; idx < 17; ++idx) c[idx] = (idx < take) ? in[idx] : 0;
    c[take] = 1;

    in += take;
    inlen -= take;

    add(h, c);
    mulmod(h, r);
  }

  freeze(h);

  /* add the second key half, reusing c as a 17-limb operand */
  for (idx = 0; idx < 16; ++idx) c[idx] = k[idx + 16];
  c[16] = 0;
  add(h, c);

  for (idx = 0; idx < 16; ++idx) out[idx] = h[idx];
  return 0;
}
105 |
--------------------------------------------------------------------------------